In [5]:
import pandas as pd
import os 
import torch
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import tqdm
import evaluate

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [7]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

device(type='mps')

In [8]:
def model_evaluation(predicted_class=None, true_class=None):
    """Compute accuracy plus per-class and macro-averaged precision, recall and F1.

    Args:
        predicted_class: predicted labels, forwarded as ``predictions``.
        true_class: reference labels, forwarded as ``references``.

    Returns:
        A 7-tuple of the dicts returned by the ``evaluate`` metrics, in this order:
        accuracy, precision (``average=None``), precision (``average="macro"``),
        recall (``average=None``), recall (``average="macro"``),
        f1 (``average=None``), f1 (``average="macro"``).

    Raises:
        ValueError: if either argument is ``None``.
    """
    if predicted_class is None or true_class is None:
        raise ValueError("predicted and true must be not None")

    shared = {"predictions": predicted_class, "references": true_class}

    # Accuracy has no averaging mode, so it is computed once.
    report = [evaluate.load("accuracy").compute(**shared)]

    # Each remaining metric is reported twice: per class, then macro-averaged.
    for metric_name in ("precision", "recall", "f1"):
        metric = evaluate.load(metric_name)
        report.append(metric.compute(average=None, **shared))
        report.append(metric.compute(average="macro", **shared))

    return tuple(report)

# Imbalanced Dataset

In [9]:
# Load the test split of the imbalanced dataset; the last line displays its row count.
test_dataset = pd.read_csv(os.path.join("data-imbalance", "test-imbalance.csv"))
len(test_dataset)

2000

## Model trained on imbalanced data

In [10]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Classifier fine-tuned on the imbalanced training data; tweets are scored one at a time.
tokenizer = AutoTokenizer.from_pretrained("J1mb0o/semantic-bert-imbalanced-dataset")
model = AutoModelForSequenceClassification.from_pretrained("J1mb0o/semantic-bert-imbalanced-dataset").to(device)

predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    # truncation=True keeps an unusually long tweet within the tokenizer's maximum
    # length instead of letting the forward pass fail on it.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # One tweet per forward pass, so the argmax over the class axis is a single id.
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_class.append(predicted_class_id)
    true_class.append(label)
    

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

2000it [00:17, 111.53it/s]


In [11]:
# Score the predictions of the imbalanced-data model against the reference labels.
results_unbalanced = model_evaluation(predicted_class, true_class)


In [12]:
# Plain-text dump of the 7-element metrics tuple returned by model_evaluation.
print(results_unbalanced)

({'accuracy': 0.6425}, {'precision': array([0.57435897, 0.5144357 , 0.74880153])}, {'precision': 0.6125320679778218}, {'recall': array([0.34461538, 0.57562408, 0.78571429])}, {'recall': 0.568651250853894}, {'f1': array([0.43076923, 0.54331254, 0.76681394])}, {'f1': 0.5802985720511494})


## Model trained on Balanced Dataset

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Classifier fine-tuned on the balanced training data; tweets are scored one at a time.
tokenizer = AutoTokenizer.from_pretrained("J1mb0o/semantic-bert-balanced-dataset")
model = AutoModelForSequenceClassification.from_pretrained("J1mb0o/semantic-bert-balanced-dataset").to(device)

predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    # truncation=True keeps an unusually long tweet within the tokenizer's maximum
    # length instead of letting the forward pass fail on it.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # One tweet per forward pass, so the argmax over the class axis is a single id.
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_class.append(predicted_class_id)
    true_class.append(label)
    

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

2000it [00:15, 127.08it/s]


In [14]:
# Score the predictions of the balanced-data model against the reference labels.
results_balanced = model_evaluation(predicted_class, true_class)


In [15]:
# Plain-text dump of the 7-element metrics tuple returned by model_evaluation.
print(results_balanced)

({'accuracy': 0.5785}, {'precision': array([0.37605634, 0.50555556, 0.76129032])}, {'precision': 0.5476340720547899}, {'recall': array([0.82153846, 0.26725404, 0.71227364])}, {'recall': 0.6003553805229055}, {'f1': array([0.51594203, 0.34966378, 0.73596674])}, {'f1': 0.5338575165915098})


## Mistral 7b Instruct

In [16]:
def create_prompt1(text):
    """Build the direct sentiment-classification prompt for a single tweet.

    ``text`` is interpolated verbatim after ``Tweet:``; the formatted prompt
    string is returned.

    NOTE(review): the prompt asks for a JSON object but never names its key,
    while the evaluation loops read ``response["sentiment"]`` -- confirm the
    model reliably answers with that key.
    """
    return f"""Your task is to classify a tweet sentiment as positive, negative or neutral only. 
    
    Tweet: {text}
    
    Just generate the JSON object without explanations. Don't forget to close the JSON object with a curly bracket.
    """

def create_prompt2(text):
    """Build the sentiment-classification prompt with an added "Think step by step." line.

    Apart from that extra instruction the text matches ``create_prompt1``;
    ``text`` is interpolated verbatim after ``Tweet:`` and the formatted prompt
    string is returned.

    NOTE(review): the prompt asks for a JSON object but never names its key,
    while the evaluation loops read ``response["sentiment"]`` -- confirm the
    model reliably answers with that key.
    """
    return f"""Your task is to classify a tweet sentiment as positive, negative or neutral only. 
    Think step by step. 
    
    Tweet: {text}
    
    Just generate the JSON object without explanations. Don't forget to close the JSON object with a curly bracket.
    """


In [17]:
from langchain.llms import Ollama
import string
import json
# Ollama client for the "mistral" model with temperature=0.
llm = Ollama(model="mistral", temperature=0)


### Evaluate on first prompt

In [18]:
predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    prompt = create_prompt1(text)
    raw_response = llm(prompt)
    try:
        sentiment = json.loads(raw_response)["sentiment"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # One malformed reply must not abort this long-running loop and discard
        # every prediction collected so far: keep the raw text so the validation
        # cell that follows prints it for inspection.
        sentiment = raw_response
    else:
        if isinstance(sentiment, str):
            # Accept harmless casing/whitespace variants such as "Positive".
            sentiment = sentiment.strip().lower()
    predicted_class.append(sentiment)
    true_class.append(label)


2000it [28:59,  1.15it/s]


In [19]:
# Print every model answer that is not one of the three expected labels.
expected_labels = ["positive", "negative", "neutral"]
unexpected_answers = [answer for answer in predicted_class if answer not in expected_labels]
for answer in unexpected_answers:
    print(answer)

In [20]:
# Convert label strings to integer ids using their position in this list.
# NOTE(review): assumes the dataset's `label` column uses 0=negative, 1=neutral, 2=positive -- confirm.
label_order = ["negative", "neutral", "positive"]
predicted_class_id = list(map(label_order.index, predicted_class))

In [21]:
# Score the Mistral answers for prompt 1 (as integer ids) against the reference labels.
mistral_results_prompt1 = model_evaluation(predicted_class_id, true_class)

In [22]:
# import string
# import json
# text = test_dataset.sample(1)
# print(text["tweet"].values[0], text["label"].values[0])
# prompt = create_prompt(text["tweet"].values[0])
# response = llm(prompt)
# # response = ''.join(ch for ch in response.strip().lower() if ch not in string.punctuation)
# response = json.loads(response)
# print(response["sentiment"])


In [23]:
# Display the metrics tuple for prompt 1.
mistral_results_prompt1

({'accuracy': 0.608},
 {'precision': array([0.56743003, 0.46439024, 0.88831615])},
 {'precision': 0.640045473516827},
 {'recall': array([0.68615385, 0.6989721 , 0.52012072])},
 {'recall': 0.6350822234510266},
 {'f1': array([0.62116992, 0.55803048, 0.65609137])},
 {'f1': 0.6117639225498076})

### Evaluate on second prompt

In [24]:
predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    prompt = create_prompt2(text)
    raw_response = llm(prompt)
    try:
        sentiment = json.loads(raw_response)["sentiment"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # One malformed reply must not abort this long-running loop and discard
        # every prediction collected so far: keep the raw text so the validation
        # cell that follows prints it for inspection.
        sentiment = raw_response
    else:
        if isinstance(sentiment, str):
            # Accept harmless casing/whitespace variants such as "Positive".
            sentiment = sentiment.strip().lower()
    predicted_class.append(sentiment)
    true_class.append(label)


2000it [29:06,  1.15it/s]


In [25]:
# Print every model answer that is not one of the three expected labels.
expected_labels = ["positive", "negative", "neutral"]
unexpected_answers = [answer for answer in predicted_class if answer not in expected_labels]
for answer in unexpected_answers:
    print(answer)

In [26]:
# Convert label strings to integer ids using their position in this list.
# NOTE(review): assumes the dataset's `label` column uses 0=negative, 1=neutral, 2=positive -- confirm.
label_order = ["negative", "neutral", "positive"]
predicted_class_id = list(map(label_order.index, predicted_class))

In [27]:
# Score the Mistral answers for prompt 2 (as integer ids) against the reference labels.
mistral_results_prompt2 = model_evaluation(predicted_class_id, true_class)

In [28]:
# Display the metrics tuple for prompt 2.
mistral_results_prompt2

({'accuracy': 0.611},
 {'precision': array([0.56377551, 0.46719682, 0.8820598 ])},
 {'precision': 0.6376773766513402},
 {'recall': array([0.68      , 0.69016153, 0.53420523])},
 {'recall': 0.6347889195180875},
 {'f1': array([0.61645746, 0.55720213, 0.66541353])},
 {'f1': 0.613024376481984})

## Export results to a dataframe

In [29]:
# Column order of the comparison table; one row per evaluated model.
# NOTE(review): the "*_{Weighted}" columns are filled from the values that
# model_evaluation computes with average="macro" -- confirm the intended label.
table_columns = [
    "Model",
    "Accuracy",
    "Pre_{Negative}", "Pre_{Neu}", "Pre_{Pos}", "Pre_{Weighted}",
    "Rec_{Negative}", "Rec_{Neu}", "Rec_{Pos}", "Rec_{Weighted}",
    "F1_{Negative}", "F1_{Neu}", "F1_{Pos}", "F1_{Weighted}",
]


def _results_row(model_name, results):
    """Flatten one results tuple into a row laid out like ``table_columns``.

    ``results`` is read by position: 0 is accuracy; 1/2, 3/4 and 5/6 are the
    per-class and averaged dicts for precision, recall and f1 respectively.
    """
    row = [model_name, results[0]["accuracy"]]
    for position, key in ((1, "precision"), (3, "recall"), (5, "f1")):
        per_class = results[position][key]
        row.extend([per_class[0], per_class[1], per_class[2], results[position + 1][key]])
    return row


# The spelling `resutls_df` is kept because the export cell refers to this name.
resutls_df = pd.DataFrame(
    data=[
        _results_row("DistilBERT (unbalanced)", results_unbalanced),
        _results_row("DistilBERT (balanced)", results_balanced),
        _results_row("Mistral (prompt 1)", mistral_results_prompt1),
        _results_row("Mistral (prompt 2)", mistral_results_prompt2),
    ],
    columns=table_columns,
)

In [30]:
# Write the comparison table to CSV and LaTeX files, omitting the index column.
resutls_df.to_csv("results-unbalanced.csv", index=False)
resutls_df.to_latex("results-unbalanced.tex", index=False)

# Balanced Dataset

In [31]:
# Rebind `test_dataset` to the test split of the balanced dataset; the last line displays its row count.
test_dataset = pd.read_csv(os.path.join("data-balance", "test-balance.csv"))
len(test_dataset)

975

## Model trained on imbalanced data

In [32]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Classifier fine-tuned on the imbalanced training data; tweets are scored one at a time.
tokenizer = AutoTokenizer.from_pretrained("J1mb0o/semantic-bert-imbalanced-dataset")
model = AutoModelForSequenceClassification.from_pretrained("J1mb0o/semantic-bert-imbalanced-dataset").to(device)

predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    # truncation=True keeps an unusually long tweet within the tokenizer's maximum
    # length instead of letting the forward pass fail on it.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # One tweet per forward pass, so the argmax over the class axis is a single id.
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_class.append(predicted_class_id)
    true_class.append(label)
    

975it [00:07, 126.35it/s]


In [33]:
# Score the imbalanced-data model on the balanced test set.
# This rebinds `results_unbalanced`, replacing the scores from the imbalanced test set.
results_unbalanced = model_evaluation(predicted_class, true_class)


In [34]:
# Plain-text dump of the 7-element metrics tuple returned by model_evaluation.
print(results_unbalanced)

({'accuracy': 0.5641025641025641}, {'precision': array([0.73202614, 0.43891403, 0.64210526])}, {'precision': 0.6043484780326885}, {'recall': array([0.34461538, 0.59692308, 0.75076923])}, {'recall': 0.564102564102564}, {'f1': array([0.46861925, 0.50586701, 0.69219858])}, {'f1': 0.5555616142545996})


## Model trained on Balanced Dataset

In [35]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Classifier fine-tuned on the balanced training data; tweets are scored one at a time.
tokenizer = AutoTokenizer.from_pretrained("J1mb0o/semantic-bert-balanced-dataset")
model = AutoModelForSequenceClassification.from_pretrained("J1mb0o/semantic-bert-balanced-dataset").to(device)

predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    # truncation=True keeps an unusually long tweet within the tokenizer's maximum
    # length instead of letting the forward pass fail on it.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # One tweet per forward pass, so the argmax over the class axis is a single id.
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_class.append(predicted_class_id)
    true_class.append(label)
    

975it [00:07, 127.19it/s]


In [36]:
# Score the balanced-data model on the balanced test set.
# This rebinds `results_balanced`, replacing the scores from the imbalanced test set.
results_balanced = model_evaluation(predicted_class, true_class)


In [37]:
# Plain-text dump of the 7-element metrics tuple returned by model_evaluation.
print(results_balanced)

({'accuracy': 0.598974358974359}, {'precision': array([0.57543103, 0.53409091, 0.66567164])}, {'precision': 0.5917311951215708}, {'recall': array([0.82153846, 0.28923077, 0.68615385])}, {'recall': 0.5989743589743589}, {'f1': array([0.67680608, 0.3752495 , 0.67575758])}, {'f1': 0.5759377201352566})


## Mistral 7b Instruct

In [38]:
def create_prompt1(text):
    """Build the direct sentiment-classification prompt for a single tweet.

    ``text`` is interpolated verbatim after ``Tweet:``; the formatted prompt
    string is returned. This repeats the ``create_prompt1`` defined earlier in
    the notebook with the same text and rebinds the name.
    """
    return f"""Your task is to classify a tweet sentiment as positive, negative or neutral only. 
    
    Tweet: {text}
    
    Just generate the JSON object without explanations. Don't forget to close the JSON object with a curly bracket.
    """

def create_prompt2(text):
    """Build the sentiment-classification prompt with an added "Think step by step." line.

    ``text`` is interpolated verbatim after ``Tweet:``; the formatted prompt
    string is returned. This repeats the ``create_prompt2`` defined earlier in
    the notebook with the same text and rebinds the name.
    """
    return f"""Your task is to classify a tweet sentiment as positive, negative or neutral only. 
    Think step by step. 
    
    Tweet: {text}
    
    Just generate the JSON object without explanations. Don't forget to close the JSON object with a curly bracket.
    """


In [39]:
from langchain.llms import Ollama
import string
import json
# Ollama client for the "mistral" model with temperature=0 (re-created for this section).
llm = Ollama(model="mistral", temperature=0)


### Evaluate on first prompt

In [40]:
predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    prompt = create_prompt1(text)
    raw_response = llm(prompt)
    try:
        sentiment = json.loads(raw_response)["sentiment"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # One malformed reply must not abort this long-running loop and discard
        # every prediction collected so far: keep the raw text so the validation
        # cell that follows prints it for inspection.
        sentiment = raw_response
    else:
        if isinstance(sentiment, str):
            # Accept harmless casing/whitespace variants such as "Positive".
            sentiment = sentiment.strip().lower()
    predicted_class.append(sentiment)
    true_class.append(label)


975it [13:55,  1.17it/s]


In [41]:
# Print every model answer that is not one of the three expected labels.
expected_labels = ["positive", "negative", "neutral"]
unexpected_answers = [answer for answer in predicted_class if answer not in expected_labels]
for answer in unexpected_answers:
    print(answer)

In [42]:
# Convert label strings to integer ids using their position in this list.
# NOTE(review): assumes the dataset's `label` column uses 0=negative, 1=neutral, 2=positive -- confirm.
label_order = ["negative", "neutral", "positive"]
predicted_class_id = list(map(label_order.index, predicted_class))

In [43]:
# Score the Mistral answers for prompt 1 on the balanced test set.
# This rebinds `mistral_results_prompt1`, replacing the scores from the imbalanced test set.
mistral_results_prompt1 = model_evaluation(predicted_class_id, true_class)

In [44]:
# import string
# import json
# text = test_dataset.sample(1)
# print(text["tweet"].values[0], text["label"].values[0])
# prompt = create_prompt(text["tweet"].values[0])
# response = llm(prompt)
# # response = ''.join(ch for ch in response.strip().lower() if ch not in string.punctuation)
# response = json.loads(response)
# print(response["sentiment"])


In [45]:
# Display the metrics tuple for prompt 1.
mistral_results_prompt1

({'accuracy': 0.6328205128205128},
 {'precision': array([0.72875817, 0.48523207, 0.84102564])},
 {'precision': 0.68500529282361},
 {'recall': array([0.68615385, 0.70769231, 0.50461538])},
 {'recall': 0.6328205128205129},
 {'f1': array([0.70681458, 0.57571965, 0.63076923])},
 {'f1': 0.6377678201209597})

### Evaluate on second prompt

In [46]:
predicted_class = []
true_class = []

# zip() has no length, so `total` is passed explicitly to give tqdm a real progress bar.
for text, label in tqdm.tqdm(zip(test_dataset["tweet"], test_dataset["label"]), total=len(test_dataset)):
    prompt = create_prompt2(text)
    raw_response = llm(prompt)
    try:
        sentiment = json.loads(raw_response)["sentiment"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # One malformed reply must not abort this long-running loop and discard
        # every prediction collected so far: keep the raw text so the validation
        # cell that follows prints it for inspection.
        sentiment = raw_response
    else:
        if isinstance(sentiment, str):
            # Accept harmless casing/whitespace variants such as "Positive".
            sentiment = sentiment.strip().lower()
    predicted_class.append(sentiment)
    true_class.append(label)


975it [14:07,  1.15it/s]


In [47]:
# Print every model answer that is not one of the three expected labels.
expected_labels = ["positive", "negative", "neutral"]
unexpected_answers = [answer for answer in predicted_class if answer not in expected_labels]
for answer in unexpected_answers:
    print(answer)

In [48]:
# Convert label strings to integer ids using their position in this list.
# NOTE(review): assumes the dataset's `label` column uses 0=negative, 1=neutral, 2=positive -- confirm.
label_order = ["negative", "neutral", "positive"]
predicted_class_id = list(map(label_order.index, predicted_class))

In [49]:
# Score the Mistral answers for prompt 2 on the balanced test set.
# This rebinds `mistral_results_prompt2`, replacing the scores from the imbalanced test set.
mistral_results_prompt2 = model_evaluation(predicted_class_id, true_class)

In [50]:
# Display the metrics tuple for prompt 2.
mistral_results_prompt2

({'accuracy': 0.6369230769230769},
 {'precision': array([0.72459016, 0.49036403, 0.84236453])},
 {'precision': 0.6857729072166873},
 {'recall': array([0.68      , 0.70461538, 0.52615385])},
 {'recall': 0.6369230769230769},
 {'f1': array([0.7015873 , 0.57828283, 0.64772727])},
 {'f1': 0.6425324675324676})

## Export results to a dataframe

In [51]:
# Column order of the comparison table; one row per evaluated model.
# NOTE(review): the "*_{Weighted}" columns are filled from the values that
# model_evaluation computes with average="macro" -- confirm the intended label.
table_columns = [
    "Model",
    "Accuracy",
    "Pre_{Negative}", "Pre_{Neu}", "Pre_{Pos}", "Pre_{Weighted}",
    "Rec_{Negative}", "Rec_{Neu}", "Rec_{Pos}", "Rec_{Weighted}",
    "F1_{Negative}", "F1_{Neu}", "F1_{Pos}", "F1_{Weighted}",
]


def _results_row(model_name, results):
    """Flatten one results tuple into a row laid out like ``table_columns``.

    ``results`` is read by position: 0 is accuracy; 1/2, 3/4 and 5/6 are the
    per-class and averaged dicts for precision, recall and f1 respectively.
    """
    row = [model_name, results[0]["accuracy"]]
    for position, key in ((1, "precision"), (3, "recall"), (5, "f1")):
        per_class = results[position][key]
        row.extend([per_class[0], per_class[1], per_class[2], results[position + 1][key]])
    return row


# The spelling `resutls_df` is kept because the export cell refers to this name.
resutls_df = pd.DataFrame(
    data=[
        _results_row("DistilBERT (unbalanced)", results_unbalanced),
        _results_row("DistilBERT (balanced)", results_balanced),
        _results_row("Mistral (prompt 1)", mistral_results_prompt1),
        _results_row("Mistral (prompt 2)", mistral_results_prompt2),
    ],
    columns=table_columns,
)

In [52]:
# Write the comparison table to CSV and LaTeX files, omitting the index column.
resutls_df.to_csv("results-balanced.csv", index=False)
resutls_df.to_latex("results-balanced.tex", index=False)

# Random Forest 

In [3]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open("random_forest_model.sav", "rb") as model_file:
    rforest = pickle.load(model_file)