In [1]:
%cd ../
import numpy as np
import pandas as pd
from tqdm import tqdm
%load_ext autoreload
%autoreload 2

/home/harshapk/Projects/CSS_Project


In [2]:
import pickle
with open("./data/data_processed/cure_data_cleaned.pkl", "rb") as f:
    dataset = pickle.load(f)

In [3]:
dataset = dataset[dataset["lang"] == "en"]

In [4]:
dataset = dataset[["tweet_text", "retweets", "likes", "user_description", "user_following",
"user_followers", "user_verified", "user_total_tweets", "user_total_likes", "label"]]

In [5]:
dataset["label"].unique()

array([0, 1, 2])

In [6]:
from css_proj.models.bertembed import get_sentence_embed
import transformers as ppb
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
from css_proj.models.clean_tweet import process_text
all_tw_text = dataset["tweet_text"].tolist()
all_tw_text_embed = [get_sentence_embed([process_text(text)], model, tokenizer).numpy() 
                     for text in tqdm(all_tw_text)]

100%|█████████████████████████████████████████████████████████████████████████████| 1970/1970 [02:46<00:00, 11.81it/s]


In [8]:
all_tw_text_embed = np.array(all_tw_text_embed)
all_tw_text_embed = all_tw_text_embed.squeeze()

In [9]:
all_tw_desc = dataset["user_description"].tolist()
all_tw_desc_embed = [get_sentence_embed([process_text(text)], model, tokenizer).numpy() for text in tqdm(all_tw_desc)]
all_tw_desc_embed = np.array(all_tw_desc_embed).squeeze()

100%|█████████████████████████████████████████████████████████████████████████████| 1970/1970 [02:50<00:00, 11.57it/s]


In [10]:
with open("./data/data_processed/covid_cure_embed.pkl", "wb") as f:
    pickle.dump([all_tw_text_embed, all_tw_desc_embed], f)

In [6]:
with open("./data/data_processed/covid_cure_embed.pkl", "rb") as f:
    all_tw_text_embed, all_tw_desc_embed = pickle.load(f)

In [7]:
dataset["num_label"] = dataset["label"].apply(lambda x: 1 if x == 0 else 0)

In [14]:
from empath import Empath
lexicon = Empath()
lexicon.create_category("medical", ["doctor", "physician", "hospital", "health", "disease"])

["doctors", "doctor", "treatment", "patient", "illness", "hospital", "health", "surgery", "condition", "medication", "patients", "disease", "treatments", "sickness", "labour", "labor", "chemo", "operation", "recovery", "surgeon", "Doctors", "psychiatrist", "medicine", "clinic", "pregnancy", "therapy", "cancer", "intensive_care", "nurse", "symptoms", "ward", "diagnosis", "therapist", "trauma", "midwife", "surgeons", "physician", "blood_tests", "injuries", "critical_condition", "infection", "psychologist", "procedure", "chemotherapy", "tumor", "pneumonia", "local_hospital", "healer", "emergency_room", "injury", "injection", "coma", "test_results", "donor", "serious_injuries", "medically", "prescribed", "emergency_surgery", "autopsy", "nurses", "side_effects", "Alzheimer", "medications", "healers", "cure", "minor_injuries", "flu", "meds", "discharged", "suffering", "scans", "pack_doctor", "tests", "cured", "complications", "life_support", "blood_transfusion", "ICU", "blood_test", "head_in

In [15]:
def get_words_from_txt(path):
    return pd.read_table(path, header=None).to_numpy().squeeze()
positive_words = get_words_from_txt("data/data_raw/positive-words.txt")
negative_words = get_words_from_txt("data/data_raw/negative-words.txt")
lexicon.create_category("positive_sent", list(positive_words))
lexicon.create_category("negative_sent", list(negative_words))

["remarkable", "admirable", "questionable", "exceptional", "astounding", "certainly", "yet", "Though", "desirable", "extraordinary", "wise", "although", "practical", "simplicity", "gifted", "pleasing", "humble", "adequate", "honorable", "outstanding", "respectable", "rarity", "bearing", "quality", "enlightened", "Yet", "potential", "agreeable", "certain", "appeal", "sensible", "lacking", "Yet", "appreciative", "endowed", "satisfactory", "valued", "truly", "devoted", "confident", "refined", "perceived", "formidable", "own_right", "successful", "knowledgeable", "astonishing", "intelligence", "Although", "ambition", "outrageous", "powerful", "thorough", "appealing", "promising", "costly", "ideal", "worthy", "wicked", "wondrous", "undoubtedly", "noble", "boast", "however", "enchanting", "superficial", "though", "miraculous", "spiritual", "potential", "primitive", "grateful", "though", "flawed", "superior", "splendid", "unlike", "inadequate", "fascinating", "individual", "Moreover", "reliab

In [19]:
lexicon.analyze(dataset.iloc[0]["tweet_text"], categories=["positive_sent", "negative_sent"])

{'positive_sent': 0.0, 'negative_sent': 1.0}

In [20]:
dataset["user_medical"] = dataset["user_description"].apply(
    lambda x: lexicon.analyze(x, categories=["medical"])["medical"])

In [21]:
dataset["positive_sent"] = dataset["tweet_text"].apply(
    lambda x: lexicon.analyze(x, categories=["positive_sent", "negative_sent"])["positive_sent"])
dataset["negative_sent"] = dataset["tweet_text"].apply(
    lambda x: lexicon.analyze(x, categories=["positive_sent", "negative_sent"])["negative_sent"])

In [22]:
dataset_array = dataset[["retweets", "likes", "user_following",
"user_followers", "user_verified", "user_total_tweets", "user_total_likes", "user_medical",
                         "positive_sent", "negative_sent","num_label"]]
dataset_array = np.array(dataset_array).astype(float)

In [23]:
dataset_array = np.hstack([all_tw_desc_embed, all_tw_text_embed, dataset_array])
dataset_x = dataset_array[:, :-1]
dataset_y = dataset_array[:, -1]

In [24]:
# Train test split
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(dataset_x, dataset_y, test_size=0.4, random_state=42)

In [25]:
# Normalize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [26]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [27]:
# Evaluate model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_y = logreg.predict(test_x)
print(accuracy_score(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y))
print(f1_score(test_y, pred_y))

0.6941624365482234
[[402 109]
 [132 145]]
              precision    recall  f1-score   support

         0.0       0.75      0.79      0.77       511
         1.0       0.57      0.52      0.55       277

    accuracy                           0.69       788
   macro avg       0.66      0.66      0.66       788
weighted avg       0.69      0.69      0.69       788

0.5461393596986817


In [28]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train_x, train_y)

RandomForestClassifier(random_state=42)

In [29]:
# Evaluate model
pred_y = rfc.predict(test_x)
print(accuracy_score(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y))
print(f1_score(test_y, pred_y))

0.700507614213198
[[487  24]
 [212  65]]
              precision    recall  f1-score   support

         0.0       0.70      0.95      0.80       511
         1.0       0.73      0.23      0.36       277

    accuracy                           0.70       788
   macro avg       0.71      0.59      0.58       788
weighted avg       0.71      0.70      0.65       788

0.3551912568306011


In [30]:
# SVM
from sklearn.svm import SVC
svc = SVC(kernel="linear", random_state=42)
svc.fit(train_x, train_y)

SVC(kernel='linear', random_state=42)

In [31]:
# Evaluate model
pred_y = svc.predict(test_x)
print(accuracy_score(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y))
print(f1_score(test_y, pred_y))

0.6687817258883249
[[374 137]
 [124 153]]
              precision    recall  f1-score   support

         0.0       0.75      0.73      0.74       511
         1.0       0.53      0.55      0.54       277

    accuracy                           0.67       788
   macro avg       0.64      0.64      0.64       788
weighted avg       0.67      0.67      0.67       788

0.5396825396825398


In [32]:
# SVM with RBF kernel
svc_rbf = SVC(kernel="rbf", random_state=42)
svc_rbf.fit(train_x, train_y)

SVC(random_state=42)

In [33]:
# Evaluate model
pred_y = svc_rbf.predict(test_x)
print(accuracy_score(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y))
print(f1_score(test_y, pred_y))

0.6954314720812182
[[491  20]
 [220  57]]
              precision    recall  f1-score   support

         0.0       0.69      0.96      0.80       511
         1.0       0.74      0.21      0.32       277

    accuracy                           0.70       788
   macro avg       0.72      0.58      0.56       788
weighted avg       0.71      0.70      0.63       788

0.3220338983050847


In [34]:
# Neural Network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=42, hidden_layer_sizes=(100, 100, 100))
mlp.fit(train_x, train_y)

MLPClassifier(hidden_layer_sizes=(100, 100, 100), random_state=42)

In [35]:
# Evaluate model
pred_y = mlp.predict(test_x)
print(accuracy_score(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y))
print(f1_score(test_y, pred_y))

0.6979695431472082
[[414  97]
 [141 136]]
              precision    recall  f1-score   support

         0.0       0.75      0.81      0.78       511
         1.0       0.58      0.49      0.53       277

    accuracy                           0.70       788
   macro avg       0.66      0.65      0.66       788
weighted avg       0.69      0.70      0.69       788

0.5333333333333332
