In [1]:
import os
import pandas as pd
from tqdm import tqdm
import warnings

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from src.preprocessor import data_reader, data_prep
from src.ml_functions import emotion_prediction

In [6]:
# Notebook-wide configuration.
# Silence every warning category for the rest of the session. This is a blanket
# filter, so library warnings raised by later cells will not be shown either.
warnings.simplefilter("ignore")

# Never truncate long text columns when a DataFrame is displayed.
pd.options.display.max_colwidth = None

# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()

In [7]:
# The project root is taken to be the parent of the notebook's working directory;
# data/ and models/ are resolved relative to it.
#
# os.path.dirname is used instead of splitting the path on the separator and
# re-joining all but the last component: that manual approach collapses the parent
# to '' when the working directory sits directly under the filesystem root (giving
# a *relative* "data" path) and produces a drive-relative path such as "C:data" on
# Windows. dirname keeps the result absolute in both cases.
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, "data")
models_dir = os.path.join(project_root, "models")

In [8]:
# Training frame: rows whose "emotion" value is "neutral" are excluded right after loading.
train_csv = os.path.join(data_dir, "dataset.csv")
df_train = pd.read_csv(train_csv).loc[lambda frame: frame["emotion"].ne("neutral")]

# Test frame: loaded as-is, without any filtering.
test_csv = os.path.join(data_dir, "test_set.csv")
df_test = pd.read_csv(test_csv)

In [9]:
# Load the additional datasets through the project helper `data_reader`.
# It returns two values that are consumed pairwise by the next cell: `keys[i]` is
# used as the variable name for `dfs[i]` (presumably one name per CSV file found in
# the directory — verify against src.preprocessor).
#
# The tqdm/pandas display configuration is intentionally not repeated here: it is
# already applied once in the configuration cell near the top of the notebook.
dfs, keys = data_reader(os.path.join(data_dir, "additional_datasets"))

skipping group_test.csv!
all_datasets loaded!
all_datasets_evenly_distributed loaded!
all_datasets_stem_no_stopwords loaded!
all_datasets_without_reddit loaded!
all_datasets_without_reddit_evenly_distributed loaded!
all_datasets_without_reddit_evenly_distributed_stem_no_stopwords loaded!
all_datasets_without_reddit_stem_no_stopwords loaded!


In [10]:
# Bind every loaded frame to a module-level variable named after its key, so that
# later cells can refer to e.g. `all_datasets` directly. `keys` and `dfs` are paired
# by position; if a key repeats, the last frame with that key wins.
globals().update(
    {variable_name: dfs[i] for i, variable_name in enumerate(keys)}
)

In [12]:
# Run the project's `data_prep` helper on the combined dataset. The second positional
# argument is passed as None; its meaning is defined in src.preprocessor and is not
# visible from this notebook.
# NOTE(review): the returned `df` is not read by the cells visible after this one —
# features and labels are taken from `all_datasets` directly. Confirm whether
# `data_prep` modifies `all_datasets` in place, or whether `df` was meant to be the
# frame used downstream.
df = data_prep(all_datasets, None)

100%|██████████| 646214/646214 [01:52<00:00, 5756.62it/s]


emotion
happiness    171812
sadness      117255
anger         57236
fear          43277
surprise      18657
disgust       10858
Name: count, dtype: int64


In [13]:
# Name of the text column used as model input; it is also forwarded to
# `emotion_prediction` in the evaluation cell.
feature = 'sentence'

# Inputs and labels are both read straight from `all_datasets`.
X, y = all_datasets[feature], all_datasets['emotion']

# Display both shapes as a quick check that inputs and labels line up.
X.shape, y.shape

((646214,), (646214,))

In [14]:
# Hold out 20% of the rows for validation. The seed is fixed so the split is
# reproducible; no `stratify` argument is given, so the split is not stratified
# by label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Vectorizers under comparison:
#   - plain token counts with default settings
#   - TF-IDF over 1- to 3-grams, with IDF smoothing switched off
cv_vect = CountVectorizer()
tfidf_vect = TfidfVectorizer(ngram_range=(1, 3), smooth_idf=False)

# Classifiers under comparison:
#   - multinomial Naive Bayes with default settings
#   - L2-penalised logistic regression with balanced class weights
nb = MultinomialNB()
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
)

In [16]:
# Candidates for the evaluation grid; every model is paired with every vectorizer
# in the evaluation cell. Order here determines the order of the printed results.
models, vects = [lr, nb], [cv_vect, tfidf_vect]

In [12]:
%%time
for model in models:
    for vect in vects:
        model_name = 'Logistic Regression' if model == lr else 'Naive Bayes'
        vectorizer_name = 'Count Vectorizer' if vect == cv_vect else 'Tfidf Vectorizer'
        print(f'{model_name} with {vectorizer_name}')
        emotion_prediction(df_test,model,vect,X_train,X_test,y_train,y_test,feature)

Logistic Regression with Count Vectorizer
F1 score on training set: 0.8137072090860415
F1 score on validation set: 0.7979001272093855
F1 score on test set: 0.5532366394476277 	
Logistic Regression with Tfidf Vectorizer
F1 score on training set: 0.8175747962590142
F1 score on validation set: 0.7922066404041194
F1 score on test set: 0.5838784323481987 	
Naive Bayes with Count Vectorizer
F1 score on training set: 0.8095896510379463
F1 score on validation set: 0.7783491207206138
F1 score on test set: 0.5216332315811568 	
Naive Bayes with Tfidf Vectorizer
F1 score on training set: 0.691843718843427
F1 score on validation set: 0.5975544304661733
F1 score on test set: 0.3271902727369587 	
