## Introduction

One possible baseline approach to NLP tasks such as sentiment analysis is to build a bag-of-words representation with tf-idf weighting and then train a classification/regression model on top of it.

## Preparation

In [1]:
!pip install lets_plot pactools -q

[0m

In [2]:
from lets_plot import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import os
import pickle
import time
import warnings

# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides library warnings (e.g. convergence or
# data-shape notices), so problems in later cells may go unnoticed.
warnings.filterwarnings('ignore')

In [3]:
# Random seed shared by the train/test split and the logistic regression model.
seed = 42

In [4]:
# Initialise lets-plot so that plots are rendered as HTML in the notebook output.
LetsPlot.setup_html()

## Data Preprocessing

In [5]:
# Location of the pickled emotions dataset (Kaggle input directory).
path_to_data_pkl = '/kaggle/input/emotions/emotions_data.pkl'

In [6]:
# Load the pickled dataset and report its dimensions.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted files.
with open(path_to_data_pkl, mode='rb') as pkl_file:
    data = pickle.load(pkl_file)

print(f'Got data of shape : {data.shape}')

Got data of shape : (416809, 2)


In [7]:
# Give the target column the generic name `label`.
# `rename` returns a new frame; rebinding `data` avoids mutating the loaded
# frame in place via `inplace=True`.
data = data.rename(columns={'emotions': 'label'})
data.head()

Unnamed: 0,text,label
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


In [8]:
# Count missing values per column.
data.isna().sum()

text     0
label    0
dtype: int64

In [9]:
# Number of rows per label, most frequent label first.
frequency = data['label'].value_counts()
frequency

joy         141067
sadness     121187
anger        57317
fear         47712
love         34554
surprise     14972
Name: label, dtype: int64

In [10]:
# Turn the per-label counts into a two-column frame for plotting.
# NOTE: this rebinds `frequency` from a Series to a DataFrame, so the cell
# cannot be re-run on its own - re-run the value_counts cell first.
frequency = pd.DataFrame(
    data={'Labels': frequency.index, 'Total': frequency.to_numpy()},
)

In [11]:
# Bar chart of label counts; `weight` makes each bar as tall as the label's total.
(
    ggplot(frequency, aes(x=frequency.Labels, weight=frequency.Total, fill=frequency.Labels))
    + geom_bar()
    + labs(x='Label', y='Times Occured')
)

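We have text messages, each labelled with one of 6 emotion classes. The classes are imbalanced: `joy` and `sadness` dominate, while `surprise` is the rarest.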

In [12]:
# Map the string emotion labels to integer class ids.
# LabelEncoder expects a 1-D target of shape (n_samples,), so the column is
# passed as-is instead of being reshaped into an (n_samples, 1) column vector.
# uint8 is sufficient because there are only 6 classes.
label_encoder = LabelEncoder()
labels_data = label_encoder.fit_transform(data['label'].values).astype(np.uint8)

In [13]:
# Overwrite the string labels with their integer codes.
# The original names remain available through `label_encoder`
# (e.g. `label_encoder.inverse_transform`).
data['label'] = labels_data

The text messages are in English. To begin with, we need to lowercase them, remove stop words, and extract token-count features.

In [14]:
# Lower-case the text so that matching against the stop-word list is case-insensitive.
data['text'] = data['text'].str.lower()

# Build the stop-word lookup once. A set gives O(1) membership tests and avoids
# re-reading the NLTK word list for every single token of every message.
english_stopwords = set(stopwords.words('english'))

# Keep only the whitespace-separated tokens that are not stop words.
data['text'] = data['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in english_stopwords)
)

In [15]:
# Preview the cleaned text together with the integer-encoded labels.
data.head()

Unnamed: 0,text,label
27383,feel awful job get position succeed happen,4
110083,im alone feel awful,4
140764,ive probably mentioned really feel proud actua...,2
100071,feeling little low days back,4
2837,beleive much sensitive peoples feelings tend c...,3


## TF-IDF

In [16]:
# Learn the vocabulary and build the token-count (bag-of-words) matrix.
# NOTE(review): this is fitted on the full dataset, before the train/test split;
# the later grid search in the visible cells builds its own TfidfVectorizer
# and does not reuse `counts` or `bag_of_words`.
counts = CountVectorizer()

bag_of_words = counts.fit_transform(data['text'])

Now we compute the tf-idf weights. We also apply `l2` normalization to each document vector (every row is scaled to unit length), which generally helps linear models later on.

In [17]:
# Re-weight the raw counts with smoothed idf and scale each row to unit L2 norm.
tf_idf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

# The transformed matrix is only displayed here; it is not stored in a variable.
tf_idf.fit_transform(bag_of_words)

<416809x75132 sparse matrix of type '<class 'numpy.float64'>'
	with 3772102 stored elements in Compressed Sparse Row format>

We are also going to use a simple tokenizer with stemming, i.e. reducing words to their root form.

In [18]:
def tokenizer(text):
    """Split ``text`` on whitespace and Porter-stem every resulting token.

    Args:
        text: The string to tokenize.

    Returns:
        A list holding the stemmed form of each whitespace-separated token,
        in the original order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, text.split()))

## Training & Cross-Validation

In [19]:
# Fraction of the dataset held out as the test set.
TEST_SIZE = 0.2

In [20]:
# Hold out TEST_SIZE of the rows for testing.
# The target is passed as the 1-D `label` Series rather than a one-column
# DataFrame, which is the shape scikit-learn estimators and metrics expect.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=TEST_SIZE, random_state=seed
)

Next we run a grid search over the hyperparameters of a logistic regression model. This model was chosen because it is fast to train and gives stable class-probability estimates.

In [21]:
# Vectorizer: the text was already lower-cased during preprocessing, so that
# step is switched off here.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Pipeline: tf-idf features feed a multinomial logistic regression.
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=seed, multi_class='multinomial', solver='saga')),
])

# Search space: 2 n-gram ranges x 2 penalties x 2 values of C = 8 candidates,
# all using the stemming tokenizer.
param_grid = [{
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__tokenizer': [tokenizer],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [0.1, 1.0],
}]

# 3-fold cross-validated grid search, scored by macro-averaged F1.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=5,
    n_jobs=1,
)

In [22]:
# Run the grid search on the training split: 8 candidates x 3 folds = 24 fits.
# This is the expensive cell of the notebook (each fit took about two minutes
# in the recorded run).
gs_lr_tfidf.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END clf__C=0.1, clf__penalty=l1, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer at 0x7fa9e680ab90>;, score=0.836 total time= 1.9min
[CV 2/3] END clf__C=0.1, clf__penalty=l1, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer at 0x7fa9e680ab90>;, score=0.834 total time= 1.9min
[CV 3/3] END clf__C=0.1, clf__penalty=l1, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer at 0x7fa9e680ab90>;, score=0.836 total time= 2.0min
[CV 1/3] END clf__C=0.1, clf__penalty=l1, vect__ngram_range=(1, 2), vect__tokenizer=<function tokenizer at 0x7fa9e680ab90>;, score=0.847 total time= 2.3min
[CV 2/3] END clf__C=0.1, clf__penalty=l1, vect__ngram_range=(1, 2), vect__tokenizer=<function tokenizer at 0x7fa9e680ab90>;, score=0.846 total time= 2.3min
[CV 3/3] END clf__C=0.1, clf__penalty=l1, vect__ngram_range=(1, 2), vect__tokenizer=<function tokenizer at 0x7fa9e680ab90>;, score=0.844 total time= 2.3min
[CV 

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(multi_class='multinomial',
                                                           random_state=42,
                                                           solver='saga'))]),
             n_jobs=1,
             param_grid=[{'clf__C': [0.1, 1.0], 'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1), (1, 2)],
                          'vect__tokenizer': [<function tokenizer at 0x7fa9e680ab90>]}],
             scoring='f1_macro', verbose=5)

In [23]:
# Report the winning hyperparameters and their mean cross-validated macro F1.
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
print(f'Best f1_macro: {gs_lr_tfidf.best_score_:.3f}')

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 2), 'vect__tokenizer': <function tokenizer at 0x7fa9e680ab90>}
Best f1_macro: 0.865


## Evaluating

Now let's evaluate our baseline logistic regression model (the best estimator found during cross-validation). Later we will compare its accuracy and F1 score on the test data against a more complex model, BERT.

In [24]:
# Take the best pipeline found by the grid search (vectorizer + classifier)
# and predict labels for the held-out test texts.
log_reg_clf = gs_lr_tfidf.best_estimator_

y_pred = log_reg_clf.predict(X_test)

In [25]:
# Test-set metrics: overall accuracy, and macro-averaged F1 (every class
# counts equally regardless of how many samples it has).
test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='macro')

In [26]:
# Print the baseline's test-set scores.
print(f'Test Accuracy : {test_acc}')
print(f'Test F1-Score : {test_f1}')

Test Accuracy : 0.9049327031501163
Test F1-Score : 0.8659889474270882
