# Logistic Regression

### Import relevant libraries

In [None]:
# import relevant libraries

# for data manipulation
import numpy as np
import pandas as pd

# split data into training, validation and test set
from sklearn.model_selection import train_test_split

# for counter
from tqdm import tqdm

# for classifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# for oversampling
from imblearn.over_sampling import RandomOverSampler

### Import data after pre-processing

In [None]:
# load the pre-processed (tokenized) reviews
# the relative path assumes the CSV sits in the notebook's working directory
df = pd.read_csv('df_tokenized.csv')

In [None]:
# check shape as (number of reviews, number of columns)
df.shape

(660645, 2)

### Split into training, validation and test set

##### Check dataframe

In [None]:
# preview the first rows: one stringified token list and one sentiment label per review
df.head()

Unnamed: 0,tokens,sentiment
0,"['great', 'locat', 'close', 'main', 'public', ...",positive
1,"['famili', 'four', 'thi', 'flat', 'can', 'acco...",positive
2,"['place', 'wonder', 'plenti', 'room', 'us', 'h...",positive
3,"['great', 'locat', 'truli', 'onli', 'coupl', '...",positive
4,"['great', 'place', 'perfect', 'weekend', 'not'...",positive


In [None]:
# class distribution of the sentiment labels (shows how imbalanced the data is)
df['sentiment'].value_counts()

sentiment
positive    654007
negative      6638
Name: count, dtype: int64

In [None]:
# encode the target as integers: 1 for a positive review, 0 for everything else
is_positive = df['sentiment'] == 'positive'
y = np.where(is_positive, 1, 0)

##### Split 60:20:20

In [None]:
# hold out 20% of the reviews as the test set, keeping the class ratio intact
X_train, X_test, y_train, y_test = train_test_split(
    df['tokens'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# create training and validation set
# 25% of the remaining 80% equals 20% of all reviews, which gives the 60:20:20 split.
# Stratify exactly like the test split above: the negative class is only about 1% of
# the data, so an unstratified split lets its share drift between the two parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=42,
)

In [None]:
# share of positive labels in the training, validation and test split (in that order)
for split_labels in (y_train, y_val, y_test):
    print(split_labels.sum()/split_labels.shape[0])

0.9900753556499078
0.9895859349575037
0.9899492162961954


The classes are strongly imbalanced (about 99% of the reviews are positive), so a technique to counter the imbalance has to be applied.

In [None]:
# the regularization parameter C is tuned further below with grid-search 5-fold cross
# validation, so no separate validation set is needed: fold it back into the training data
# NOTE: this cell rebinds X_train / y_train; running it twice appends the validation rows twice
X_train = pd.concat([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val])

### Create feature space (DTM)

A computer cannot work with raw text directly, so the texts have to be converted into a numeric feature space (a document-term matrix).

In [None]:
# initiate CountVectorizer with its default settings
vectorizer = CountVectorizer()

In [None]:
# learn the vocabulary from the training set only, so that no test-set terms enter the feature space
vectorizer.fit(X_train)

In [None]:
# create document-term matrix for the training set
# NOTE: X_train is rebound from the text Series to the matrix, so run this cell only once per kernel session
X_train = vectorizer.transform(X_train)

In [None]:
# transform the test set with the vocabulary learned on the training set
# NOTE: X_test is rebound from the text Series to the matrix, so run this cell only once per kernel session
X_test = vectorizer.transform(X_test)

In [None]:
# inspect a small slice of the learned feature names
# (underscore-joined names such as 'ac_live_room' look like multi-word tokens — TODO confirm against the pre-processing)
vectorizer.get_feature_names_out()[1000:1010]

array(['ac_help', 'ac_host', 'ac_hot', 'ac_howev', 'ac_huge', 'ac_kept',
       'ac_live', 'ac_live_room', 'ac_make', 'ac_much'], dtype=object)

In [None]:
# check the number of unique terms in the learned vocabulary
vectorizer.get_feature_names_out().shape

(302023,)

### Apply oversampling to tackle class imbalance 

##### Oversampling

Randomly oversample the minority (negative) class so that there are more observations of it. Observations are added until half of the training documents have negative sentiment.

In [None]:
# oversample the minority class in the training data only; X_test / y_test keep the original class ratio
# NOTE: this cell rebinds X_train / y_train, so run it only once per kernel session
# NOTE(review): the resampled rows are later split into cross-validation folds by the grid search;
# repeated copies of one review have to stay in the same fold, otherwise validation scores are inflated
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
# check new ratio of sentiment labels
print(y_train.sum()/y_train.shape[0])

0.5


### Logistic Regression

First, build a logistic regression classifier without any hyperparameter tuning.

In [None]:
# baseline Logistic Regression classifier: fixed C, generous iteration budget, fixed seed
logistic_classifier = LogisticRegression(
    C=1,
    max_iter=1000,
    random_state=42,
)

In [None]:
# train the classifier on the oversampled training data
logistic_classifier.fit(X_train, y_train)

In [None]:
# predict on the test set, which was not oversampled and keeps the original class ratio
y_pred = logistic_classifier.predict(X_test)

In [None]:
# evaluate model performance on the test set
# accuracy has no averaging mode; the other three metrics are macro-averaged over both classes
print("Accuracy: ", accuracy_score(y_test, y_pred))
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, average='macro'))

Accuracy:  0.9924543438609238
Precision:  0.7961079641485221
Recall:  0.8896014079789929
F1:  0.836293215203429


In [None]:
# look at the classification report: precision / recall / F1 per class on the test set
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.78      0.68      1328
           1       1.00      0.99      1.00    130801

    accuracy                           0.99    132129
   macro avg       0.80      0.89      0.84    132129
weighted avg       0.99      0.99      0.99    132129



In [None]:
# have a look at the confusion matrix
# rows are the true labels (0 = negative, 1 = positive), columns are the predicted labels
confusion_matrix(y_test, y_pred)

array([[  1042,    286],
       [   711, 130090]])

##### Grid-search to optimize regularization parameter

In [None]:
# build a fresh, unfitted Logistic Regression classifier for the grid search
# (this rebinds the name that held the baseline model; C and max_iter are supplied via param_grid below)
logistic_classifier = LogisticRegression(random_state=42)

In [None]:
# define hyperparameters to tune
param_grid = {
    # C is the INVERSE of the regularization strength: smaller values regularize more strongly
    'C': [0.005,0.01,0.025,0.1,1],
    'max_iter': [1000]
}

# The training data was oversampled before this cell, so it contains many exact copies of
# the same negative reviews. Plain 5-fold CV puts copies of one review into both the
# training and the validation part of a split, which inflates the validation F1.
# oversampler.sample_indices_ holds, for every resampled row, the index of the original row
# it was drawn from; using it as the group id keeps all copies of a review in one fold.
duplicate_groups = oversampler.sample_indices_
assert duplicate_groups.shape[0] == X_train.shape[0], "groups must align with the resampled rows"

# grid search with stratified, grouped 5-fold cross validation
# (validation folds are still class-balanced, so scores are not directly comparable to the test set)
cv_splitter = StratifiedGroupKFold(n_splits=5)
grid_search = GridSearchCV(
    logistic_classifier,
    param_grid,
    cv=cv_splitter,
    scoring='f1_macro',
    refit=True,
    return_train_score=True,
    verbose=3,
)
grid_search.fit(X_train, y_train, groups=duplicate_groups)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END C=0.005, max_iter=1000;, score=(train=0.984, test=0.983) total time=  48.6s
[CV 2/5] END C=0.005, max_iter=1000;, score=(train=0.984, test=0.983) total time=  46.5s
[CV 3/5] END C=0.005, max_iter=1000;, score=(train=0.984, test=0.984) total time=  46.9s
[CV 4/5] END C=0.005, max_iter=1000;, score=(train=0.984, test=0.984) total time=  46.6s
[CV 5/5] END C=0.005, max_iter=1000;, score=(train=0.984, test=0.983) total time=  47.9s
[CV 1/5] END C=0.01, max_iter=1000;, score=(train=0.990, test=0.988) total time=  57.0s
[CV 2/5] END C=0.01, max_iter=1000;, score=(train=0.989, test=0.988) total time= 1.1min
[CV 3/5] END C=0.01, max_iter=1000;, score=(train=0.989, test=0.989) total time= 1.0min
[CV 4/5] END C=0.01, max_iter=1000;, score=(train=0.989, test=0.988) total time= 1.0min
[CV 5/5] END C=0.01, max_iter=1000;, score=(train=0.989, test=0.988) total time=  55.1s
[CV 1/5] END C=0.025, max_iter=1000;, score=(train=0.99

In [None]:
# compare the candidates: hyperparameters next to mean training / validation F1 across the folds
cv_results = grid_search.cv_results_
pd.concat(
    [
        pd.DataFrame(cv_results["params"]),
        pd.DataFrame(cv_results["mean_train_score"], columns=["Training Mean F1 Score"]),
        pd.DataFrame(cv_results["mean_test_score"], columns=["Validation Mean F1 Score"]),
    ],
    axis=1,
)

Unnamed: 0,C,max_iter,Training Mean F1 Score,Validation Mean F1 Score
0,0.005,1000,0.984,0.983478
1,0.01,1000,0.989087,0.98842
2,0.025,1000,0.993244,0.992405
3,0.1,1000,0.996175,0.99482
4,1.0,1000,0.998675,0.99657


In [None]:
# print the best hyperparameters (C, max_iter) and the corresponding cross-validated macro F1 score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

Best Hyperparameters:  {'C': 1, 'max_iter': 1000}
Best F1 Score:  0.996570168558056


In [None]:
# predict on the test set with the best estimator (the grid search was created with refit=True)
y_pred = grid_search.predict(X_test)

In [None]:
# look at the classification report, printed with four decimals
report = classification_report(y_test, y_pred , digits=4)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0     0.5944    0.7846    0.6764      1328
           1     0.9978    0.9946    0.9962    130801

    accuracy                         0.9925    132129
   macro avg     0.7961    0.8896    0.8363    132129
weighted avg     0.9938    0.9925    0.9930    132129



In [None]:
# look at the confusion matrix
# rows are the true labels (0 = negative, 1 = positive), columns are the predicted labels
confusion_matrix(y_test, y_pred)

array([[  1042,    286],
       [   711, 130090]])