# Logistic Regression

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Using TF-IDF on the cleaned tweets

In [3]:
df = pd.DataFrame()
df = pd.read_csv('Datasets/train_tweets_clean.csv', encoding = 'utf-8', usecols = ['SpellCheckTweets', 'Sentiment'])
df.head()

Unnamed: 0,Sentiment,SpellCheckTweets
0,1,ll sown sure hayne chill
1,1,life eh hard good happy us may nagpapasaya na ...
2,1,phrase rolling totally make sense
3,0,companion dragonlance meeting extent vol mass ...
4,0,word advanced window advanced short course sho...


In [3]:
df['SpellCheckTweets'] = df['SpellCheckTweets'].astype('str')

In [4]:
X = df['SpellCheckTweets']
y = df['Sentiment']


vectorizer = TfidfVectorizer()
tf_idf_X = vectorizer.fit_transform(X)
tf_idf_X

<199998x47617 sparse matrix of type '<class 'numpy.float64'>'
	with 1336499 stored elements in Compressed Sparse Row format>

Splitting the data with an 80:20 ratio

In [5]:
# We are performing a 80:20 split.

X_train1, X_test1, y_train1, y_test1 = train_test_split(tf_idf_X, y, random_state=200)

Fitting a logistic regression model to the transformed data.

In [6]:
lr = LogisticRegression(solver='saga')

In [7]:
lr.fit(X_train1, y_train1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
predicted_test = lr.predict(X_test1)
predicted_proba_test = lr.predict_proba(X_test1)

In [9]:
predicted_test = lr.predict(X_test1)

y_test = y_test.astype('category')
print(metrics.classification_report(y_test, predicted_test,
    labels=y_test.cat.categories.tolist()))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78     24890
           1       0.77      0.81      0.79     25110

   micro avg       0.78      0.78      0.78     50000
   macro avg       0.78      0.78      0.78     50000
weighted avg       0.78      0.78      0.78     50000



This model has a precision of 0.80 for negative tweets and 0.77 for positive tweets.

In [12]:
ConMatTest = metrics.confusion_matrix(y_test1, predicted_test)
accuracy = lr.score(X_test1, y_test1)

print(ConMatTest)
print(f'Accuracy = {accuracy:.2f}')

[[18925  5965]
 [ 4849 20261]]
Accuracy = 0.78


This model has an accuracy of 78%

## Validation Set

Splitting the data into training:testing sets with a ratio 80:20.

Then, by splitting the training set, we will create a validation set, so that the overall ratio of train:validation:test is 60:20:20

In [33]:
# Splitting to create train and test sets
X_train, X_test, y_train, y_test = train_test_split(tf_idf_X, y, test_size=0.2, random_state=200)

# Splitting to create train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=200)

Comparing the different hyperparameters offered by sklearn's logistic regression model 

In [14]:
lr = LogisticRegression(solver='saga')
lr.fit(X_train, y_train)
print('Accuracy: ' + str(lr.score(X_val,y_val)) + ' Solver: saga')
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train, y_train)
print('Accuracy: ' + str(lr.score(X_val,y_val)) + ' Solver: newton-cg')
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
print('Accuracy: ' + str(lr.score(X_val,y_val)) + ' Solver: lbfgs')
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
print('Accuracy: ' + str(lr.score(X_val,y_val)) + ' Solver: liblinear')
lr = LogisticRegression(solver='sag')
lr.fit(X_train, y_train)
print('Accuracy: ' + str(lr.score(X_val,y_val)) + ' Solver: sag')

Accuracy: 0.815175 Solver: saga
Accuracy: 0.815175 Solver: newton-cg




Accuracy: 0.814875 Solver: lbfgs
Accuracy: 0.815175 Solver: liblinear
Accuracy: 0.815175 Solver: sag


Here we can see that all the 'solvers' perform equally as well (accuracy = 0.815175), except 'lbfgs' which has a slightly lower accuracy (0.814875).

Testing each combination of solver, penalty and C by using a gridsearch technique

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
lr = LogisticRegression()
solvers = ['saga','newton-cg','lbfgs','liblinear','sag']
penalties = ['l1', 'l2', 'elasticnet', 'none']
C_values = [0.01,0.1,1,10,100]
grid = dict(solver=solvers, penalty=penalties, C=C_values)
grid_search = GridSearchCV(lr, grid, n_jobs=-1, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train1, y_train1)

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handl

In [34]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("{} ({}) with: {}".format(mean, stdev, param))

0.6288750516673556 (0.0011276496629218367) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'newton-cg'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'lbfgs'}
0.6292550567340898 (0.0013584615751696205) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'sag'}
0.7212962839504526 (0.0010768313574108335) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}
0.7212696169282257 (0.0010676821474137374) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
0.7212762836837825 (0.001049398947837482) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.7215629541727223 (0.0011508988038740615) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.7212829504393392 (0.0010585380992763512) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'sag'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'elasticnet', 'solver': 'saga'}
0.0 (0.0) with: {'C': 0.01, 'penalty': 'elasticne

In [9]:
print("Best combination: {} with accuracy of {}".format(grid_result.best_params_,grid_result.best_score_))

Best combination: {'C': 1, 'penalty': 'l2', 'solver': 'saga'} with accuracy of 0.7811437485833145
