# 1. Import packages and collect data

In [43]:
import pandas as pd
import numpy as np
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import pickle
import warnings
warnings.filterwarnings('ignore')

In [22]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitter30k_cleaned.csv')

In [3]:
df.shape

(30000, 2)

In [4]:
df.head(2)

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1


In [5]:
df['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

Our dataset is a balanced one and has 30000 tweets, which either have positive or negative sentiment. 

# 2. Preprocess and reduce dimension of the dataset

Since our dataset has already been preprocessed to remove irrelated information, such as #hashtags and @mentions, we can now do some text feature engineering using Tfidf method from Sklearn and reduce the dimension, number of features if possible.

In [6]:
X = df['twitts']
y = df['sentiment']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify = y)

In [7]:
X_train.shape,X_test.shape

((24000,), (6000,))

### 2.1 Transfrom the data to tf-idf format

In [8]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [9]:
X_train.shape,X_test.shape

((24000, 35917), (6000, 35917))

In [10]:
X_train = X_train.toarray()
X_test = X_test.toarray()

In [11]:
X_train.shape,X_test.shape

((24000, 35917), (6000, 35917))

### 2.2 Reduce the dimension of dataset using Truncated Singular Value Decompostion (TSVD)

In [12]:
from sklearn.decomposition import TruncatedSVD as TSVD

In [13]:
#we will try decrease number of features in X_train from 35917 to 500
tsvd = TSVD(n_components = 500)
X_train_svd = tsvd.fit_transform(X_train)

In [15]:
X_test_svd = tsvd.transform(X_test)

In [19]:
X_train_svd.shape,X_test_svd.shape

((24000, 500), (6000, 500))

# 3. Train and Test the model without tunings

In [18]:
%%time
clf = LinearSVC()

def run_svm(clf,X_train,X_test,y_train,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    
    print()
    print('Print classfication report')
    print(classification_report(y_test,y_pred))

CPU times: user 32 µs, sys: 2 µs, total: 34 µs
Wall time: 36 µs


In [17]:
run_svm(clf,X_train_svd,X_test_svd, y_train,y_test)


Print classfication report
              precision    recall  f1-score   support

           0       0.76      0.73      0.75      3000
           1       0.74      0.77      0.76      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



We get the accuracy around 75%

# 4. Hyperparameters tuning

In [23]:
train = X_train_svd.copy()
test = X_test_svd.copy()

### 4.1. Logistic Regression model

In [30]:
lr = LogisticRegression(
    solver = 'liblinear', n_jobs = -1
)

In [31]:
penalty = ['l1','l2']

C = np.logspace(0,4,10) #Select 10 C values from 0 to 4

max_iter = [100,500]

In [32]:
hyperparameters = dict(penalty = penalty, C = C, max_iter = max_iter)

In [33]:
lr_clf = GridSearchCV(lr, hyperparameters, cv = 5, verbose = 0,n_jobs = -1)

In [34]:
best = lr_clf.fit(train,y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


In [35]:
#Return optimal values of our hyperparameters
best.best_estimator_

LogisticRegression(C=2.7825594022071245, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
best.best_params_

{'C': 2.7825594022071245, 'max_iter': 100, 'penalty': 'l2'}

In [37]:
best.best_score_

0.744

In [39]:
y_pred = lr_clf.predict(test)

In [40]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75      3000
           1       0.75      0.76      0.76      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



## 4.2. Random Forest

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
hyperparameters = {
                       'bootstrap': [True],
                    'max_depth' : [10,100],
                    # 'max_features': [2,3,X_train_svd.shape[1]],
                    'min_samples_leaf': [2,5],
                   'min_samples_split': [1,5,10],
                   'n_estimators': [10,100,200]
                   }

In [61]:
scores = ['precision','recall']

def run_tuning(model,hyperparameters,scores):
  for score in scores:
    print('Tuning hyperparameters for %s' % score)
    print()

    clf = GridSearchCV(model, hyperparameters, scoring = '%s_macro' % score ,cv = 5, verbose = 0,n_jobs = -1)
    clf.fit(train,y_train)

    print('Best parameters set found: ')
    print()
    print(clf.best_params)
    print()
    
    print('Grid Score in process: ')
    print()
    means = clf.cv_results_['mean_test_score']

    for mean,params in zip(means, clf.cv_results_['params']):
      print('%0.3f for %r'%(mean,params))

    print()
    print('Detailed Classification report')
    y_pred = clf.predict(test)
    print(classification_report(y_test,y_pred))
    print()


In [None]:
rf = RandomForestClassifier(n_jobs= -1)

run_tuning(rf,hyperparameters,scores)