In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, re
df = pd.read_csv('train.tsv',sep='\t')

In [6]:
df.drop_duplicates(subset=["SentenceId"], keep="first")
# df.drop_duplicates(subset=["PhraseId"], keep="first")
print(f"shape : {df.shape}")

df = df.groupby('Sentiment',as_index = False,group_keys=False).apply(lambda s: s.sample(100,replace=True))
print(f"shape : {df.shape}")
df_copy = df.copy()


shape : (156060, 4)
shape : (500, 4)


In [7]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
91966,91967,4786,"We ca n't accuse Kung Pow for misfiring , sinc...",0
7999,8000,328,a better celebration of these unfairly dismiss...,0
154929,154930,8476,"interesting than the screenplay , which lags b...",0
112932,112933,5998,"a movie so sloppy , so uneven , so damn unplea...",0
100748,100749,5292,reach for a barf bag,0


In [8]:
txt = "hell0      dear me"
txt = re.sub(' +', ' ', txt)
txt

'hell0 dear me'

In [9]:
import re

def text_preprocessing(txt):
    txt = txt.lower().strip()
    txt = re.sub(r'\d+', ' ', txt) # remove numbers
    txt = re.sub(r'[^\w\s]',' ',txt) # remove punctuations
    txt = re.sub(' +', ' ', txt) # remove extra spaces
    return txt

In [10]:
df["Phrase"] = df["Phrase"].apply(lambda x : text_preprocessing(x))

In [11]:
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
91966,91967,4786,we ca n t accuse kung pow for misfiring since ...,0
7999,8000,328,a better celebration of these unfairly dismiss...,0
154929,154930,8476,interesting than the screenplay which lags bad...,0
112932,112933,5998,a movie so sloppy so uneven so damn unpleasant,0
100748,100749,5292,reach for a barf bag,0
...,...,...,...,...
139230,139231,7546,the lines work,4
131143,131144,7068,s a wacky and inspired little film that works...,4
147355,147356,8018,a fascinating documentary about the long and e...,4
110552,110553,5859,near masterpiece,4


In [12]:
from sklearn.model_selection import train_test_split

feature_var = "Phrase"
target_var = "Sentiment"

X = df[feature_var]
y = df[target_var]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify = y)

In [13]:
X_train.shape

(350,)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import gzip

tfidf = TfidfVectorizer(sublinear_tf=True,
                        ngram_range=(1, 2),
                        stop_words='english',
                        max_features= 1024)
tfidf.fit(X_train)

joblib.dump(tfidf, 'tfidf.pkl')

# We transform each complaint into a vector
features_train = tfidf.transform(X_train)
# features = features_1.toarray()
print("Each of the %d complaints is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features_train.shape))

Each of the 350 complaints is represented by 1024 features (TF-IDF score of unigrams and bigrams)


In [15]:
features_train.shape

(350, 1024)

In [16]:
feature_names = tfidf.get_feature_names_out()
feature_names

array(['abandon', 'acting', 'action', ..., 'young turks', 'yuppie',
       'yuppie plot'], dtype=object)

In [17]:
from sklearn.naive_bayes import MultinomialNB

clf_MultinomialNB = MultinomialNB()
clf_MultinomialNB.fit(features_train, y_train)

In [18]:
features_test = tfidf.transform(X_test)
y_pred = clf_MultinomialNB.predict(features_test)

In [19]:
from sklearn import metrics

clf_MultinomialNB_accuracy = metrics.accuracy_score(y_test, y_pred)
clf_MultinomialNB_accuracy

0.26666666666666666

In [40]:
metrics.accuracy_score(y_train, clf_MultinomialNB.predict(features_train))

0.8685714285714285

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

grid_space={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200],
              'max_features':[ 512, 1024]
           }

In [21]:
from re import VERBOSE
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(rf,param_grid=grid_space,cv=3,scoring='accuracy')
model_grid = grid.fit(features_train,y_train)

In [22]:
print('Best hyperparameters are: '+str(model_grid.best_params_))
print('Best score is: '+str(model_grid.best_score_))

Best hyperparameters are: {'max_depth': None, 'max_features': 1024, 'n_estimators': 10}
Best score is: 0.3028293545534925


In [23]:
rf = RandomForestClassifier()
rf.fit(features_train,y_train)

In [24]:
y_pred = rf.predict(features_test)
metrics.accuracy_score(y_test, y_pred)

0.28

In [25]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(features_train, y_train)

In [26]:
y_pred = clf_lr.predict(features_test)
metrics.accuracy_score(y_test, y_pred)

0.3333333333333333

In [27]:
from sklearn.svm import LinearSVC

clf_svm = LinearSVC()
clf_svm.fit(features_train, y_train)

In [28]:
y_pred = clf_svm.predict(features_test)
metrics.accuracy_score(y_test, y_pred)

0.32666666666666666

In [29]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.33      0.38        30
           1       0.25      0.13      0.17        30
           2       0.27      0.63      0.38        30
           3       0.20      0.10      0.13        30
           4       0.48      0.43      0.46        30

    accuracy                           0.33       150
   macro avg       0.33      0.33      0.31       150
weighted avg       0.33      0.33      0.31       150



In [30]:
y_pred

array([1, 4, 0, 2, 2, 1, 0, 2, 3, 3, 2, 2, 2, 1, 4, 2, 1, 2, 0, 2, 2, 2,
       2, 2, 0, 2, 2, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 1, 2, 4, 2, 2, 4, 2,
       4, 2, 4, 1, 2, 1, 2, 3, 4, 1, 2, 3, 2, 2, 4, 2, 3, 0, 4, 2, 4, 2,
       4, 2, 2, 0, 4, 4, 2, 1, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 1, 2, 0,
       2, 1, 2, 1, 4, 0, 0, 0, 2, 2, 0, 2, 4, 2, 2, 2, 0, 3, 1, 2, 2, 3,
       1, 3, 2, 2, 2, 2, 4, 3, 2, 4, 3, 2, 0, 3, 0, 2, 4, 2, 0, 2, 1, 4,
       2, 0, 0, 3, 2, 0, 0, 3, 3, 4, 4, 1, 2, 3, 2, 0, 0, 0])

In [36]:
from sklearn import svm
clf_svm = svm.SVC(decision_function_shape='ovo', class_weight= "balanced")

In [37]:
clf_svm.fit(features_train, y_train)

In [38]:
y_pred = clf_svm.predict(features_test)
metrics.accuracy_score(y_test, y_pred)

0.36666666666666664

In [39]:
metrics.accuracy_score(y_train, clf_svm.predict(features_train))

0.9114285714285715

In [44]:
#  defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, "scale"],
              'kernel': ['rbf']}

grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(features_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.300 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.286 total time=   0.1s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.243 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.186 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.329 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.243 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.243 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.286 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.257 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.257 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.243 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [45]:
y_pred = grid.predict(features_test)
metrics.accuracy_score(y_test, y_pred)

0.3466666666666667

In [46]:
grid.best_params_

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}