In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("C:/Users/ravis/Downloads/train.csv/train.csv")
df.shape

(162758, 5)

In [4]:
df.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment'],
      dtype='object')

In [5]:
data = df[['reviewText','sentiment']]
data.head()

Unnamed: 0,reviewText,sentiment
0,Henry Selick’s first movie since 2009’s Corali...,POSITIVE
1,With a cast that reads like the Vogue Oscar pa...,NEGATIVE
2,Creed II does not give us anything but another...,POSITIVE
3,"I know what you're thinking, but this is no Li...",POSITIVE
4,Director Fernando Meirelles tells the story wi...,POSITIVE


In [6]:
data.dropna(inplace =  True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156311 entries, 0 to 162757
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   reviewText  156311 non-null  object
 1   sentiment   156311 non-null  object
dtypes: object(2)
memory usage: 3.6+ MB


In [7]:
X = data['reviewText']
y = data['sentiment']

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

array([1, 0, 1, ..., 1, 0, 1])

In [9]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

for N in range(1,11):

    cv = CountVectorizer(analyzer = 'word',ngram_range=(1,N), stop_words='english')
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)

    clf = SGDClassifier(random_state = 42)
    clf.fit(X_train_cv, y_train)
    y_pred = clf.predict(X_test_cv)
    
    score = np.round(f1_score(y_test, y_pred, average='micro'),4)
    print('F-1 score of model with n-gram range of {}: {}'.format((1,N), score))

F-1 score of model with n-gram range of (1, 1): 0.795
F-1 score of model with n-gram range of (1, 2): 0.8001
F-1 score of model with n-gram range of (1, 3): 0.8011
F-1 score of model with n-gram range of (1, 4): 0.8013
F-1 score of model with n-gram range of (1, 5): 0.8
F-1 score of model with n-gram range of (1, 6): 0.7988
F-1 score of model with n-gram range of (1, 7): 0.7981
F-1 score of model with n-gram range of (1, 8): 0.798
F-1 score of model with n-gram range of (1, 9): 0.7967
F-1 score of model with n-gram range of (1, 10): 0.7973


In [22]:
cvec = CountVectorizer(analyzer = 'word',ngram_range=(1,3), stop_words='english')
tfidf = TfidfTransformer()
X_train_trans = cvec.fit_transform(X_train)
X_test_trans = cvec.fit_transform(X_test)
X_test_trans = tfidf.fit_transform(X_train_trans)
X_test_trans = tfidf.fit_transform(X_test_trans)

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
sgdclf = SGDClassifier(random_state = 42)
params = dict(loss = ['hinge' , 'log' , 'modified_huber' , 'squared_hinge' , 'perceptron'] , 
              learning_rate = ['optimal' , 'adaptive' , 'invscaling'] , 
              eta0 = uniform(loc = 1e-7 , scale = 1e-2) , 
              penalty = ['l1' , 'l2' , 'elastic_net'] , 
              alpha = uniform(loc = 1e-6 , scale = 1e-4))
ransearchcv = RandomizedSearchCV(estimator = sgdclf , 
                                param_distributions = params,
                                cv = 5,
                                n_iter = 100)
ransearchcv.fit(X_train_trans , y_train)
#y_pred = ransearchcv.predict(X_test_trans)
#f1_score(y_test , y_pred)

RandomizedSearchCV(cv=5, estimator=SGDClassifier(random_state=42), n_iter=100,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001C61B27F670>,
                                        'eta0': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001C630585CD0>,
                                        'learning_rate': ['optimal', 'adaptive',
                                                          'invscaling'],
                                        'loss': ['hinge', 'log',
                                                 'modified_huber',
                                                 'squared_hinge',
                                                 'perceptron'],
                                        'penalty': ['l1', 'l2', 'elastic_net']})

In [13]:
from pprint import pprint

print("The best parameters are:")
pprint(ransearchcv.best_params_)

The best parameters are:
{'alpha': 7.704366940821582e-05,
 'eta0': 0.0005136658609638782,
 'learning_rate': 'adaptive',
 'loss': 'modified_huber',
 'penalty': 'l2'}


In [18]:
accuracy = ransearchcv.score(X_train_trans, y_train)

print(f"The test accuracy score of the best model is {accuracy:.2f}")

The test accuracy score of the best model is 0.99
