In [2]:
# Imports

from typing import Text, Iterable

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


In [3]:
# Location of the labelled training data, relative to the notebook.
train_path = 'data/train.csv'

# Read the whole training set into a DataFrame.
df = pd.read_csv(filepath_or_buffer=train_path)

In [4]:
# Preview the first rows to check the column layout (ID, TEXT, LABEL).
df.head()

Unnamed: 0,ID,TEXT,LABEL
0,7766,Wonderful summary of Sai's words on nature. It...,0
1,4631,"I'm not going to lie, I have a tough time liki...",0
2,67707,This is an excellent book. It has allowed me ...,0
3,93531,Reading the series one book at a time; fell in...,0
4,58527,Oh my gosh! So amazing! I love this book so mu...,0


In [5]:
# --- Dataset size -----------------------------------------------------------
print("Total rows:", df.shape[0],'\n')

# --- Missing values per column ----------------------------------------------
print("Empty values in column ID:",df['ID'].isna().sum())
print("Empty values in column TEXT:",df['TEXT'].isna().sum())
print("Empty values in column LABEL:",df['LABEL'].isna().sum(),'\n')

# --- Review texts grouped by label ------------------------------------------
# Label encoding used throughout this notebook: 0 = irrelevant, 1 = positive,
# 2 = negative. A single .loc call selects the rows and the TEXT column at once.
irrelevant_slice = df.loc[df['LABEL'] == 0, 'TEXT']
pos_slice = df.loc[df['LABEL'] == 1, 'TEXT']
neg_slice = df.loc[df['LABEL'] == 2, 'TEXT']

print("Number of Irrelevant Reviews", len(irrelevant_slice))
print("Number of Positive Reviews", len(pos_slice))
print("Number of Negative Reviews", len(neg_slice),'\n')

# --- Average review length (characters) per label ---------------------------
# Missing texts are dropped before summing, but the divisor is the full group
# size, so a row with missing TEXT effectively counts as a zero-length review.
print("Avg Length of Irrelevant Reviews", sum(len(text) for text in irrelevant_slice.dropna()) / len(irrelevant_slice))
print("Avg Length of Positive Reviews", sum(len(text) for text in pos_slice.dropna()) / len(pos_slice))
# Fixed: this line was previously labelled "Positive" although it reports the
# negative-review average.
print("Avg Length of Negative Reviews", sum(len(text) for text in neg_slice.dropna()) / len(neg_slice))

Total rows: 70187 

Empty values in column ID: 0
Empty values in column TEXT: 7
Empty values in column LABEL: 0 

Number of Irrelevant Reviews 35000
Number of Positive Reviews 17645
Number of Negative Reviews 17542 

Avg Length of Irrelevant Reviews 426.21105714285716
Avg Length of Positive Reviews 1332.3479739302918
Avg Length of Positive Reviews 1298.990822027135


In [6]:
# Replace missing review texts with a placeholder token so the vectorizer gets
# a string for every row. The fill is restricted to TEXT: a blanket
# df.fillna("EMPTY") would also write the string into ID / LABEL if either ever
# contained a missing value, silently corrupting those columns.
df = df.fillna({"TEXT": "EMPTY"})

In [7]:
# Hold out 15% of the rows as a development set and train on the remaining 85%.
# The fixed random_state keeps the split identical between runs.
train_df, dev_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
)

In [19]:
# Unigram TF-IDF features with sublinear term-frequency scaling.
featurizer = TfidfVectorizer(
    ngram_range=(1, 1),
    use_idf=True,
    sublinear_tf=True,
)
# Fit on the training split only, so the dev rows play no part in fitting.
featurizer.fit(train_df['TEXT'])

TfidfVectorizer(sublinear_tf=True)

In [24]:
# Number of entries in the fitted vectorizer's vocabulary.
len(featurizer.vocabulary_)

102381

In [25]:
class Classifier:
    """Three text classifiers that are trained and queried together.

    Holds a logistic regression, a multinomial naive Bayes and a linear SVM so
    that one feature matrix can be fitted and scored by all three in a single
    call.
    """

    def __init__(self, random_state: int = 42) -> None:
        """Create the three unfitted estimators.

        Args:
            random_state: Seed forwarded to ``LinearSVC`` so that repeated runs
                of the notebook fit the same SVM. Pass ``None`` to get the
                estimator's unseeded behaviour.
        """
        self.lr_model = LogisticRegression(max_iter=500)
        self.nb_model = MultinomialNB()
        # Seeded explicitly so the SVM (and anything derived from its
        # predictions) is reproducible between runs.
        self.svm_model = LinearSVC(max_iter=2500, random_state=random_state)

    def fit(self, features, labels):
        """Fit all three models on the same features and labels.

        Args:
            features: Feature matrix accepted by the scikit-learn estimators.
            labels: Target labels, one per row of ``features``.
        """
        self.lr_model.fit(features,labels)
        self.nb_model.fit(features,labels)
        self.svm_model.fit(features,labels)

    def predict(self, features):
        """Predict labels for ``features`` with every model.

        Returns:
            A 3-tuple of prediction arrays in the fixed order
            (logistic regression, naive Bayes, linear SVM).
        """
        return self.lr_model.predict(features),self.nb_model.predict(features),self.svm_model.predict(features)

In [26]:
# Vectorize the training texts with the already-fitted TF-IDF featurizer.
features = featurizer.transform(train_df['TEXT'])

In [27]:
# (number of training rows, vocabulary size)
features.shape

(59658, 102381)

In [28]:
# Show the matrix repr (dtype, storage format, number of stored elements).
features

<59658x102381 sparse matrix of type '<class 'numpy.float64'>'
	with 5546185 stored elements in Compressed Sparse Row format>

In [29]:
# Train all three models on the training feature matrix.
clf = Classifier()
clf.fit(features, train_df['LABEL'])

# Score the development split; predict() returns one array per model, in the
# order (logistic regression, naive Bayes, linear SVM).
lr_preds, nb_preds, svm_preds = clf.predict(
    featurizer.transform(dev_df['TEXT'])
)



In [30]:
def metric_printout(modelname:str, labels, preds):
    """Print a headed per-class precision/recall/F1 report for one model.

    Args:
        modelname: Name shown in the header line above the report.
        labels: True labels.
        preds: Predicted labels, aligned with ``labels``.
    """
    header_rule = ": -------------------"
    class_names = ['Not Movie','Positive','Negative']

    print(modelname, header_rule)

    report = classification_report(labels, preds, target_names=class_names)
    print(report)

In [31]:
# Report dev-set metrics for each model. The first header previously said
# "Linear Regression", but lr_preds come from a LogisticRegression model; the
# second header had a typo ("Multinomal").
metric_printout("Logistic Regression", dev_df['LABEL'],lr_preds)
metric_printout("Multinomial NB", dev_df['LABEL'],nb_preds)
metric_printout("Linear Support Vector Machine", dev_df['LABEL'],svm_preds)

Linear Regression : -------------------
              precision    recall  f1-score   support

   Not Movie       0.98      0.99      0.98      5247
    Positive       0.88      0.87      0.88      2710
    Negative       0.89      0.88      0.89      2572

    accuracy                           0.93     10529
   macro avg       0.92      0.91      0.92     10529
weighted avg       0.93      0.93      0.93     10529

Multinomal NB : -------------------
              precision    recall  f1-score   support

   Not Movie       0.96      0.98      0.97      5247
    Positive       0.86      0.78      0.82      2710
    Negative       0.83      0.88      0.85      2572

    accuracy                           0.90     10529
   macro avg       0.88      0.88      0.88     10529
weighted avg       0.90      0.90      0.90     10529

Linear Support Vector Machine : -------------------
              precision    recall  f1-score   support

   Not Movie       0.98      0.99      0.99      5247
 

In [22]:
test_path = 'data/test.csv'

test_df = pd.read_csv(test_path)
# Use the same "EMPTY" placeholder for missing review text as the training
# data, but only in the TEXT column: a blanket fillna would also overwrite a
# missing ID with the string and leak it into the submission file.
test_df = test_df.fillna({"TEXT": "EMPTY"})

In [23]:
# clf.predict returns (logistic regression, naive Bayes, linear SVM); keep the
# SVM predictions for the submission. Indexing is used instead of unpacking
# into `_`, because assigning to `_` rebinds IPython's "last output" variable.
predictions = clf.predict(featurizer.transform(test_df['TEXT']))[2]

In [24]:
# Quick look at the predicted labels for the test set.
predictions

array([0, 0, 0, ..., 2, 2, 2])

In [30]:
# Two-column submission frame: the test-set IDs next to their predicted labels.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Predicted': predictions,
})

submission

Unnamed: 0,ID,Predicted
0,18742,0
1,14108,0
2,52871,0
3,39785,0
4,46174,0
...,...,...
30073,86744,2
30074,27493,2
30075,72221,2
30076,16355,2


In [31]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv(path_or_buf="submission.csv", index=False)