In [1]:
# imports 

from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import seaborn as sns 
import pickle 
import string 
import os
import re

In [2]:
data = pd.read_csv('../data/data_preprocessed.csv')

In [3]:
data

Unnamed: 0,text,label,preprocessed_text
0,He glorified himself as a great supporting act...,1,he glorified himself as a great supporting act...
1,"I was watching this with one of my friends, wh...",0,i was watching this with one of my friends wh...
2,One of several musicals about sailors on leave...,1,one of several musicals about sailors on leave...
3,Parker and Stone transplant their pacy expleti...,1,parker and stone transplant their pacy expleti...
4,I only gave this ridiculously titled comedy ho...,0,i only gave this ridiculously titled comedy ho...
...,...,...,...
995,This is definitely one of the best kung fu mov...,1,this is definitely one of the best kung fu mov...
996,This ABC straight-to-TV failure does absolutel...,0,this abc straight to tv failure does absolutel...
997,I remember the first time I saw this movie -- ...,1,i remember the first time i saw this movie ...
998,I agree that Capital City should be on DVD. I ...,1,i agree that capital city should be on dvd i ...


In [4]:
# Tokenization 

tfidf = TfidfVectorizer(stop_words='english') # Removing stop words 

token_matrix = tfidf.fit_transform(data['preprocessed_text'])

tokenized_data = pd.DataFrame(token_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [5]:
tokenized_data

Unnamed: 0,00,000,00am,00s,01,06,07,10,100,1000,...,zoom,zooms,zorro,zp,zschering,zucco,zuni,zzzzzzzz,ääliöt,ísnt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062691,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.115514,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


I will be using a Logistic Regression model as the baseline for comparison. 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(tokenized_data, data['label'], test_size=0.2, random_state=13)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85       111
           1       0.82      0.81      0.81        89

    accuracy                           0.83       200
   macro avg       0.83      0.83      0.83       200
weighted avg       0.83      0.83      0.83       200



In [None]:
# save model 

current_dir = os.getcwd()
main_dir = os.path.abspath(os.path.join(current_dir, '..'))
models_dir = os.path.abspath(os.path.join(main_dir, 'models')) # Creating a new directory for saving models 
os.makedirs(models_dir, exist_ok=True)

In [27]:
logreg_path = os.path.join(models_dir, 'logreg.pkl')

with open(logreg_path, 'wb') as f:
    pickle.dump(model, f)