In [28]:
import nltk
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score


plt.style.use('ggplot')

In [None]:
df = pd.read_csv('../data/movie.csv')
df.head()
df.shape

(40000, 2)

In [None]:
X = df['text']
y = df['label']
seed = 99

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# pipeline to run the TF-IDF process and the Logistic regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=30000,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )),
    ("logistic-regression", LogisticRegression(
        max_iter=2000,
        C=2,
        solver='liblinear',
        random_state=seed
    ))
])

# fitting the model with the training data and getting the predictions
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# doing cross validation with StratifiedKFold with 10 splits and shuffled data
cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=seed
)

# calculating the cross validation scores
scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

print("Accuracy per fold:", scores)
print("Mean accuracy:", scores.mean())


Accuracy: 0.907
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      4058
           1       0.90      0.92      0.91      3942

    accuracy                           0.91      8000
   macro avg       0.91      0.91      0.91      8000
weighted avg       0.91      0.91      0.91      8000

Accuracy per fold: [0.91425 0.90675 0.90725 0.91075 0.9105  0.90775 0.91875 0.9055  0.908
 0.90925]
Mean accuracy: 0.9098750000000001
