In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df = df.replace({"positive": 1, "negative": 0})

  df = df.replace({"positive": 1, "negative": 0})


In [4]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Lets get started with some natural langauge processing!

In [5]:
def metrics(actual, predicted):
    # lengths must be the same
    if len(actual) != len(predicted):
        print("error lengths of actual and predicted are not the same.")
        return
    
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for a, p in zip(actual, predicted):
        if a == 1 and p == 1:
            true_positive += 1
        elif a == 0 and p == 0:
            true_negative += 1
        elif a == 0 and p == 1:
            false_positive += 1
        elif a == 1 and p == 0:
            false_negative += 1

    try:
        accuracy = (true_positive+true_negative)/len(predicted)
        precision = true_positive/(true_positive+false_positive)
        recall = true_positive/(true_positive+false_negative)
        f1 = 2*((precision*recall)/(precision+recall))
    except:
        raise ZeroDivisionError("Division by zero")

    print(f"accuracy:{accuracy:.4f}, Precision:{precision:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}")




## Step one Linear Regression and Bag of Words!


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load data
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2)

# Convert text to BoW features
vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train classifier
model = LogisticRegression()
model.fit(X_train_bow, y_train)

y_pred = model.predict(X_test_bow)

metrics(y_test.tolist(), y_pred.tolist())

accuracy:0.8894, Precision:0.8860, Recall:0.8938, F1:0.8898


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Linear Regression might be the simplest algorithm out there, but isn't it fantastic?

These values will be our baseline model! 

accuracy:0.8848, Precision:0.8804, Recall:0.8892, F1:0.8848

In [7]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

# Bayes expects "dense array"
X_train_dense = X_train_bow.toarray()
X_test_dense = X_test_bow.toarray()

model.fit(X_train_dense, y_train)

y_pred = model.predict(X_test_dense)

metrics(y_test.tolist(), y_pred.tolist())

accuracy:0.7449, Precision:0.8274, Recall:0.6186, F1:0.7080


Maybe not super surprising Bayes might not be kitted for NLP!

One thing we have forgotten about is the data is binary! of course bayes doesn't work!

We could use an SVM, however because a SVM scales with $n_{features}*n^2_{samples}$ the computation is way to slow for my machine :(


## Next step Tfidvectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

metrics(y_test.tolist(), y_pred.tolist())

accuracy:0.9030, Precision:0.8934, Recall:0.9152, F1:0.9041


A new best model! 

How exiating our new best model is:

accuracy:0.8966, Precision:0.8827, Recall:0.9129, F1:0.8975

Let's try something intresting a PCA, maybe?

In [21]:
from sklearn.decomposition import TruncatedSVD


svd = TruncatedSVD(n_components=10, random_state=42)
X_train_reduced = svd.fit_transform(X_train_vec)
X_test_reduced = svd.fit_transform(X_test_vec)

model.fit(X_train_reduced, y_train)

y_pred = model.predict(X_test_reduced)


metrics(y_test.tolist(), y_pred.tolist())

accuracy:0.6397, Precision:0.6434, Recall:0.6261, F1:0.6346


In [16]:
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=2, batch_size=500)

X_train_reduced = ipca.fit_transform(X_train_vec.toarray())
X_test_reduced = ipca.transform(X_test_vec.toarray())

model.fit(X_train_reduced, y_train)

y_pred = model.predict(X_test_reduced)


metrics(y_test.tolist(), y_pred.tolist())

MemoryError: Unable to allocate 27.7 GiB for an array with shape (40000, 92886) and data type float64