In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment variable to
# point the notebook at the file on another machine; when it is unset, the original
# local path is used so existing runs keep working unchanged.
DATA_PATH = os.environ.get(
    'IMDB_DATASET_PATH',
    r'C:\Users\jorda\Documents\studies\DScourse\CourseMaterials\Data\IMDB Dataset\IMDB Dataset.csv',
)

# Load the reviews; later cells read the 'review' and 'sentiment' columns.
data = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the file loaded as expected.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    random_state=42,
    test_size=0.2,
)

In [4]:
# Token regex: a token must begin with an ASCII letter and may continue with any
# word characters, so tokens that start with a digit or underscore are ignored.
token_pattern = r'(?u)\b[a-zA-Z]+\w*\b'

# Bag-of-words vectorizer over unigrams and bigrams. English stop words are removed,
# terms must occur in at least 5 documents, and the vocabulary is capped at 1000 terms.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=token_pattern,
    stop_words='english',
    min_df=5,
    max_features=1000,
)

# Learn the vocabulary from the training reviews only and encode them as counts.
X_train_bow = vectorizer.fit_transform(X_train)

# Encode the test reviews with the vocabulary learned above (no refitting).
X_test_bow = vectorizer.transform(X_test)

In [5]:
# Build a 100-tree random forest (seeded, using all CPU cores) and train it on the
# bag-of-words training matrix. `fit` returns the estimator itself, so `model` is
# the fitted classifier.
model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train_bow, y_train)

# --- Evaluation on the held-out reviews ---

# Predicted sentiment label for every test review
y_pred = model.predict(X_test_bow)

# Overall fraction of correctly classified test reviews
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall, F1 and support
print(classification_report(y_test, y_pred))

Accuracy: 0.8327
              precision    recall  f1-score   support

    negative       0.83      0.83      0.83      4961
    positive       0.84      0.83      0.83      5039

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



In [7]:
# Densify the sparse training matrix into a DataFrame with one column per
# vocabulary term (column labels come from the fitted vectorizer).
X_train_bow_vec = pd.DataFrame(
    data=X_train_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Keep only the terms whose total count over all training reviews is greater than 5.
X_train_vec_filtered = X_train_bow_vec.loc[:, X_train_bow_vec.sum(axis=0) > 5]

# Display the filtered term-count table.
X_train_vec_filtered

Unnamed: 0,able,absolutely,act,acted,acting,action,actor,actors,actress,actual,...,yeah,year,year old,years,years ago,yes,york,young,younger,zombie
0,0,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
39997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39998,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
