In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd  # reads the data into a pandas dataframe

In [6]:
imdb_data = pd.read_csv('IMDB Dataset.csv')

In [7]:
imdb_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [8]:
imdb_data.loc[0].review

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [9]:
imdb_data['label'] = imdb_data.sentiment.map({'positive': 1, 'negative': 0})
# mapping each label to a number 

In [10]:
imdb_data

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [11]:
from sklearn.model_selection import train_test_split
# randomly selecting data to run

X_train, X_test, y_train, y_test = train_test_split(
    imdb_data.review,
    imdb_data.label,
    test_size=0.2,
    random_state=2000,
    stratify=imdb_data.label
)

In [12]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (40000,)
Shape of X_test:  (10000,)


In [13]:
y_train.value_counts()

label
0    20000
1    20000
Name: count, dtype: int64

In [14]:
y_test.value_counts()

label
1    5000
0    5000
Name: count, dtype: int64

In [15]:
from sklearn.neighbors import KNeighborsClassifier # use KNN to analyze the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.79      0.74      0.76      5000
           1       0.76      0.80      0.78      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



In [16]:
X_test[:5]

43346    CB4 was awful, but it may have given Cundieff ...
39610    definitely the best game for N64 ever. I most ...
24427    When will the hurting stop? I never want to se...
36693    Musical bios are all cut of the same cloth. Ho...
32586    What is it now-a-days that minority comedians ...
Name: review, dtype: object

In [17]:
y_test[:5]

43346    1
39610    1
24427    0
36693    1
32586    0
Name: label, dtype: int64

In [18]:
y_pred[:5]

array([1, 1, 1, 1, 0])

In [19]:
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      5000
           1       0.88      0.84      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [20]:
from sklearn.ensemble import RandomForestClassifier # use Random Forest to analyze the data

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.85      0.85      5000
           1       0.85      0.85      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [21]:
X_test[:5]

43346    CB4 was awful, but it may have given Cundieff ...
39610    definitely the best game for N64 ever. I most ...
24427    When will the hurting stop? I never want to se...
36693    Musical bios are all cut of the same cloth. Ho...
32586    What is it now-a-days that minority comedians ...
Name: review, dtype: object

In [22]:
y_test[:5]

43346    1
39610    1
24427    0
36693    1
32586    0
Name: label, dtype: int64

In [23]:
y_pred[:5]

array([0, 1, 0, 1, 0])

In [24]:
from sklearn.linear_model import LogisticRegression # use Logistic Regression to analyze the data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create the pipeline with TF-IDF vectorizer and Logistic Regression
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression(max_iter=1000))  # max_iter helps avoid convergence warnings
])

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [25]:
X_test[:5]

43346    CB4 was awful, but it may have given Cundieff ...
39610    definitely the best game for N64 ever. I most ...
24427    When will the hurting stop? I never want to se...
36693    Musical bios are all cut of the same cloth. Ho...
32586    What is it now-a-days that minority comedians ...
Name: review, dtype: object

In [26]:
y_test[:5]

43346    1
39610    1
24427    0
36693    1
32586    0
Name: label, dtype: int64

In [27]:
y_pred[:5]

array([1, 1, 0, 1, 0])

In [28]:
import spacy
import pydantic
print(f"spaCy version: {spacy.__version__}")
print(f"pydantic version: {pydantic.VERSION}")

spaCy version: 3.8.3
pydantic version: 2.10.6


In [29]:
nlp = spacy.load("en_core_web_sm")

In [30]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)        

In [31]:
imdb_data['preprocessed_txt'] = imdb_data['review'].apply(preprocess)

In [32]:
imdb_data.head()

Unnamed: 0,review,sentiment,label,preprocessed_txt
0,One of the other reviewers has mentioned that ...,positive,1,reviewer mention watch 1 Oz episode hook right...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production < br /><br />the f...
2,I thought this was a wonderful way to spend ti...,positive,1,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,0,basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,Petter Mattei love Time money visually stunnin...


In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    imdb_data.preprocessed_txt,
    imdb_data.label,
    test_size=0.2,
    random_state=2000,
    stratify=imdb_data.label
)

In [34]:
imdb_data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [35]:
imdb_data.preprocessed_txt[0]

'reviewer mention watch 1 Oz episode hook right exactly happen me.<br /><br />the thing strike Oz brutality unflinche scene violence set right word trust faint hearted timid pull punch regard drug sex violence hardcore classic use word.<br /><br />it call oz nickname give Oswald Maximum Security State Penitentary focus mainly Emerald City experimental section prison cell glass front face inward privacy high agenda Em City home Aryans Muslims gangsta Latinos Christians Italians irish scuffle death stare dodgy dealing shady agreement far away.<br /><br />I main appeal fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess episode see strike nasty surreal ready watch develop taste Oz get accustom high level graphic violence violence injustice crooked guard sell nickel inmate kill order away mannered middle class inmate turn prison bitch lack street skill prison experience watch Oz comfortable uncomfortable view that touch dark'

In [36]:
from sklearn.neighbors import KNeighborsClassifier # use KNN to analyze the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.74      0.77      5000
           1       0.76      0.81      0.78      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



In [37]:
from sklearn.ensemble import RandomForestClassifier # use Random Forest to analyze the data

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      5000
           1       0.85      0.85      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [38]:
from sklearn.linear_model import LogisticRegression # use Logistic Regression to analyze the data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create the pipeline with TF-IDF vectorizer and Logistic Regression
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression(max_iter=1000))  # max_iter helps avoid convergence warnings
])

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

