In [30]:
import numpy as np
import pandas as pd
import nltk
# !pip install nltk
# nltk.download("punkt")
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Abdullah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [2]:
df=pd.read_csv("C:/Users/Abdullah/Desktop/IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
df.shape

(50000, 2)

# Step1 
Text Preprocessing

In [6]:
# Tokenize the text in the 'review' column 
df['tokenized_review'] = df['review'].apply(word_tokenize)

In [7]:
df[['review','tokenized_review']].head()

Unnamed: 0,review,tokenized_review
0,One of the other reviewers has mentioned that ...,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, ``, Love, in, the, Time, ..."


# Removing stopwords

In [12]:
# define the stopwords
stop_words=set(stopwords.words('english'))

# Function to remove stopwords from tokenized text 
def remove_stopwords(tokenized_text):
    for word in tokenized_text:
        if word.lower() not in stop_words:
            return word
# Apply the function to the 'tokenized_review' column 
df['filtered_review'] = df['tokenized_review'].apply(remove_stopwords)

In [14]:
df[['review','tokenized_review','filtered_review']].head()

Unnamed: 0,review,tokenized_review,filtered_review
0,One of the other reviewers has mentioned that ...,"[One, of, the, other, reviewers, has, mentione...",One
1,A wonderful little production. <br /><br />The...,"[A, wonderful, little, production, ., <, br, /...",wonderful
2,I thought this was a wonderful way to spend ti...,"[I, thought, this, was, a, wonderful, way, to,...",thought
3,Basically there's a family where a little boy ...,"[Basically, there, 's, a, family, where, a, li...",Basically
4,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, ``, Love, in, the, Time, ...",Petter


In [18]:
lemmatizer = WordNetLemmatizer()

In [31]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [32]:
def lemmatize_text(tokenized_text):
     for word in tokenized_text:
         return lemmatizer.lemmatize(word, get_wordnet_pos(word))
           
df['lemmatized_review'] = df['tokenized_review'].apply(lemmatize_text)

In [34]:
df[['review','tokenized_review','filtered_review','lemmatized_review']].head()

Unnamed: 0,review,tokenized_review,filtered_review,lemmatized_review
0,One of the other reviewers has mentioned that ...,"[One, of, the, other, reviewers, has, mentione...",One,One
1,A wonderful little production. <br /><br />The...,"[A, wonderful, little, production, ., <, br, /...",wonderful,A
2,I thought this was a wonderful way to spend ti...,"[I, thought, this, was, a, wonderful, way, to,...",thought,I
3,Basically there's a family where a little boy ...,"[Basically, there, 's, a, family, where, a, li...",Basically,Basically
4,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, ``, Love, in, the, Time, ...",Petter,Petter


# Feature Engineering
Convert text data into numerical format using TF-IDF or word embeddings.

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer with a limited vocabulary size
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # Adjust the number of features as needed

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])

# Convert to DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the first few rows of the DataFrame
print(tfidf_df.head())


    00  000  007       10  100  1000  101   11   12   13  ...  zhang  zizek  \
0  0.0  0.0  0.0  0.00000  0.0   0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0   
1  0.0  0.0  0.0  0.00000  0.0   0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0   
2  0.0  0.0  0.0  0.00000  0.0   0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0   
3  0.0  0.0  0.0  0.05692  0.0   0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0   
4  0.0  0.0  0.0  0.00000  0.0   0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0   

   zoey  zombi    zombie  zombies  zone  zoom  zorro  zucco  
0   0.0    0.0  0.000000      0.0   0.0   0.0    0.0    0.0  
1   0.0    0.0  0.000000      0.0   0.0   0.0    0.0    0.0  
2   0.0    0.0  0.000000      0.0   0.0   0.0    0.0    0.0  
3   0.0    0.0  0.208539      0.0   0.0   0.0    0.0    0.0  
4   0.0    0.0  0.000000      0.0   0.0   0.0    0.0    0.0  

[5 rows x 10000 columns]


In [40]:
df.shape

(50000, 5)

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Initialize TF-IDF Vectorizer

# Fit and transform the text data
X = tfidf_matrix

# Extract the target variable
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

#  Split Data into Training and Testing Sets

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train the Logistic Regression Classifier

In [43]:
# Initialize the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Train the model
logreg.fit(X_train, y_train)


# Evaluate the Model

In [44]:
# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

