# **Reading the input file on kaggle**

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        filepath = os.path.join(dirname, filename)

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
imdb_reviews_orig = pd.read_csv(filepath)
imdb_reviews_orig.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing the text

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

In [7]:
#preprocess the reviews
imdb_reviews_orig['cleaned_review'] = imdb_reviews_orig['review'].apply(preprocess_text)
imdb_reviews_orig.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [8]:
#convert the categorical label to numerical
imdb_reviews_orig['label'] = imdb_reviews_orig['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
imdb_reviews_orig.head()

Unnamed: 0,review,sentiment,cleaned_review,label
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...,1


# Splitting into train and test

In [9]:
X = imdb_reviews_orig['cleaned_review']
y = imdb_reviews_orig['label'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

print(f'Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}')
print(f'Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}')

Training data shape: (25000,), Training labels shape: (25000,)
Testing data shape: (25000,), Testing labels shape: (25000,)


# Feature extraction using TFIDF

In [10]:
vectorizer = TfidfVectorizer() 
#fitting the TF-IDF vectorizer on the entire dataset
fitted_vectorizer = vectorizer.fit(imdb_reviews_orig['cleaned_review'])

In [11]:
# Get the feature namesfrom the TF-IDF vectorizer
feature_names = fitted_vectorizer.get_feature_names_out()
print(feature_names[100:150])

['abandonof' 'abanks' 'abashed' 'abashidze' 'abasing' 'abatement'
 'abating' 'abattoir' 'abba' 'abbad' 'abbas' 'abbasi' 'abbe' 'abbey'
 'abbie' 'abbot' 'abbott' 'abbotts' 'abbreviate' 'abbreviated'
 'abbreviating' 'abbu' 'abby' 'abbyss' 'abc' 'abcd' 'abd' 'abdalla'
 'abderrahmane' 'abdic' 'abdicated' 'abdicates' 'abdicating' 'abdomen'
 'abdominal' 'abdoo' 'abdu' 'abduct' 'abducted' 'abductee' 'abducting'
 'abduction' 'abductor' 'abducts' 'abdul' 'abdullah' 'abdulrahman' 'abe'
 'abecassis' 'abed']


In [12]:
#transforming the train and test sets separately to avoid data leakage
X_train_tfidf = fitted_vectorizer.transform(X_train).toarray()
X_test_tfidf = fitted_vectorizer.transform(X_test).toarray()

print(f'Training data shape: {X_train_tfidf.shape}, Training labels shape: {y_train.shape}')
print(f'Testing data shape: {X_test_tfidf.shape}, Testing labels shape: {y_test.shape}')

Training data shape: (25000, 91705), Training labels shape: (25000,)
Testing data shape: (25000, 91705), Testing labels shape: (25000,)


# Building the models

In [13]:
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

y_pred = lr_model.predict(X_test_tfidf)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

Accuracy: 0.89108
              precision    recall  f1-score   support

    negative       0.90      0.87      0.89     12483
    positive       0.88      0.91      0.89     12517

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000

