## Installation ##

In [3]:
!pip3 install xgboost


Collecting xgboost
  Using cached xgboost-2.1.2-py3-none-macosx_12_0_arm64.whl (1.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.2
You should consider upgrading via the '/Users/sandeep/DOCS/Machine Learning/IMDB/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
!pip3 install numpy
!pip3 install pandas
!pip3 install scikit-learn
!pip3 install NLTK
!pip3 install beautifulsoup4

Collecting numpy
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Installing collected packages: numpy
Successfully installed numpy-2.0.2
You should consider upgrading via the '/Users/sandeep/DOCS/Machine Learning/IMDB/venv/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
Collecting python-dateutil>=2.8.2
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Collecting pytz>=2020.1
  Downloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
[K     |████████████████████████████████| 508 kB 5.3 MB/s eta 0:00:01
Collecting tzdata>=2022.7
  Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Collecting six>=1.5
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: six, tzdata, pytz, python-dateutil, pandas
Successfully installed pandas-2.2.3 python-dateutil-2.9.0.post0 pytz-2024.2 six-1.16.0 tzdata-2024.2
You should consider u

## Imports ##

In [17]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from gensim.models import Word2Vec

In [18]:
df = pd.read_csv("IMDB.csv")

## Preprocessing ##

In [20]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup

# Initialize stemming and lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Download stopwords if not already done
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
def preprocess(sentence):
    # 1. Remove HTML tags
    sentence = BeautifulSoup(sentence, "html.parser").get_text()
    
    # 2. Convert to lowercase
    sentence = sentence.lower()
    
    # 3. Remove punctuation
    sentence = sentence.translate(str.maketrans("", "", string.punctuation))
    
    # 4. Remove numbers
    sentence = re.sub(r"\d+", "", sentence)
    
    # 5. Tokenize words
    words = sentence.split()
    
    # 6. Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # 7. Apply stemming
    words = [stemmer.stem(word) for word in words]
    
    # 8. Apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # 9. Rejoin words into a single string
    processed_sentence = " ".join(words)
    
    return processed_sentence


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sandeep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sandeep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
review_list = []
ls = df["review"].to_list()
for sentence in ls:
  sent = preprocess(sentence)
  review_list.append(sent)

  sentence = BeautifulSoup(sentence, "html.parser").get_text()


In [22]:
df1 = df.copy()
df1['review'] = np.array(review_list)
df1=df1.replace({"sentiment":{"positive":1,"negative":0}})

In [23]:
pd.set_option('future.no_silent_downcasting', True)

In [24]:
x_train,x_test,y_train,y_test = train_test_split(df1['review'],df1['sentiment'],test_size=0.2,random_state=42)

In [25]:
vector = CountVectorizer(max_features=5000)
x_train = vector.fit_transform(x_train)
x_test = vector.transform(x_test)
x_train = np.array(x_train).tolist()
x_test = np.array(x_test).tolist()

In [30]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

## Models training

In [31]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train,y_train)

In [32]:
y_pred = clf.predict(x_test)

In [33]:
accuracy_score(y_pred,y_test)

0.8402

In [34]:
xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model
xgboost_model.fit(x_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [35]:
y_pred_xgb = xgboost_model.predict(x_test)

In [36]:
accuracy_score(y_pred_xgb,y_test)

0.8555

## Word2Vec tokenizer

In [37]:
df2 = df.copy()
df2['review'] = np.array(review_list)
df2=df2.replace({"sentiment":{"positive":1,"negative":0}})

In [38]:
df2.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod youll hook ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [40]:
word2vec_model = Word2Vec(sentences=df["review"], vector_size=300, window=5, min_count=1, workers=4)

In [41]:
x_train1,x_test1,y_train1,y_test1 = train_test_split(df2['review'],df2['sentiment'],test_size=0.2,random_state=42)

In [50]:
def get_feature_vector(tokens, word2vec_model):
    # Get the vector for each token (word) in the review
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    
    # If there are no valid words in the review, return a zero vector
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

# Apply the function to each tokenized review in X_train1 to get the dense representation


In [51]:
X_train_dense = np.array([get_feature_vector(tokens, word2vec_model) for tokens in x_train1])
X_test_dense = np.array([get_feature_vector(tokens, word2vec_model) for tokens in x_test1])

In [52]:
xg_boost_w2v = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model
xg_boost_w2v.fit(X_train_dense, y_train1)

Parameters: { "use_label_encoder" } are not used.



In [53]:
y_pred_w2v = xg_boost_w2v.predict(X_test_dense)

In [59]:
y_test1 = y_test1.astype(int)

In [60]:
accuracy_score(y_test1,y_pred_w2v)

0.6134