# Part 1: Loading the Dataset

In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv("movie_review.csv")

# Display the first few rows of the dataset
data.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


# Part 2: Preprocessing the Text Data

In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')
nltk.download('punkt')

df = pd.read_csv('movie_review.csv')

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in punctuation]
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


df['Preprocessed_Text'] = df['text'].apply(preprocess_text)

print(df[['text', 'Preprocessed_Text']])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                    text  \
0      films adapted from comic books have had plenty...   
1      for starters , it was created by alan moore ( ...   
2      to say moore and campbell thoroughly researche...   
3      the book ( or " graphic novel , " if you will ...   
4      in other words , don't dismiss this film becau...   
...                                                  ...   
64715  that lack of inspiration can be traced back to...   
64716  like too many of the skits on the current inca...   
64717  after watching one of the " roxbury " skits on...   
64718   bump unsuspecting women , and . . . that's all .   
64719  after watching _a_night_at_the_roxbury_ , you'...   

                                       Preprocessed_Text  
0      films adapted comic books plenty success wheth...  
1      starters created alan moore eddie campbell bro...  
2      say moore campbell thoroughly researched subje...  
3      book `` graphic novel `` 500 pages l

# Part 3: Training the Word2Vec Model

In [16]:
from gensim.models import Word2Vec


tokenized_texts = [text.split() for text in df['Preprocessed_Text']]

vector_size = 100
window = 5
min_count = 1
sg = 0
epochs = 10


model = Word2Vec(sentences=tokenized_texts,
                 vector_size=vector_size,
                 window=window,
                 min_count=min_count,
                 sg=sg,
                 epochs=epochs)


print("Vecteur du mot 'movie':", model.wv['movie'])

Vecteur du mot 'movie': [-1.7579561   0.8204106  -0.2520395  -0.29048958  0.9601521  -1.1847149
  0.24561419  1.6537014  -0.46935317  0.32090178 -1.4048069  -0.40104583
  0.34730753  0.7429444   0.6778333  -1.539828    0.07763768  0.1657531
 -0.6126682  -0.7128043   0.0501584  -1.0626408   0.26205355 -0.36383516
 -1.0583227   0.16971242 -0.14686173  1.1350888  -1.3722883   0.77056944
  2.4434888  -0.11999991 -0.50902224 -0.20351736 -1.4583368   0.88750255
 -1.0474039  -0.09643658  0.53928256 -0.8366357  -0.28735015 -0.11685587
 -1.1859741   1.1382133  -0.31179678 -0.62790304  0.38889626 -0.6626972
  0.5801985   0.10273264  1.0088845  -0.64862955 -0.31599367  0.22346593
 -1.0785545  -0.23629595 -0.9543879  -1.5302505  -0.6836411  -1.5298994
  0.49084854 -1.0347214   2.3777354  -0.21714005 -1.0487622   0.96559477
  1.433617    1.1616608  -1.5889504   2.113356   -0.6458081   1.6034992
  0.66899323 -0.15659037  2.0948284   0.5385528  -0.8746524   1.2902334
 -0.2816932   0.12473442  0.05595

# Part 4: Vectorization of Movie Reviews

In [19]:
import numpy as np


def get_review_vector(review, model, vector_size):

    words = [word for word in review.split() if word in model.wv.key_to_index]
    if not words:
        return np.zeros(vector_size)
    word_vectors = [model.wv[word] for word in words]
    review_vector = np.mean(word_vectors, axis=0)
    return review_vector

df['Review_Vector'] = df['Preprocessed_Text'].apply(lambda x: get_review_vector(x, model, vector_size))

print(df[['Preprocessed_Text', 'Review_Vector']])

                                       Preprocessed_Text  \
0      films adapted comic books plenty success wheth...   
1      starters created alan moore eddie campbell bro...   
2      say moore campbell thoroughly researched subje...   
3      book `` graphic novel `` 500 pages long includ...   
4                          words n't dismiss film source   
...                                                  ...   
64715    lack inspiration traced back insipid characters   
64716  like many skits current incarnation _saturday_...   
64717  watching one `` roxbury `` skits snl come away...   
64718                         bump unsuspecting women 's   
64719  watching _a_night_at_the_roxbury_ 'll left exa...   

                                           Review_Vector  
0      [-0.4313996, 0.51210254, 0.30550045, 0.1692234...  
1      [-0.16296287, 0.37788075, 0.18781002, 0.430605...  
2      [0.014069708, 0.68756, 0.517141, 0.20507681, -...  
3      [-0.29285327, 0.43872818, 0.4871541,

# Part 5: Splitting the Data into Training and Testing Sets

In [31]:
# Calculate the sizes of the training and testing sets
total_rows = len(df)
train_size = int(0.8 * total_rows)
test_size = total_rows - train_size

# Shuffle the indices of the DataFrame
shuffled_indices = np.random.permutation(df.index)

# Split the indices into training and testing sets
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:]

# Create the training and testing sets
train_data = df.iloc[train_indices]
test_data = df.iloc[test_indices]

# Print the sizes of the training and testing sets
print("Taille de l'ensemble d'entraînement :", len(train_data))
print("Taille de l'ensemble de test :", len(test_data))


Taille de l'ensemble d'entraînement : 51776
Taille de l'ensemble de test : 12944


# Part 6: Building a Classifier (Logistic Regression)

In [28]:
from sklearn.linear_model import LogisticRegression


X_train = np.vstack(train_data['Review_Vector'].values)
y_train = train_data['tag']
X_test = np.vstack(test_data['Review_Vector'].values)
y_test = test_data['tag']

logistic_model = LogisticRegression(max_iter=1000, random_state=42)


logistic_model.fit(X_train, y_train)


y_pred = logistic_model.predict(X_test)
print(y_pred)

['neg' 'pos' 'neg' ... 'neg' 'pos' 'pos']


# Part 7: Evaluating the Model

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)



precision = precision_score(y_test, y_pred, pos_label='pos')
recall = recall_score(y_test, y_pred, pos_label='pos')
f1 = f1_score(y_test, y_pred, pos_label='pos')

print("Accuracy :", accuracy)
print("Precision :", precision)
print("Recall :", recall)

Accuracy : 0.5862175525339925
Precision : 0.5833219224976037
Recall : 0.6481058877225011
