# Movie Reviews

In [30]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# Read the pickled reviews file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
reviews_path = "reviews"
data = pd.read_pickle(reviews_path)

# Preview the first rows of the dataset.
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [32]:
# Build the cleaned text column in one chain:
#   1. delete every character listed in string.punctuation,
#   2. convert the remaining text to lower case.
punctuation_table = str.maketrans('', '', string.punctuation)
data['clean_reviews'] = (
    data['reviews']
    .str.translate(punctuation_table)
    .str.lower()
)

# Display the DataFrame with the new column next to the raw reviews.
data

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party d...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastards quick movie review \ndamn t...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros first fe...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing p...
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow what a movie \nits everything a movie ca...
1996,pos,"richard gere can be a commanding actor , but h...",richard gere can be a commanding actor but he...
1997,pos,"glory--starring matthew broderick , denzel was...",glorystarring matthew broderick denzel washin...
1998,pos,steven spielberg's second epic film on world w...,steven spielbergs second epic film on world wa...


## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Words representation of the texts.

In [33]:
# Instantiate a bag-of-words vectorizer with its default settings.
vectorizor = CountVectorizer()

# Learn the vocabulary from the cleaned reviews and turn the resulting
# count matrix into a dense array.
clean_texts = data['clean_reviews']
X = vectorizor.fit_transform(clean_texts).toarray()

# Vocabulary terms, used as column labels below.
columns = vectorizor.get_feature_names_out()

# Wrap the counts in a DataFrame (one column per vocabulary term) to inspect
# the vectorisation.
df = pd.DataFrame(data=X, columns=columns)
df

Unnamed: 0,00,000,0009f,000acre,000aweek,000foot,000paltry,007,007esque,00s,...,zuko,zukovsky,zulu,zundel,zurgs,zweibel,zwick,zwicks,zwigoffs,zycie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Labels: the 'target' column of the reviews DataFrame.
y = data['target']

# Multinomial Naive Bayes classifier with default hyper-parameters.
clf = MultinomialNB()

# Evaluate the classifier on the bag-of-words features with 5-fold
# cross-validation, scored on accuracy.
results = cross_validate(estimator=clf, X=X, y=y, scoring='accuracy', cv=5)

In [36]:
# Per-fold test scores from the cross-validation results.
res = results['test_score']

In [37]:
# Mean of the per-fold test scores.
res_mean = results['test_score'].mean()

In [38]:
# Report the per-fold scores and their mean (message text kept in French, unchanged).
print(
    f'les résultats des la cross validation sont {res}, '
    f'la moyenne de ces résultats est égal à {res_mean}.'
)

les résultats des la cross validation sont [0.815  0.815  0.8125 0.835  0.795 ], la moyenne de ces résultats est égal à 0.8145.


## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Words representation of the texts.

In [41]:
# Create a vectorizer that extracts 2-grams only (ngram_range=(2, 2)).
vectorizor = CountVectorizer(ngram_range=(2, 2))

# Vectorize the cleaned reviews into a 2-gram bag-of-words count matrix.
# NOTE(review): .toarray() densifies the matrix, which can be very large for a
# 2-gram vocabulary — TODO confirm it fits in memory.
X_2 = vectorizor.fit_transform(data['clean_reviews']).toarray()
columns = vectorizor.get_feature_names_out()

# Build a DataFrame to visualise the vectorisation.
# It must wrap X_2 (the 2-gram counts), not the unigram matrix X: the column
# labels come from the 2-gram vocabulary, so the data has to match them.
df_output = pd.DataFrame(X_2, columns=columns)
df_output

Unnamed: 0,00 am,00 feet,00 for,00 if,00 showing,00 sunday,00 wasnt,000 000,000 and,000 at,...,zwick thinks,zwicks 1994,zwicks courage,zwicks favorite,zwicks latest,zwicks the,zwigoffs brilliant,zwigoffs superb,zycie masterfully,zycie za
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Score the same classifier on the 2-gram features:
# 5-fold cross-validation, accuracy metric.
results_2 = cross_validate(estimator=clf, X=X_2, y=y, scoring='accuracy', cv=5)

In [38]:
# Per-fold test scores for the 2-gram model.
res_2 = results_2['test_score']

# End the cell with the value so the fold scores are shown as the cell output
# (a bare assignment displays nothing).
res_2

array([0.83  , 0.8325, 0.8175, 0.865 , 0.84  ])

In [39]:
# Mean of the per-fold test scores for the 2-gram model.
res_mean_2 = results_2['test_score'].mean()

# End the cell with the value so the mean is shown as the cell output
# (a bare assignment displays nothing).
res_mean_2

0.837

In [None]:
# Report the 2-gram per-fold scores and their mean (message text kept in French, unchanged).
print(
    f'les résultats des la cross validation sont {res_2}, '
    f'la moyenne de ces résultats est égal à {res_mean_2}.'
)

⚠️ Please push the exercise once you are done 🙃

## 🏁 