# Vectorizer Tuning

In [5]:
# Import libraries
import pandas as pd
import string
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
# Load the pickled movie-review DataFrame and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if its origin is trusted.
DATASET_PATH = "reviews_3"
data = pd.read_pickle(DATASET_PATH)
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [7]:
# Strip every ASCII punctuation character, then lower-case the text.
# Punctuation is deleted (not replaced by a space), so the characters on
# either side of a removed symbol end up adjacent.
punctuation_table = str.maketrans('', '', string.punctuation)
data['clean_reviews'] = (
    data['reviews']
    .str.translate(punctuation_table)
    .str.lower()
)

# Features (cleaned text) and labels used by the grid searches below
X = data['clean_reviews']
y = data['target']

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [8]:
# Chain the TF-IDF vectorizer and the Naive Bayes classifier so that both are
# tuned together inside the cross-validation loop
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Search space: n-gram range for the vectorizer, alpha for the classifier
parameters = {
    'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3)],
    'nb__alpha': [0.1, 1],
}

# Grid search over the pipeline: 5 folds, accuracy scoring, refit on the best candidate
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [31]:
# Report the winning hyper-parameters and the mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = round(grid_search.best_score_, 4)
print(f"Bests parameters for alpha is {best_params['nb__alpha']} and for ngram is {best_params['tfidf__ngram_range']}. \nAnd the best score is {best_score}")


Bests parameters for alpha is 0.1 and for ngram is (2, 2). 
And the best score is 0.8395


In [32]:
# Same pipeline shape, but the vectorizer step is left empty so the grid
# search can swap in either TfidfVectorizer or CountVectorizer
pipeline = Pipeline(steps=[
    ('vect', None),  # placeholder, filled in by the 'vect' entry of the grid
    ('nb', MultinomialNB()),
])

# Search space: which vectorizer, its n-gram range, and the classifier's alpha
parameters = {
    'vect': [TfidfVectorizer(), CountVectorizer()],
    'vect__ngram_range': [(1, 1), (2, 2)],
    'nb__alpha': [0.1, 1],
}

# Grid search over the pipeline: 5 folds, accuracy scoring, refit on the best candidate
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [33]:
# Report the winning alpha, the winning vectorizer and the mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = round(grid_search.best_score_, 4)
print(f"Bests parameters for alpha is {best_params['nb__alpha']} and for vecteur is {best_params['vect']}. \nAnd the best score is {best_score}")

Bests parameters for alpha is 0.1 and for vecteur is CountVectorizer(ngram_range=(2, 2)). 
And the best score is 0.84


# Further data cleaning

In [34]:
# Load the English stop-word list once and keep it as a frozenset: membership
# tests become O(1), and the list is no longer reloaded for every review.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text, stop_words=_ENGLISH_STOP_WORDS):
    """Return ``text`` with stop words removed.

    The text is split with ``word_tokenize``; tokens found in ``stop_words``
    are dropped and the remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The filtered text.
    """
    # NOTE(review): tokens such as "dont" / "didnt" survive in the recorded
    # output, presumably because punctuation was stripped before this step —
    # confirm whether stop words should be removed before punctuation.
    word_tokens = word_tokenize(text)
    return " ".join(word for word in word_tokens if word not in stop_words)


data['clean_reviews'] = data['clean_reviews'].apply(remove_stopwords)

In [35]:
def lemmatize_text(text):
    """Lemmatize every token of ``text`` and re-join the result with spaces.

    The text is split with ``word_tokenize`` and each token is passed to
    ``WordNetLemmatizer.lemmatize`` with no explicit part-of-speech argument.
    """
    # NOTE(review): the recorded output contains artefacts such as "dy" and
    # "u"; they look like a side effect of lemmatizing without a POS tag —
    # confirm whether POS-aware lemmatization is wanted.
    wordnet = WordNetLemmatizer()
    lemmas = (wordnet.lemmatize(token) for token in word_tokenize(text))
    return " ".join(lemmas)


data['clean_reviews'] = data['clean_reviews'].apply(lemmatize_text)

In [36]:
# Display the frame to inspect the clean_reviews column after stop-word
# removal and lemmatization.
data

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couple go church party drink dri...
1,neg,the happy bastard's quick movie review \ndamn ...,happy bastard quick movie review damn y2k bug ...
2,neg,it is movies like these that make a jaded movi...,movie like make jaded movie viewer thankful in...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest camelot warner bros first featurelength ...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis mentally unstable man undergoing psyc...
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow movie everything movie funny dramatic inte...
1996,pos,"richard gere can be a commanding actor , but h...",richard gere commanding actor he always great ...
1997,pos,"glory--starring matthew broderick , denzel was...",glorystarring matthew broderick denzel washing...
1998,pos,steven spielberg's second epic film on world w...,steven spielberg second epic film world war ii...


In [37]:
# Re-bind features and labels so the next grid search uses the further-cleaned text
X, y = data['clean_reviews'], data['target']

In [38]:
# Pipeline with an empty vectorizer slot; the grid search fills it with
# either TfidfVectorizer or CountVectorizer
pipeline = Pipeline(steps=[
    ('vect', None),  # placeholder, filled in by the 'vect' entry of the grid
    ('nb', MultinomialNB()),
])

# Search space: which vectorizer, its n-gram range, and the classifier's alpha
parameters = {
    'vect': [TfidfVectorizer(), CountVectorizer()],
    'vect__ngram_range': [(1, 1), (2, 2)],
    'nb__alpha': [0.1, 1],
}

# Grid search over the pipeline: 5 folds, accuracy scoring, refit on the best candidate
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [40]:
# Report the winning alpha, the winning vectorizer and the mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = round(grid_search.best_score_, 4)
print(f"Bests parameters for alpha is {best_params['nb__alpha']} and for vecteur is {best_params['vect']}. \nAnd the best score is {best_score}")

Bests parameters for alpha is 1 and for vecteur is TfidfVectorizer(). 
And the best score is 0.8215


After the extra cleaning (stop-word removal and lemmatization), the best parameters are no longer the same, but the score does not improve: it drops from 0.84 to 0.8215.

# Analyse with Azure cognitive services

In [41]:
import os

# Security fix: the Azure Cognitive Services key must never be stored in, or
# printed by, the notebook. It is read from the COG_KEY environment variable.
# The key that was previously hard-coded in this cell should be considered
# exposed and be regenerated in the Azure portal.
cog_key = os.environ.get('COG_KEY')
if not cog_key:
    raise RuntimeError(
        "Set the COG_KEY environment variable to your Azure Cognitive Services key "
        "before running this cell."
    )

# The endpoint is not a secret; COG_ENDPOINT can override the original value.
cog_endpoint = os.environ.get(
    'COG_ENDPOINT',
    'https://azure-ml-ai900-justine-31012023.cognitiveservices.azure.com/',
)

# Confirm the configuration without echoing the key
print('Ready to use cognitive services at {}'.format(cog_endpoint))

Ready to use cognitive services at https://azure-ml-ai900-justine-31012023.cognitiveservices.azure.com/ using key <redacted>


### Detect language 

In [42]:
import os  # NOTE(review): `os` is not used in this cell


# Build the request payload: one {"id", "text"} document per review, taken
# from rows 0-9 of the cleaned column
reviews = [{"id": i, "text": data['clean_reviews'][i]} for i in range(10)]

# Echo each document: its id, then its text
for review_num, review in enumerate(reviews):
    print('{}\n{}\n'.format(review['id'], review['text']))

0
plot two teen couple go church party drink drive get accident one guy dy girlfriend continues see life nightmare whats deal watch movie sorta find critique mindfuck movie teen generation touch cool idea present bad package make review even harder one write since generally applaud film attempt break mold mess head lost highway memento good bad way making type film folk didnt snag one correctly seem taken pretty neat concept executed terribly problem movie well main problem simply jumbled start normal downshift fantasy world audience member idea whats going dream character coming back dead others look like dead strange apparition disappearance looooot chase scene ton weird thing happen simply explained personally dont mind trying unravel film every give clue get kind fed film biggest problem obviously got big secret hide seems want hide completely final five minute make thing entertaining thrilling even engaging meantime really sad part arrow dig flick like actually figured halfway poi

In [43]:
from azure.cognitiveservices.language.textanalytics import TextAnalyticsClient
from msrest.authentication import CognitiveServicesCredentials

# Build a Text Analytics client from the endpoint and key defined earlier
credentials = CognitiveServicesCredentials(cog_key)
text_analytics_client = TextAnalyticsClient(endpoint=cog_endpoint, credentials=credentials)

# Submit the review collection built from the DataFrame for language detection
language_analysis = text_analytics_client.detect_language(documents=reviews)

# Results are read by position: the n-th result is taken to belong to the n-th review
for review_num, review in enumerate(reviews):
    # Review id
    print(review['id'])

    # First detected language reported for this review
    lang = language_analysis.documents[review_num].detected_languages[0]
    print(' - Language: {}\n - Code: {}\n - Score: {}\n'.format(lang.name, lang.iso6391_name, lang.score))

    # Store the language code on the document so later requests can carry it
    review["language"] = lang.iso6391_name

0
 - Language: English
 - Code: en
 - Score: 1.0

1
 - Language: English
 - Code: en
 - Score: 1.0

2
 - Language: English
 - Code: en
 - Score: 1.0

3
 - Language: English
 - Code: en
 - Score: 0.9926199316978455

4
 - Language: English
 - Code: en
 - Score: 1.0

5
 - Language: English
 - Code: en
 - Score: 1.0

6
 - Language: English
 - Code: en
 - Score: 0.9966887831687927

7
 - Language: English
 - Code: en
 - Score: 1.0

8
 - Language: English
 - Code: en
 - Score: 1.0

9
 - Language: English
 - Code: en
 - Score: 0.9974683523178101



### Extract Key Phrases

In [44]:
# Reuse the existing client and review collection to extract key phrases
key_phrase_analysis = text_analytics_client.key_phrases(documents=reviews)

# Results are read by position: the n-th result is taken to belong to the n-th review
for review_num, review in enumerate(reviews):
    # Review id, then a heading
    print(review['id'])
    print('\nKey Phrases:')

    # One tab-indented line per key phrase
    key_phrases = key_phrase_analysis.documents[review_num].key_phrases
    for key_phrase in key_phrases:
        print('\t', key_phrase)
    print('\n')

0

Key Phrases:
	 problem movie
	 entire film
	 film attempt
	 film biggest problem
	 movie sorta
	 line movie
	 film entertaining guess
	 decent teen mindfuck movie
	 critique mindfuck movie teen generation
	 character unraveling overall film doesnt
	 type film folk didnt snag
	 minute make thing entertaining thrilling
	 teen couple
	 way horror teen slasher
	 highway memento good bad way
	 dream character
	 packaged look way
	 main problem
	 lost highway
	 cool idea present bad package
	 normal downshift fantasy world audience member idea whats
	 life nightmare whats deal
	 crow salvation
	 sense actor
	 sure audience
	 melissa sagemiller
	 secret password
	 insight strangeness
	 big secret
	 dont mind
	 playing exact character american beauty new neighborhood biggest kudos
	 nightmare elm street
	 u different scene
	 dead strange apparition disappearance looooot chase scene ton weird thing
	 arrow dig flick
	 cool ending explanation craziness
	 music video little edge
	 guy dy girlf

### Determine Sentiment

In [45]:
# Reuse the existing client and review collection to score sentiment
sentiment_analysis = text_analytics_client.sentiment(documents=reviews)

# Results are read by position: the n-th result is taken to belong to the n-th review
for review_num, review in enumerate(reviews):
    sentiment_score = sentiment_analysis.documents[review_num].score

    # Scores below 0.5 are labelled negative; 0.5 and above are labelled positive
    sentiment = 'negative' if sentiment_score < 0.5 else 'positive'

    # Review id, label and raw score
    print('{} : {} ({})'.format(review['id'], sentiment, sentiment_score))

0 : negative (0.023799389600753784)
1 : negative (0.012056171894073486)
2 : positive (0.9996628761291504)
3 : positive (0.9618054628372192)
4 : positive (0.8864041566848755)
5 : negative (0.025242269039154053)
6 : positive (0.9851109981536865)
7 : negative (0.034242093563079834)
8 : negative (0.00016000866889953613)
9 : positive (0.9741955995559692)


Azure does not classify the sentiment of these negative reviews very well: only 5 of the 10 are labelled negative (50%).

In [46]:
import os  # NOTE(review): `os` is not used in this cell


# Build the request payload from rows 1001-1009 of the cleaned column.
# NOTE(review): range(1001, 1010) yields 9 reviews, not 10 — confirm whether
# 10 were intended, as in the earlier sample.
reviews = [{"id": i, "text": data['clean_reviews'][i]} for i in range(1001, 1010)]

# Echo each document: its id, then its text
for review_num, review in enumerate(reviews):
    print('{}\n{}\n'.format(review['id'], review['text']))

1001
every movie come along suspect studio every indication stinker everybodys surprise perhaps even studio film becomes critical darling mtv film election high school comedy starring matthew broderick reese witherspoon current example anybody know film existed week opened plot deceptively simple george washington carver high school student election tracy flick reese witherspoon overachiever hand raised nearly every question way way high mr matthew broderick sick megalomaniac student encourages paul popularbutslow jock run paul nihilistic sister jump race well personal reason dark side sleeper success expectation low going fact quality stuff made review even enthusiastic right cant help going baggage glowing review contrast negative baggage reviewer likely election good film live hype make election disappointing contains significant plot detail lifted directly rushmore released month earlier similarity staggering tracy flick election president extraordinary number club involved school 

In [47]:
# Reuse the existing client to score the sentiment of the current review collection
sentiment_analysis = text_analytics_client.sentiment(documents=reviews)

# Results are read by position: the n-th result is taken to belong to the n-th review
for review_num, review in enumerate(reviews):
    sentiment_score = sentiment_analysis.documents[review_num].score

    # Scores below 0.5 are labelled negative; 0.5 and above are labelled positive
    sentiment = 'negative' if sentiment_score < 0.5 else 'positive'

    # Review id, label and raw score
    print('{} : {} ({})'.format(review['id'], sentiment, sentiment_score))

1001 : positive (0.9999831914901733)
1002 : positive (0.9994553327560425)
1003 : positive (0.8154124617576599)
1004 : positive (0.5)
1005 : positive (0.9259353876113892)
1006 : positive (0.997262716293335)
1007 : positive (0.9939534664154053)
1008 : negative (0.010699272155761719)
1009 : negative (0.02836906909942627)


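Azure does better on the positive reviews: 7 of the 9 submitted reviews are labelled positive (≈ 78%). Note that review 1004 scores exactly 0.5 and only counts as positive because of the 0.5 threshold.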

⚠️ Please push the exercise once you are done 🙃

## 🏁 