# *Imports*


In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'ggplot' matplotlib style to the figures drawn in this notebook.
plt.style.use('ggplot')
import nltk
# Fetch the NLTK resources needed by the tokenizing, tagging and chunking cells below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# *Load Data: keep only the first 100 examples because of the computation cost of the RoBERTa model*

In [None]:
# Read the review CSV and keep at most its first 100 rows in a single expression.
df = pd.read_csv('ancmm.csv').head(100)


In [None]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0.1,Unnamed: 0,comment
0,0,"Confidently directed, dark, brooding, and pack..."
1,1,I got to see The Dark Knight on Wednesday nigh...
2,2,This movie is a work of art. The finest sequel...
3,3,I think that I could wax lyrical but this movi...
4,4,It is just what you want for the best movie. G...


In [None]:
# Report (rows, columns) of the frame after the head(100) cut.
print(df.shape)


(25, 2)


# *Some Ground level NLTK*

In [None]:
# Take the review text stored under index label 0 as the running example.
example = df.loc[0, 'comment']


In [None]:
# Display the full text of the example review.
example

'Confidently directed, dark, brooding, and packed with impressive action sequences and a complex story, The Dark Knight includes a career-defining turn from Heath Ledger as well as other Oscar worthy performances, TDK remains not only the best Batman movie, but comic book movie ever created.'

**Tokenization: splitting the review into word and punctuation tokens**

In [None]:
# Split the example review into tokens with NLTK's word tokenizer, then display them.
tokens = nltk.word_tokenize(example)
tokens

['Confidently',
 'directed',
 ',',
 'dark',
 ',',
 'brooding',
 ',',
 'and',
 'packed',
 'with',
 'impressive',
 'action',
 'sequences',
 'and',
 'a',
 'complex',
 'story',
 ',',
 'The',
 'Dark',
 'Knight',
 'includes',
 'a',
 'career-defining',
 'turn',
 'from',
 'Heath',
 'Ledger',
 'as',
 'well',
 'as',
 'other',
 'Oscar',
 'worthy',
 'performances',
 ',',
 'TDK',
 'remains',
 'not',
 'only',
 'the',
 'best',
 'Batman',
 'movie',
 ',',
 'but',
 'comic',
 'book',
 'movie',
 'ever',
 'created',
 '.']

**Part-of-speech tagging of the tokens**

In [None]:
# Attach a part-of-speech tag to every token, giving a list of (token, tag) pairs.
tagged = nltk.pos_tag(tokens)
tagged

[('Confidently', 'RB'),
 ('directed', 'VBN'),
 (',', ','),
 ('dark', 'NN'),
 (',', ','),
 ('brooding', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('packed', 'VBD'),
 ('with', 'IN'),
 ('impressive', 'JJ'),
 ('action', 'NN'),
 ('sequences', 'NNS'),
 ('and', 'CC'),
 ('a', 'DT'),
 ('complex', 'JJ'),
 ('story', 'NN'),
 (',', ','),
 ('The', 'DT'),
 ('Dark', 'NNP'),
 ('Knight', 'NNP'),
 ('includes', 'VBZ'),
 ('a', 'DT'),
 ('career-defining', 'JJ'),
 ('turn', 'NN'),
 ('from', 'IN'),
 ('Heath', 'NNP'),
 ('Ledger', 'NNP'),
 ('as', 'RB'),
 ('well', 'RB'),
 ('as', 'IN'),
 ('other', 'JJ'),
 ('Oscar', 'NNP'),
 ('worthy', 'JJ'),
 ('performances', 'NNS'),
 (',', ','),
 ('TDK', 'NNP'),
 ('remains', 'VBZ'),
 ('not', 'RB'),
 ('only', 'RB'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('Batman', 'NNP'),
 ('movie', 'NN'),
 (',', ','),
 ('but', 'CC'),
 ('comic', 'JJ'),
 ('book', 'NN'),
 ('movie', 'NN'),
 ('ever', 'RB'),
 ('created', 'VBN'),
 ('.', '.')]

**Grouping the tagged tokens into named-entity chunks**

In [None]:
# Run NLTK's named-entity chunker over the tagged tokens and pretty-print the result.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

(S
  Confidently/RB
  directed/VBN
  ,/,
  dark/NN
  ,/,
  brooding/NN
  ,/,
  and/CC
  packed/VBD
  with/IN
  impressive/JJ
  action/NN
  sequences/NNS
  and/CC
  a/DT
  complex/JJ
  story/NN
  ,/,
  The/DT
  (ORGANIZATION Dark/NNP Knight/NNP)
  includes/VBZ
  a/DT
  career-defining/JJ
  turn/NN
  from/IN
  (PERSON Heath/NNP Ledger/NNP)
  as/RB
  well/RB
  as/IN
  other/JJ
  Oscar/NNP
  worthy/JJ
  performances/NNS
  ,/,
  (ORGANIZATION TDK/NNP)
  remains/VBZ
  not/RB
  only/RB
  the/DT
  best/JJS
  (PERSON Batman/NNP)
  movie/NN
  ,/,
  but/CC
  comic/JJ
  book/NN
  movie/NN
  ever/RB
  created/VBN
  ./.)


# *Old School Approach : VADER Sentiment Scoring*

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# Fetch the VADER lexicon, then build the analyzer instance reused by the cells below.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**Model Vibe Check !**

In [None]:
# Sanity-check the analyzer on a short informal string.
sia.polarity_scores('VIVA TOUNIZI LETS GOOOO')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [None]:
# Second sanity check, this time on a string that ends with an emoticon.
sia.polarity_scores('FRANCA MA TERBA7CH >:(')

{'neg': 0.552, 'neu': 0.448, 'pos': 0.0, 'compound': -0.5719}

# *VADER Model on the whole dataset*

In [None]:
# Score every review with VADER, keyed by the review's index label in `df`.
res = {}
# Iterate over the 'comment' column directly: only the text is needed, so there is
# no reason to build a full row Series per review with iterrows().
for i, text in tqdm(df['comment'].items(), total=len(df)):
    res[i] = sia.polarity_scores(text)
res

  0%|          | 0/25 [00:00<?, ?it/s]

{0: {'neg': 0.044, 'neu': 0.728, 'pos': 0.228, 'compound': 0.7241},
 1: {'neg': 0.117, 'neu': 0.737, 'pos': 0.146, 'compound': 0.9567},
 2: {'neg': 0.0, 'neu': 0.581, 'pos': 0.419, 'compound': 0.9931},
 3: {'neg': 0.057, 'neu': 0.793, 'pos': 0.15, 'compound': 0.9907},
 4: {'neg': 0.0, 'neu': 0.609, 'pos': 0.391, 'compound': 0.9648},
 5: {'neg': 0.058, 'neu': 0.722, 'pos': 0.22, 'compound': 0.9835},
 6: {'neg': 0.062, 'neu': 0.722, 'pos': 0.216, 'compound': 0.9594},
 7: {'neg': 0.041, 'neu': 0.766, 'pos': 0.193, 'compound': 0.9906},
 8: {'neg': 0.111, 'neu': 0.667, 'pos': 0.222, 'compound': 0.9967},
 9: {'neg': 0.13, 'neu': 0.682, 'pos': 0.188, 'compound': 0.9952},
 10: {'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'compound': 0.8862},
 11: {'neg': 0.078, 'neu': 0.647, 'pos': 0.275, 'compound': 0.8467},
 12: {'neg': 0.089, 'neu': 0.741, 'pos': 0.17, 'compound': 0.9924},
 13: {'neg': 0.054, 'neu': 0.822, 'pos': 0.124, 'compound': 0.9958},
 14: {'neg': 0.091, 'neu': 0.811, 'pos': 0.098, 'compo

In [None]:
# One row per scored review: transpose the {id: scores} mapping, name the index
# 'Movie Id', and move it into a regular column.
vaders = (
    pd.DataFrame(res)
    .T
    .rename_axis('Movie Id')
    .reset_index()
)
vaders

Unnamed: 0,Movie Id,neg,neu,pos,compound
0,0,0.044,0.728,0.228,0.7241
1,1,0.117,0.737,0.146,0.9567
2,2,0.0,0.581,0.419,0.9931
3,3,0.057,0.793,0.15,0.9907
4,4,0.0,0.609,0.391,0.9648
5,5,0.058,0.722,0.22,0.9835
6,6,0.062,0.722,0.216,0.9594
7,7,0.041,0.766,0.193,0.9906
8,8,0.111,0.667,0.222,0.9967
9,9,0.13,0.682,0.188,0.9952


# *Installation of Transformers*

In [None]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


# *Modern Approach : Roberta Sentiment Analysis*

# *Download of the Pre-trained Weights of the Roberta Model*

In [None]:
# Identifier of the pretrained sentiment checkpoint (plain string: nothing to interpolate).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# *Tokenization + Running the model on our data*
The model output is converted to a NumPy array so that a softmax can turn the raw scores into probabilities.

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one review with the pretrained RoBERTa sentiment model.

    Args:
        example: Review text to classify.
        max_length: Maximum number of tokens handed to the model. Longer
            inputs are truncated instead of raising inside the model, which
            is what made long reviews fail before. Defaults to 512.

    Returns:
        dict mapping 'Roberta_Negative_Review', 'Roberta_Neutral_Review' and
        'Roberta_Positive_Review' to the softmax of the model's three output
        scores (in that order).
    """
    # Truncate explicitly: without a length cap the tokenizer passes the whole
    # review through and the model errors out on over-long sequences.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # First element of the output, first (only) item of the batch -> raw scores.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'Roberta_Negative_Review': scores[0],
        'Roberta_Neutral_Review': scores[1],
        'Roberta_Positive_Review': scores[2],
    }

# *Iterating Over the Dataset and Analysing Each Review*
The RoBERTa model breaks on some iterations because of long reviews; the ids of those reviews are printed and skipped.

In [None]:
# Score every review with RoBERTa, keyed by the review's index label in `df`.
# Note: this rebinds `res`, which held the VADER scores until now.
res = {}
# Only the text is needed, so iterate the 'comment' column instead of full rows.
for i, text in tqdm(df['comment'].items(), total=len(df)):
    try:
        # Keep the try body down to the one call that is expected to fail.
        res[i] = polarity_scores_roberta(text)
    except RuntimeError:
        # Best effort: report the id and carry on; that id is left out of `res`.
        print(f'Broke for id {i}')

  0%|          | 0/25 [00:00<?, ?it/s]

Broke for id 1
Broke for id 8
Broke for id 9
Broke for id 12
Broke for id 13
Broke for id 17
Broke for id 19


In [None]:
# Inspect the RoBERTa scores; ids that raised RuntimeError in the loop are absent.
res

{0: {'Roberta_Negative_Review': 0.0027522931,
  'Roberta_Neutral_Review': 0.057343543,
  'Roberta_Positive_Review': 0.93990415},
 2: {'Roberta_Negative_Review': 0.0019886186,
  'Roberta_Neutral_Review': 0.011501809,
  'Roberta_Positive_Review': 0.9865097},
 3: {'Roberta_Negative_Review': 0.022300515,
  'Roberta_Neutral_Review': 0.13169876,
  'Roberta_Positive_Review': 0.84600073},
 4: {'Roberta_Negative_Review': 0.0031336606,
  'Roberta_Neutral_Review': 0.030244535,
  'Roberta_Positive_Review': 0.9666218},
 5: {'Roberta_Negative_Review': 0.043412954,
  'Roberta_Neutral_Review': 0.31548202,
  'Roberta_Positive_Review': 0.64110506},
 6: {'Roberta_Negative_Review': 0.03954422,
  'Roberta_Neutral_Review': 0.18188061,
  'Roberta_Positive_Review': 0.77857524},
 7: {'Roberta_Negative_Review': 0.010867315,
  'Roberta_Neutral_Review': 0.06257825,
  'Roberta_Positive_Review': 0.92655444},
 10: {'Roberta_Negative_Review': 0.0034930182,
  'Roberta_Neutral_Review': 0.011301685,
  'Roberta_Positive_

In [None]:
# Distribution plot of the RoBERTa positive-review score across the scored reviews.
# NOTE(review): the original indexed df["ancmm"], but "ancmm" is the CSV file name,
# not a column of `df`, so the cell raised KeyError. Plotting the RoBERTa positive
# score is an assumption about the intent - confirm which series was meant.
positive_scores = pd.DataFrame(res).T["Roberta_Positive_Review"].astype(float)

fig, ax = plt.subplots(figsize=(9, 5))
# histplot(kde=True) is the replacement for the deprecated sns.distplot.
run = sns.histplot(positive_scores, kde=True, ax=ax)
ax.set_xlabel("Roberta_Positive_Review")
fig.suptitle("ancmm", fontsize=20)
# Call show(): the bare attribute `plt.show` never rendered anything.
plt.show()

KeyError: ignored

<Figure size 648x360 with 0 Axes>

# *Review Sentiment Analysis Results*

In [None]:
# Tabulate the RoBERTa scores: one row per scored review, with the review id
# moved into a 'Movie Id' column.
# NOTE(review): the first scored row is dropped by iloc[1:] exactly as before;
# nothing in the notebook explains why - confirm that this is intended.
results_df = (
    pd.DataFrame(res)
    .T
    .iloc[1:]
    .rename_axis('Movie Id')
    .reset_index()
)


***“I'm gonna make him an offer he can't refuse.”***

In [None]:
# Load the review CSV into its own frame and display it.
# NOTE(review): this reads 'ancmm.csv', the same path that was loaded into `df`
# at the top of the notebook - confirm it is the intended file for this section.
df_godfather = pd.read_csv('ancmm.csv')
df_godfather