## Vamos a fazer analise de sentiemntos de filmes em 5 passos

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

#import os
#workdir_path = '/content/drive/My Drive' #MODIFICAR!!! #Caminho para o seu workspace
#os.chdir(workdir_path)

## Passo 1 - carregar os dados

In [2]:
import pandas as pd
df = pd.read_csv('imdb-reviews-pt-br.csv').sample(2000, random_state=42) #quando trabalhemos com n-grams, reducão do tamanho da amostra

df.head()

Unnamed: 0,id,text_en,text_pt,sentiment
12532,12534,This was unusual: a modern-day film which was ...,Isso era incomum: um filme moderno que era ult...,pos
35445,35447,Some of my old friends suggested me to watch t...,Alguns dos meus velhos amigos sugeriram que eu...,neg
20279,20281,What a pleasure. This is really a parody. Only...,Que prazer. Isto é realmente uma paródia. Some...,pos
2969,2971,There are about ten minutes about half way thr...,"Há cerca de dez minutos a meio da Strangeland,...",neg
45161,45163,"Othello, the classic Shakespearen story of lov...","Otelo, a clássica história de Shakespearen sob...",pos


In [3]:
df.sentiment.value_counts()

neg    1016
pos     984
Name: sentiment, dtype: int64

## Passo 2 - mudar os labels 

In [4]:
df.sentiment = df['sentiment'].map({'pos': 1, 'neg': 0})
df.head()

Unnamed: 0,id,text_en,text_pt,sentiment
12532,12534,This was unusual: a modern-day film which was ...,Isso era incomum: um filme moderno que era ult...,1
35445,35447,Some of my old friends suggested me to watch t...,Alguns dos meus velhos amigos sugeriram que eu...,0
20279,20281,What a pleasure. This is really a parody. Only...,Que prazer. Isto é realmente uma paródia. Some...,1
2969,2971,There are about ten minutes about half way thr...,"Há cerca de dez minutos a meio da Strangeland,...",0
45161,45163,"Othello, the classic Shakespearen story of lov...","Otelo, a clássica história de Shakespearen sob...",1


## Passo 3 - limpeza de Dados

In [5]:
# Etapas de pré-processamento de texto - remova números, letras maiúsculas e pontuação
import re
import string

alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x) #limpar os numeros entre as palavras
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower()) #retirar pontuação

df['text_pt'] = df.text_pt.map(alphanumeric).map(punc_lower)
df.head()

Unnamed: 0,id,text_en,text_pt,sentiment
12532,12534,This was unusual: a modern-day film which was ...,isso era incomum um filme moderno que era ult...,1
35445,35447,Some of my old friends suggested me to watch t...,alguns dos meus velhos amigos sugeriram que eu...,0
20279,20281,What a pleasure. This is really a parody. Only...,que prazer isto é realmente uma paródia some...,1
2969,2971,There are about ten minutes about half way thr...,há cerca de dez minutos a meio da strangeland ...,0
45161,45163,"Othello, the classic Shakespearen story of lov...",otelo a clássica história de shakespearen sob...,1


In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/LeonardoLins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Passo 4 - Representação 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer #bag of words
vect = CountVectorizer(stop_words = stopwords.words('portuguese'))
vect.fit(df.text_pt)
text_vect = vect.transform(df.text_pt)

In [8]:
dt = pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out())

In [9]:
dt.head()

Unnamed: 0,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaargh,aaawwwwnnn,aahe,aaliyah,aalox,aamir,aamr,aaron,aasman,...,últimos,úmido,única,únicas,único,únicos,úteis,útero,útil,œaberrações
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
dt.shape

(2000, 30521)

## Passo 5 - Data Mining 

Classificação - divido em base de treino e teste

In [11]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(text_vect, 
                                                 df.sentiment,
                                                 test_size = 0.3, # 30% para teste
                                                 random_state = 42 #inserir aleatoriedade na base de teste e treino
                                                )

## Utilizo Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier #RF é mais rápido, pode qq classificador (reg linear, reg log, svm, etc...) 
import numpy as np
clf = RandomForestClassifier()
clf = clf.fit(X_train, y_train)

## Coloco a acurácia

In [13]:
from sklearn.metrics import accuracy_score
y_prediction = clf.predict(X_test)

accuracia = accuracy_score(y_prediction, y_test)

print(accuracia)

0.7716666666666666


## Representação TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer #TFidf
cv_tfidf = TfidfVectorizer(stop_words="english")
cv_tfidf.fit(df.text_en)
text_vect = cv_tfidf.transform(df.text_en)

In [15]:
df_tfidf = pd.DataFrame(text_vect.toarray(), columns=cv_tfidf.get_feature_names_out())
df_tfidf.head()

Unnamed: 0,00,000,0069,00pm,00s,01,01pm,02,05,06,...,zoã,zu,zubeidaa,zucchini,zucco,zucker,zuckerman,zuniga,zurer,ãºber
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_tfidf.shape

(2000, 25818)

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(text_vect, 
                                                 df.sentiment,
                                                 test_size = 0.3, # 30% para teste
                                                 random_state = 42 #inserir aleatoriedade na base de teste e treino
                                                )

In [18]:
from sklearn.ensemble import RandomForestClassifier #RF é mais rápido, pode qq classificador (reg linear, reg log, svm, etc...) 
import numpy as np
clf = RandomForestClassifier()
clf = clf.fit(X_train, y_train)

In [19]:
from sklearn.metrics import accuracy_score
y_prediction = clf.predict(X_test)

accuracia = accuracy_score(y_prediction, y_test)

print('Acurácia:',accuracia)

Acurácia: 0.765


In [20]:
from sklearn.metrics import precision_score

precision = precision_score(y_prediction, y_test, average='weighted')

print('Precision:',precision)

Precision: 0.7667404348164888


In [21]:
from sklearn.metrics import recall_score

recall = recall_score(y_prediction, y_test, average='weighted')

print('Recall:',recall)

Recall: 0.765


In [22]:
from sklearn.metrics import f1_score

f1_score = f1_score(y_prediction, y_test, average='weighted')

print('F1 Score:', f1_score)

F1 Score: 0.7649314535731648


In [24]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test.ravel(), y_pred=y_prediction, 
                                  labels=[1,0])

cm_frame = pd.DataFrame(cm, index=[1,0], columns=[1,0],)
cm_frame.index.name = 'Actual'
cm_frame.columns.name = 'Predicted'

In [25]:
print(cm_frame)

Predicted    1    0
Actual             
1          232   81
0           60  227


## Representação N-gramas

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer #TFidf
cv_tfidf = TfidfVectorizer(stop_words="english", ngram_range = (2,2)) #cria vetores para unigramas e para bi-gramas
cv_tfidf.fit(df.text_en)
text_vect = cv_tfidf.transform(df.text_en)

In [32]:
df_tfidf = pd.DataFrame(text_vect.toarray(), columns=cv_tfidf.get_feature_names_out())
df_tfidf.head()

Unnamed: 0,00 late,00 oh,000 000,000 creates,000 creators,000 dollars,000 double,000 entertain,000 feet,000 finds,...,zucco turning,zucker brothers,zucker created,zucker lier,zuckerman elements,zuniga film,zuniga light,zuniga scrapes,zurer break,ãºber crap
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
df_tfidf.shape

(2000, 174381)

In [29]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(text_vect, 
                                                 df.sentiment,
                                                 test_size = 0.3, # 30% para teste
                                                 random_state = 42 #inserir aleatoriedade na base de teste e treino
                                                )

In [30]:
from sklearn.ensemble import RandomForestClassifier #RF é mais rápido, pode qq classificador (reg linear, reg log, svm, etc...) 
import numpy as np
clf = RandomForestClassifier()
clf = clf.fit(X_train, y_train)

In [31]:
from sklearn.metrics import accuracy_score
y_prediction = clf.predict(X_test)

accuracia = accuracy_score(y_prediction, y_test)

print('Acurácia:',accuracia)

Acurácia: 0.615
