## Processamento de Linguagem Natural para Análise de Críticas de Filmes

Database em https://www.kaggle.com/luisfredgs/imdb-ptbr

In [None]:
import pandas as pd

In [None]:
# Load the review dataset from the CSV file (relative path) into a DataFrame.
review = pd.read_csv(filepath_or_buffer="imdb-reviews-pt-br.csv")
# Preview the first five rows.
review.head(n=5)

In [None]:
# Encode the sentiment labels as integers: "neg" -> 0, "pos" -> 1.
# Series.map produces an integer column directly. replace() on a string column
# relies on implicit downcasting that pandas has deprecated, which would leave an
# object-dtype column. Any label other than "neg"/"pos" becomes NaN here.
change = review["sentiment"].map({"neg": 0, "pos": 1})
change.head()

In [None]:
# Attach the encoded labels to the frame as a new "sentimentBIN" column.
review["sentimentBIN"] = change
# Preview the first five rows, now including the new column.
review.head(n=5)

In [None]:
# Class-balance check before training: count how many rows carry each label.
sentimentCounts = review["sentimentBIN"].value_counts()
print(sentimentCounts)

### WORD CLOUD

Biblioteca em https://github.com/amueller/word_cloud

In [None]:
!pip install wordcloud
!python -m pip install -U matplotlib

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Positive reviews only, with their texts joined into one space-separated string.
posReview = review[review["sentiment"] == "pos"]
wordsPos = " ".join(posReview["text_pt"])

In [None]:
# Build the word cloud from the positive-review text.
cloudPos = WordCloud(
    collocations=False,
    max_font_size=110,
    height=500,
    width=800,
).generate(wordsPos)

In [None]:
# Show the positive word cloud on a wide figure with the axes hidden.
figPos, axPos = plt.subplots(figsize=(30, 7))
axPos.imshow(cloudPos, interpolation="bilinear")
axPos.axis("off")
plt.show()

In [None]:
# Negative reviews only, with their texts joined into one space-separated string.
negReview = review[review["sentiment"] == "neg"]
wordsNeg = " ".join(negReview["text_pt"])

In [None]:
# Build the word cloud from the negative-review text.
cloudNeg = WordCloud(
    collocations=False,
    max_font_size=110,
    height=500,
    width=800,
).generate(wordsNeg)

In [None]:
# Show the negative word cloud on a wide figure with the axes hidden.
figNeg, axNeg = plt.subplots(figsize=(30, 7))
axNeg.imshow(cloudNeg, interpolation="bilinear")
axNeg.axis("off")
plt.show()

In [None]:
!pip install -U nltk

In [None]:
# Download NLTK data packages.
# NOTE(review): 'all' appears to request the complete NLTK data collection. The cells
# visible here only use WhitespaceTokenizer and FreqDist — confirm which packages the
# rest of the notebook actually needs and consider narrowing this download.
import nltk
nltk.download('all')

In [None]:
from nltk import tokenize

In [None]:
# Count how often each whitespace-delimited token appears across all reviews.
spaceToken = tokenize.WhitespaceTokenizer()
allWords = " ".join(review["text_pt"])
frequency = nltk.FreqDist(spaceToken.tokenize(allWords))

# One row per distinct token: the token itself and its count.
frequencyDF = pd.DataFrame(
    list(frequency.items()),
    columns=["Palavras", "Frequencia"],
)
frequencyDF.head()

In [None]:
# Keep the ten rows with the highest counts.
frequencyDFtop10 = frequencyDF.nlargest(n=10, columns="Frequencia")
frequencyDFtop10

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns

# Bar chart of the ten most frequent tokens, with the y axis labelled "Contagem".
figTop10, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=frequencyDFtop10, x="Palavras", y="Frequencia", ax=ax)
ax.set_ylabel("Contagem")
plt.show()

### BAG OF WORDS
Neste modelo, o texto (uma frase ou documento) é representado como um multiconjunto de suas palavras (o "saco"), desconsiderando a estrutura gramatical e até mesmo a ordenação delas, mas mantendo sua multiplicidade.


In [None]:
# implementando bag of words (fazendo dicionário de palavras)
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Vectorize the review texts into a bag-of-words matrix, capping the vocabulary
# at 50 terms (max_features=50).
model = CountVectorizer(max_features=50)

bagOfWords = model.fit_transform(review["text_pt"])
# get_feature_names() was removed from scikit-learn (1.2);
# get_feature_names_out() is its replacement and returns the vocabulary terms.
model.get_feature_names_out()

In [None]:
# View the sparse bag-of-words matrix as a table with one column per vocabulary term.
# pd.SparseDataFrame was removed in pandas 1.0; DataFrame.sparse.from_spmatrix builds
# a DataFrame with sparse columns straight from the SciPy sparse matrix.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
dictionary = pd.DataFrame.sparse.from_spmatrix(
    bagOfWords,
    columns=model.get_feature_names_out(),
)
dictionary.head()

In [None]:
# separando dados para treino e teste (75% treino, 25% teste)
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
train, test, trainClass, testClass = train_test_split(
    bagOfWords,
    review["sentimentBIN"],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Show the dimensions of the training and test matrices, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier on the training split.
logisticRegression = LogisticRegression(solver='lbfgs').fit(train, trainClass)

# Per-class probabilities for every row of the test split.
prevision = logisticRegression.predict_proba(test)
print(prevision)

In [None]:
# Turn probabilities into boolean predictions (neg -> False, pos -> True):
# take the second probability column and threshold it at 0.5.
positiveProba = prevision[:, 1]
previsionBool = positiveProba >= 0.5
print(previsionBool)

In [None]:
import numpy as np

In [None]:
# Cast the boolean predictions to 0/1 integers.
# The np.int alias was removed from NumPy (1.24); the builtin int is the
# equivalent dtype argument.
previsionInt = previsionBool.astype(int)
print(previsionInt)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Compare the predicted labels with the true test labels and report the accuracy.
accuracyTest = accuracy_score(y_true=testClass, y_pred=previsionInt)
print("Taxa de acerto:", accuracyTest)