# Criando um modelo de Machine Learning

## 1. Obtendo os dados

In [126]:
# Location of the raw CSV with the IMDB comments used throughout this notebook.
corpus_url = (
    'https://raw.githubusercontent.com/'
    'henriquedezani/curso-semac2019/master/imdb-comentarios.csv'
)

In [127]:
import pandas as pd
# Download the CSV at `corpus_url` and parse it into a DataFrame.
# The file's unnamed first column is read as a regular column and shows up as
# 'Unnamed: 0' next to 'comentario' and 'classificacao' in the preview below.
data_frame = pd.read_csv(corpus_url)

In [128]:
# Preview the first rows to check which columns were loaded.
data_frame.head()

Unnamed: 0,comentario,classificacao
0,"Mais uma vez, o Sr. Costner arrumou um filme p...",0
1,Este é um exemplo do motivo pelo qual a maiori...,0
2,"Primeiro de tudo eu odeio esses raps imbecis, ...",0
3,Nem mesmo os Beatles puderam escrever músicas ...,0
4,Filmes de fotos de latão não é uma palavra apr...,0


## 2. Realizando a transformação do texto em vetores

In [129]:
import nltk
nltk.download('stopwords')    # fetch NLTK's stop-word corpus (the Portuguese list is read from it later)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [130]:
from sklearn.feature_extraction.text import CountVectorizer

In [131]:
# Portuguese stop words shipped with NLTK; the vectorizer drops these tokens.
# NOTE(review): this list presumably contains negations such as "não" — confirm,
# because discarding them can hurt a sentiment classifier.
stop_words = nltk.corpus.stopwords.words('portuguese')

# Bag-of-words extractor: lowercases the text, ignores the stop words above and
# keeps at most 1000 vocabulary terms.
vectorizer = CountVectorizer(
    max_features=1000,
    stop_words=stop_words,
    lowercase=True,
)

In [132]:
# Learn the vocabulary from the 'comentario' column and encode every row as a
# vector of term counts.
# NOTE(review): the vocabulary is fitted on the whole dataset, i.e. before the
# train/test split performed later — confirm this leakage is acceptable.
bag_of_words = vectorizer.fit_transform(data_frame['comentario'])

In [133]:
# Show the matrix summary (shape, dtype and number of stored elements).
bag_of_words

<49459x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2631261 stored elements in Compressed Sparse Row format>

In [134]:
# Dense view of the count vector produced for the first comment.
bag_of_words[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [135]:
# Encode an unseen sentence with the already-fitted vectorizer (transform only,
# the vocabulary is not refitted) and show it as a dense array.
vectorizer.transform(['Não gostei do filme. É muito desanimador.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## 3. Realizando o treinamento do modelo de Machine Learning

In [136]:
# Features are the count vectors; the target is the 'classificacao' column.
X, y = bag_of_words, data_frame['classificacao']

In [None]:
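# Optional class balancing (disabled). Recent imbalanced-learn releases expose
# `fit_resample`; the older `fit_sample` alias is no longer available.
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler()
# X_balanced, y_balanced = rus.fit_resample(X, y)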

In [None]:
# X_balanced

In [137]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation (scikit-learn's default split size);
# the fixed seed makes the partition reproducible across runs.
(X_treino, X_teste,
 y_treino, y_teste) = train_test_split(X, y, random_state=42)

In [138]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using the L-BFGS solver with an iteration cap of 500.
# `fit` returns the estimator itself, so construction and training are chained
# and the assignment leaves no repr in the cell output.
classificador = LogisticRegression(max_iter=500, solver='lbfgs').fit(X_treino, y_treino)

In [139]:
from sklearn.metrics import accuracy_score, precision_score

# Predict labels for the held-out test rows.
y_previsto = classificador.predict(X_teste)

# Score the predictions against the true test labels.
acuracia = accuracy_score(y_true=y_teste, y_pred=y_previsto)
precisao = precision_score(y_true=y_teste, y_pred=y_previsto)

# Printed as "<accuracy> <precision>".
print(acuracia, precisao)

0.8536190861302062 0.8457966373098479


In [140]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test predictions: rows are true labels, columns are
# predicted labels (scikit-learn convention).
confusion_matrix(y_true=y_teste, y_pred=y_previsto)

array([[5273,  963],
       [ 847, 5282]])

## 4. Usando o modelo treinado

In [141]:
# Turn the new text into a count vector using the fitted vectorizer:
frase = 'Filme sem noção. Enredo muito fraco. Não recomendo.'
frase_vetor_negativa = vectorizer.transform([frase]).toarray()
# Display the dense vector that will be fed to the classifier.
frase_vetor_negativa

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [147]:
# Apply the classifier to obtain the predicted label:
classificador.predict(frase_vetor_negativa)
# Alternative (disabled): class probabilities instead of the hard label.
# classificador.predict_proba(frase_vetor_negativa)

array([0])

In [145]:
# Turn the new text into a count vector using the fitted vectorizer
# (note: this rebinds `frase`, replacing the previous sentence):
frase = 'Este filme é fantástico. Tem muita animação e suspense.'
frase_vetor_positiva = vectorizer.transform([frase]).toarray()
# Predicted label for the sentence above.
classificador.predict(frase_vetor_positiva)
# Alternative (disabled): class probabilities instead of the hard label.
# classificador.predict_proba(frase_vetor_positiva)

array([1])

## 5. Salvando o modelo para consumo na API

In [148]:
import pickle

In [149]:
# Persist the fitted vectorizer so another process can rebuild the exact same
# text-to-vector mapping.
with open('vetorizador.pkl', 'wb') as file_vetorizador:
  pickle.dump(obj=vectorizer, file=file_vetorizador)

In [150]:
# Persist the trained classifier next to the vectorizer file.
with open('classificador.pkl', 'wb') as file_classificador:
  pickle.dump(obj=classificador, file=file_classificador)

## 6. Carregando modelos salvos para consumo

In [151]:
# Reload BOTH persisted artifacts, the way a consumer of the saved files would,
# and run a prediction that depends only on what was read back from disk.
# SECURITY: pickle.load can execute arbitrary code — only load pickle files you
# produced yourself (these two are written by the saving cells of this notebook).
with open('vetorizador.pkl', 'rb') as file_vetorizador:
  loaded_vectorizer = pickle.load(file_vetorizador)

with open('classificador.pkl', 'rb') as file_classificador:
  classifier = pickle.load(file_classificador)

# Vectorize with the *loaded* vectorizer instead of reusing a vector left in
# kernel memory, so the round trip of the whole pipeline is actually verified.
frase_carregada = 'Este filme é fantástico. Tem muita animação e suspense.'
result = classifier.predict(loaded_vectorizer.transform([frase_carregada]))
print(result)

[1]
