# IMDB Sentiment Analysis

Neste notebook estamos utilizando os dados do Kaggle (https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews).

Vamos seguir os seguintes passos:

1. Importar o dataset
2. Analisar os dados
3. Preparar os dados para construir o modelo
4. Criar o dataset de teste e treino
5. Treinar o modelo utilizando diferentes algoritmos
6. Avaliar os modelos
7. Selecionar o melhor modelo para este dataset
8. Realizar o deploy do modelo para o Watson Machine Learning



In [None]:
!pip install nltk

In [None]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
import json

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model
from sklearn import metrics

In [None]:
import nltk

# Resources used further down: the English stop-word list and the Punkt
# tokenizer data behind word_tokenize. Recent NLTK releases load the tokenizer
# from 'punkt_tab' rather than 'punkt', so both are requested.
# NOTE(review): on older NLTK releases the 'punkt_tab' request is expected to
# just report an unknown package without raising - confirm for the pinned version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Importar o Dataset

In [None]:
import os
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that it
# is never stored in the notebook itself.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and replaced in IBM Cloud.
cos_api_key = os.environ.get('IBM_COS_API_KEY')
if not cos_api_key:
    raise RuntimeError(
        'Set the IBM_COS_API_KEY environment variable to an IBM Cloud Object '
        'Storage API key before running this cell.')

client_cff28c1bbcb74e9e8ddb271b5109fdc4 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=cos_api_key,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_cff28c1bbcb74e9e8ddb271b5109fdc4.get_object(Bucket='imdbsentimentanalyses-donotdelete-pr-lxalf6ovy8cv0p',Key='imdb-dataset.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)

## Analisando os dados

In [None]:
# `df` is a second name for the loaded frame, not a copy: columns added to
# `df` later also appear on `df_data_1`.
df = df_data_1
# Preview the first rows.
df.head()

In [None]:
# Per-column summary statistics of the loaded frame.
df.describe()

## Preparando os dados

Agora vamos preparar nossos dados seguindo os passos:

    1. Tokenization
    2. Remover stopwords
    3. Stemming text
    4. Juntar novamente em uma única frase
    
Como estamos trabalhando com uma entrada de texto, realizamos estas etapas para "normalizar" nossa base.

In [None]:
# Stored as a set: remove_stops tests every token of every review for
# membership, and a set lookup is constant-time where a list scan is linear.
stop_words = set(stopwords.words('english'))
# Single stemmer instance reused for every token.
porter_stemmer = PorterStemmer()

In [None]:
def identify_tokens(row):
    """Tokenize the text held in the first column of a DataFrame row.

    Args:
        row: pandas Series as passed by ``DataFrame.apply(..., axis=1)``; its
            first element (by position) is the raw text to tokenize.

    Returns:
        list of str: the tokens produced by ``word_tokenize`` for which
        ``str.isalpha()`` is true, so punctuation, numbers and mixed tokens
        are dropped.
    """
    # .iloc is explicitly positional. A plain row[0] on a Series whose index
    # holds column names relies on pandas' deprecated integer-key fallback.
    source = row.iloc[0]
    return [w for w in word_tokenize(source) if w.isalpha()]

In [None]:
def remove_stops(row):
    """Drop stop words from the token list held in the third column of a row.

    Args:
        row: pandas Series as passed by ``DataFrame.apply(..., axis=1)``; its
            third element (by position) is a list of string tokens.

    Returns:
        list of str: the tokens, in their original case and order, whose
        lower-cased form is not in the module-level ``stop_words``.
    """
    # Positional access via .iloc; plain row[2] on a label-indexed Series
    # depends on pandas' deprecated integer-key fallback.
    source_tokenization = row.iloc[2]
    # Tokens have not been lower-cased at this stage, while NLTK's English
    # stop-word list is lower-case: compare on w.lower() so that "The" or "I"
    # are removed as well.
    return [w for w in source_tokenization if w.lower() not in stop_words]

In [None]:
def stem_porter(row):
    """Stem every token in the list held in the third column of a row.

    Args:
        row: pandas Series as passed by ``DataFrame.apply(..., axis=1)``; its
            third element (by position) is a list of string tokens.

    Returns:
        list of str: ``porter_stemmer.stem(token)`` for each token, in order.
    """
    # Positional access via .iloc; plain row[2] on a label-indexed Series
    # depends on pandas' deprecated integer-key fallback.
    my_list = row.iloc[2]
    return [porter_stemmer.stem(word) for word in my_list]

In [None]:
def rejoin_words(row):
    """Join the token list held in the third column of a row into one string.

    Args:
        row: pandas Series as passed by ``DataFrame.apply(..., axis=1)``; its
            third element (by position) is a list of string tokens.

    Returns:
        str: the tokens separated by single spaces ('' for an empty list).
    """
    # Positional access via .iloc; plain row[2] on a label-indexed Series
    # depends on pandas' deprecated integer-key fallback.
    my_list = row.iloc[2]
    return " ".join(my_list)

In [None]:
def pre_processing(df):
    """Run the four text-normalisation stages over ``df``, row by row.

    Each stage prints its name, is applied to every row with
    ``df.apply(..., axis=1)`` and has its result written to a column of
    ``df`` (the frame is modified in place). The first three stages overwrite
    'text1'; the last one writes 'tidy_text'.

    The row helpers read their input by column position, so the text must be
    in the first column and 'text1' must end up as the third.

    Args:
        df: DataFrame to process.

    Returns:
        The same DataFrame object, with 'text1' and 'tidy_text' set.
    """
    stages = (
        ('Tokenization', 'text1', identify_tokens),
        ('Remove stop words', 'text1', remove_stops),
        ('Stemming', 'text1', stem_porter),
        ('Rejoin words', 'tidy_text', rejoin_words),
    )
    for message, target_column, row_step in stages:
        print(message)
        df[target_column] = df.apply(row_step, axis=1)

    return df

In [None]:
# pre_processing adds its columns to the frame it is given and returns that
# same frame, so this rebinding keeps `df` pointing at the same object.
df = pre_processing(df)

# Lower-case the re-joined text before it is used as model input.
df['tidy_text'] = df['tidy_text'].str.lower()
df.head()

## Criando o dataset de teste/treino

Vamos criar o nosso dataset de teste (30%) e treino (70%) de forma balanceada (estratificada).

In [None]:
# Model input is the normalised text column; the target is the sentiment column.
X, Y = df['tidy_text'], df['sentiment']

# Print the shape of each, input first.
for selected in (X, Y):
    print(selected.shape)

In [None]:
# 70/30 split, stratified on the label. The fixed random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42)

Os modelos de Machine Learning ou Deep Learning esperam valores numéricos como entrada "X". Como estamos trabalhando com texto, iremos aplicar TF-IDF para transformar o texto em valores numéricos.

In [None]:
# At most 2000 features, built from 2- and 3-word n-grams only
# (ngram_range=(2,3) excludes single words), with sublinear tf scaling.
# NOTE(review): confirm that leaving out unigrams is intentional.
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(2,3), sublinear_tf=True)

# Fit on the training text only; the test text is transformed with that fit.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

# Shape of the label value counts (one entry per distinct label) and of the
# training matrix.
print(Y.value_counts().shape)
print(X_train_tf.shape)

In [None]:
# Encode the sentiment labels as integers for the classifiers.
le = preprocessing.LabelEncoder()

# The encoder is fitted on the training labels and reused for the test labels.
Y_train_le = le.fit_transform(list(Y_train))
Y_test_le = le.transform(list(Y_test))

## Construindo e treinando o modelo

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# binary classifiers
# Every estimator below is fitted on the TF-IDF training matrix and then
# predicts the test matrix. Estimators that use randomness get a fixed
# random_state so the comparison between models is repeatable.

# GradientBoostingClassifier
gradient_boost = GradientBoostingClassifier(random_state=42)
gradient_boost.fit(X_train_tf, Y_train_le)
Y_predict_gradient_boost = gradient_boost.predict(X_test_tf)
print('Gradient Boosting Classifier DONE!')

# SVC
# probability=True adds an internal, randomised calibration step, hence the seed.
svc_model = SVC(gamma='auto', kernel='sigmoid', C=1.8, probability=True, random_state=42)
svc_model.fit(X_train_tf, Y_train_le)
Y_predict_svm = svc_model.predict(X_test_tf)
print('Support Vector Machine(SVM) DONE!')

# RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=10, random_state=42)
random_forest.fit(X_train_tf, Y_train_le)
Y_predict_random_forest = random_forest.predict(X_test_tf)
print('Random Forest Classifier DONE!')

# KNeighborsClassifier
k_neighbors = KNeighborsClassifier()
k_neighbors.fit(X_train_tf, Y_train_le)
Y_predict_k_neighbors = k_neighbors.predict(X_test_tf)
print('K Nearest Neighbor Classifier DONE!')

# LogisticRegression
logistic_regression = LogisticRegression(solver='lbfgs', penalty='l2', C=1.5)
logistic_regression.fit(X_train_tf, Y_train_le)
Y_predict_logistic_regression = logistic_regression.predict(X_test_tf)
print('Logistic Regression DONE!')

In [None]:
# Test-set accuracy of each fitted classifier, one line per model.
accuracy_rows = (
    ('Gradient Boosting Classifier:  ', Y_predict_gradient_boost),
    ('Support Vector Machine(SVM):   ', Y_predict_svm),
    ('Random Forest Classifier:      ', Y_predict_random_forest),
    ('K Nearest Neighbor Classifier: ', Y_predict_k_neighbors),
    ('Logistic Regression:           ', Y_predict_logistic_regression),
)
for label, predicted in accuracy_rows:
    print(label, metrics.accuracy_score(Y_test_le, predicted))

## Avaliação do modelo

### Support Vector Machines

In [None]:
# Confusion matrix of the SVM predictions on the test set, drawn as an
# annotated heatmap.
title = 'SVM'
svm_svc_conf_matrix = metrics.confusion_matrix(Y_test_le, Y_predict_svm)
heatmap_ax = sns.heatmap(svm_svc_conf_matrix, annot=True, fmt='')
heatmap_ax.set_title(title);

### Random Forest

In [None]:
# Confusion matrix of the random-forest predictions on the test set, drawn as
# an annotated heatmap.
title = 'Random Forest'
random_forest_conf_matrix = metrics.confusion_matrix(Y_test_le, Y_predict_random_forest)
heatmap_ax = sns.heatmap(random_forest_conf_matrix, annot=True, fmt='')
heatmap_ax.set_title(title);

### Logistic Regression

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
logistic_regression_conf_matrix = metrics.confusion_matrix(Y_test_le, Y_predict_logistic_regression)
# Plot the matrix computed on the line above; the cell used to draw the
# random-forest matrix under the 'Logistic Regression' title.
sns.heatmap(logistic_regression_conf_matrix, annot=True,  fmt='');
title = 'Logistic Regression'
plt.title(title);

## Resumo de classificação

In [None]:
# Classification report on the test set for the three models examined above.
report_rows = (
    ('Support vector machine(SVM):\n {}\n', Y_predict_svm),
    ('Random Forest Classifier:\n {}\n', Y_predict_random_forest),
    ('Logistic Regression:\n {}\n', Y_predict_logistic_regression),
)
for template, predicted in report_rows:
    report = metrics.classification_report(Y_test_le, predicted)
    print(template.format(report))

## Seleção do modelo final

In [None]:
# Build the training data for the final model from the whole dataset.
# Both calls RE-FIT the existing `tfidf` and `le` objects in place, so from
# here on `tfidf` reflects the full corpus. Matrices produced before this cell
# (X_train_tf, X_test_tf) were built with the earlier, train-only fit.
X_train_final = tfidf.fit_transform(X)
Y_train_final = le.fit_transform(list(Y))

print(X_train_final.shape)

In [None]:
# Final model: logistic regression with the same hyper-parameters as in the
# comparison above, fitted on the full dataset.
lrc = LogisticRegression(solver='lbfgs', penalty='l2', C=1.5)
lrc.fit(X_train_final, Y_train_final)

## Deploy para o Watson Machine Learning



Para se autenticar no Watson Machine Learning no IBM Cloud, você precisa da `api_key` e da `location` (URL) do seu serviço.

Podemos utilizar o [IBM Cloud CLI](https://cloud.ibm.com/docs/cli/index.html) ou diretamente pelo portal do IBM Cloud.

Usando o IBM Cloud CLI:

```
ibmcloud login
ibmcloud iam api-key-create API_KEY_NAME
```

NOTE: Você pode obter a URL do serviço na seção [Endpoint URLs da documentação do Watson Machine Learning](https://cloud.ibm.com/apidocs/machine-learning).

In [None]:
# Placeholders: replace with your own values before running the cells below.
# Do not save a real API key in a notebook that will be shared.
api_key = 'YOUR_API_KEY'
# Used as the service "url" in the credentials dictionary.
location = 'YOUR_LOCATION'

In [None]:
# Credentials dictionary later passed to APIClient.
wml_credentials = {
    "apikey": api_key,
    "url": location
}

### Instalando a biblioteca do Watson Machine Learning

NOTE: Documentação pode ser encontrada [aqui](http://ibm-wml-api-pyclient.mybluemix.net/)

In [None]:
!pip install -U ibm-watson-machine-learning

In [None]:
from ibm_watson_machine_learning import APIClient

# Client built from the credentials dictionary; every WML call below goes
# through it.
client = APIClient(wml_credentials)
print(client.version)

### Criando nosso espaço de implementação

Primeiro, crie um espaço de implementação que será usado para fazer o deploy do nosso modelo. Caso ainda não tenha criado siga os passos abaixo.

    Clique em Novo Espaço de Implementação
    Crie um novo espaço vazio
    Selecione Cloud Object Storage
    Selecione Watson Machine Learning e clique em Criar
    Copie space_id e cole abaixo

In [None]:
# Placeholder: replace with the id of your deployment space.
space_id = 'YOUR_SPACE_ID'

In [None]:
# List spaces visible to this client (limit=10), e.g. to look up a space id.
client.spaces.list(limit=10)

In [None]:
# Make `space_id` the client's default space.
# NOTE(review): presumably the repository and deployment calls below then
# operate in this space - confirm against the WML client docs.
client.set.default_space(space_id)

In [None]:
# Look up the id of the "default_py3.7" software specification.
# NOTE(review): the spec name and the 'scikit-learn_0.23' model type are
# pinned strings; confirm they match the scikit-learn version used to train
# `lrc` and are still offered by the service.
sofware_spec_uid = client.software_specifications.get_id_by_name("default_py3.7")
metadata = {
            client.repository.ModelMetaNames.NAME: 'Logistic Regression model to predict IMDB reviews',
            client.repository.ModelMetaNames.TYPE: 'scikit-learn_0.23',
            client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: sofware_spec_uid
}

# Only the classifier `lrc` is stored; the TF-IDF vectorizer is not part of
# the stored model, so callers must send already-vectorized features.
published_model = client.repository.store_model(
    model=lrc,
    meta_props=metadata)

In [None]:
# Get the stored model's uid, fetch its details and print them as indented JSON.
published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)
print(json.dumps(model_details, indent=2))

In [None]:
# List the models held in the repository.
client.repository.list_models()

In [None]:
# client.repository.delete('GUID of stored model')

In [None]:
# Deployment configuration. This rebinds `metadata`, which previously held the
# model metadata.
metadata = {
    client.deployments.ConfigurationMetaNames.NAME: "Deployment of IMDB reviews",
    client.deployments.ConfigurationMetaNames.ONLINE: {}
}

# Create the deployment for the stored model.
created_deployment = client.deployments.create(published_model_uid, meta_props=metadata)

In [None]:
# Get deployment UID and show details on the deployment
# `deployment_uid` is reused later when scoring.
deployment_uid = client.deployments.get_uid(created_deployment)
client.deployments.get_details(deployment_uid)

In [None]:
# List the existing deployments.
client.deployments.list()

In [None]:
#client.deployments.delete('GUID of deployed model')

## Avaliando o modelo

Agora vamos enviar dados para o web service usando o método score do WML.

In [None]:
# get scoring end point
# Printed for reference only; scoring below goes through the client using
# `deployment_uid`, not this URL.
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
print(scoring_endpoint)

In [None]:
# add some test data
# `tfidf` was re-fitted on the full corpus before the deployed model was
# trained, so X_test_tf (built with the earlier, train-only fit) does not line
# up with the feature columns that model expects. Re-transform the test text
# with the current vectorizer instead.
# .tolist() turns the dense matrix into plain nested lists for the JSON payload.
# NOTE(review): X_test was part of the data the final model was trained on, so
# the resulting report is a deployment smoke test, not an unbiased evaluation.
scoring_payload = {"input_data": [
    {'values': tfidf.transform(X_test).toarray().tolist()
    }]}

In [None]:
# score the model
# Send the payload to the deployment and print the full response as JSON.
# NOTE(review): the response holds one entry per scored row, so this print can
# be very large for a big payload.
predictions = client.deployments.score(deployment_uid, scoring_payload)
print('prediction',json.dumps(predictions, indent=2))

In [None]:
# The first element of every returned row is taken as the predicted label
# (presumably each row is [label, probabilities] - confirm against the response).
scored_rows = predictions['predictions'][0]['values']
Y_predict_final_model = [scored_row[0] for scored_row in scored_rows]

# Compare the web-service predictions with the encoded test labels.
final_report = metrics.classification_report(Y_test_le, Y_predict_final_model)
print('Final Model WML:\n {}\n'.format(final_report))