In [23]:
from sklearn.naive_bayes import MultinomialNB
from tabulate import tabulate

<a href="https://colab.research.google.com/github/adolfoguimaraes/datascience/blob/main/code/07_laboratorio_classificacao_de_texto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes - Análise de Sentimento

Para essa atividade vamos gerar um modelo de análise de sentimento em inglês baseado em reviews retirados de 3 sites: Amazon, IMDb e Yelp. Essa base está disponível [neste link](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). Mais detalhes podem ser encontrados no link ou no artigo de referência: *'From Group to Individual Labels using Deep Features', Kotzias et al., KDD 2015*.

A base possui textos e, para cada texto, um sentimento sobre o conteúdo abordado. Os sentimentos podem ser positivos (1) ou negativos (0). Foram coletados, em média, 500 textos para cada sentimento em cada base.

A atividade consiste em construir um modelo de aprendizagem para análise de sentimento em inglês. O primeiro passo é carregar o dataset de forma apropriada e, em seguida, construir a matriz de entrada para o nosso algoritmo. As etapas do exercício, juntamente com o que deve ser feito em cada uma, estão descritas a seguir.

## Carregando o Dataset

In [24]:
import pandas as pd

# The three review files are read with the same options: tab-separated,
# no header row, and the two columns named 'Text' and 'Sentiment'.
_read_options = dict(sep="\t", header=None, names=['Text', 'Sentiment'])

df_amazon = pd.read_csv(
    "../datasets/analise_de_sentimento/amazon_cells_labelled.txt", **_read_options
)
df_imdb = pd.read_csv(
    "../datasets/analise_de_sentimento/imdb_labelled.txt", **_read_options
)
df_yelp = pd.read_csv(
    "../datasets/analise_de_sentimento/yelp_labelled.txt", **_read_options
)

In [25]:
# Display the IMDb frame to inspect the loaded 'Text' and 'Sentiment' columns.
df_imdb

Unnamed: 0,Text,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [26]:
# Report the shape tuple of each source frame, one line per source.
for label, frame in (("Amazon", df_amazon), ("IMDb", df_imdb), ("Yelp", df_yelp)):
    print("{} dataset {}".format(label, frame.shape))

Amazon dataset (1000, 2)
IMDb dataset (748, 2)
Yelp dataset (1000, 2)


In [27]:
# Stack the three sources into a single corpus.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# source keeps its own 0-based labels, so one label would refer to up to three
# different reviews and label-based lookups would return several rows.
join_frames = [df_amazon, df_imdb, df_yelp]

df_final_dataset = pd.concat(join_frames, ignore_index=True)

df_final_dataset.shape

(2748, 2)

## Construindo a base de dados

A base de dados possui 2748 textos que foram classificados em dois sentimentos: negativo (0) e positivo (1). Construa uma base de dados apropriada para os testes. Divida a base em treino e teste (80% para treino e 20% para teste). A base de treinamento será utilizada para a construção do modelo e a de teste para o teste final do modelo construído.

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final test; random_state pins the shuffle
# so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_final_dataset['Text'],
    df_final_dataset['Sentiment'],
    test_size=0.2,
    random_state=1,
)

In [29]:
# Row counts for the full corpus and for each side of the split.
for description, rows in (
    ('total', df_final_dataset),
    ('training', X_train),
    ('test', X_test),
):
    print(f'Number of rows in the {description} set: {rows.shape[0]}')

Number of rows in the total set: 2748
Number of rows in the training set: 2198
Number of rows in the test set: 550


## Construindo o Bag of Words

Construa o Bag of Words para a base de treinamento. Para isso, utilize o método CountVectorizer como mostrado a seguir.

In [84]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer created with no arguments; the bare name below shows its repr.
count_vector = CountVectorizer()
count_vector

CountVectorizer()

O `CountVectorizer` permite construir o array que serve de entrada para os modelos de aprendizagem. O código a seguir constrói esse array a partir da base de treinamento e exibe suas dimensões.

In [97]:
# Fit the vectorizer on the training texts and encode them in the same call.
training_data = count_vector.fit_transform(X_train)
training_data.shape

(2198, 4581)

Foi gerada uma matriz de 2198 linhas (os textos) e 4581 colunas (as palavras). Devemos fazer o mesmo com a base de teste.

In [98]:
# transform only (no fit): the test texts are encoded with the vectorizer that
# was fitted on X_train, so both matrices share the same columns.
testing_data = count_vector.transform(X_test)
testing_data.shape

(550, 4581)

Foi gerada uma matriz com 550 linhas e 4581 colunas também. `training_data` e `testing_data` são as estruturas que devem ser utilizadas no modelo Naive Bayes.

In [99]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and is what the later cells of
# this notebook already call. The result is used as DataFrame column labels.
feature_names = count_vector.get_feature_names_out()



In [100]:
# Densify the matrix that was produced when the vectorizer was fitted instead
# of calling fit_transform a second time: a refit repeats the work and rebuilds
# the vocabulary after feature_names has already been read from it.
doc_array_train = training_data.toarray()
doc_array_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [101]:
# Training word counts as a DataFrame: one row per text, one column per
# vocabulary term taken from feature_names.
frequency_matrix_train = pd.DataFrame(doc_array_train, columns=feature_names)
frequency_matrix_train

Unnamed: 0,00,10,100,11,12,13,15,15pm,17,18,...,youtube,yukon,yum,yummy,yun,z500a,zero,zillion,zombie,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2194,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
# Densify the test matrix that was already computed as testing_data rather
# than running the vectorizer over X_test a second time.
doc_array_test = testing_data.toarray()
doc_array_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [103]:
# Test word counts as a DataFrame with the same column labels as the
# training frame (both use feature_names).
frequency_matrix_test = pd.DataFrame(doc_array_test, columns=feature_names)
frequency_matrix_test

Unnamed: 0,00,10,100,11,12,13,15,15pm,17,18,...,youtube,yukon,yum,yummy,yun,z500a,zero,zillion,zombie,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
546,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
547,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
548,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
mult_naive_bayes = MultinomialNB()
mult_naive_bayes.fit(frequency_matrix_train, y_train)

MultinomialNB()

In [82]:
# Predicted sentiment label for every row of the test word-count frame.
mult_naive_bayes.predict(frequency_matrix_test)

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,

In [93]:
# Score computed on the same rows the model was fitted on, so it describes fit
# to the training data rather than generalization.
# NOTE(review): the held-out score would use frequency_matrix_test / y_test.
mult_naive_bayes.score(frequency_matrix_train, y_train)

0.9440400363967243

## Atividade 1

Implemente um modelo de Machine Learning para a base gerada. Utilize validação cruzada de 5 folds na base de treinamento e, em seguida, teste o modelo gerado na base de testes. Reporte as métricas de avaliação estudadas resultantes da validação cruzada e da base de testes. Teste pelo menos 3 algoritmos disponíveis no `scikit-learn`.

In [184]:
def cross_validate_model(model, value_X, value_y, cv=5,
                         scoring=('accuracy', 'precision', 'recall', 'f1')):
    """Cross-validate ``model`` and return the mean of every reported quantity.

    Parameters
    ----------
    model
        Estimator handed unchanged to ``cross_validate``.
    value_X
        Feature matrix handed unchanged to ``cross_validate``.
    value_y
        Target labels handed unchanged to ``cross_validate``.
    cv
        Forwarded as ``cross_validate``'s ``cv`` argument. Defaults to 5, the
        value that used to be hard-coded.
    scoring : str or sequence of str
        Scorer names. Each name is looked up in the result under the key
        ``'test_<name>'``. Defaults to the four metrics that used to be
        hard-coded, in the same order.

    Returns
    -------
    list
        ``[mean fit_time, mean score_time, mean of each metric...]`` with the
        metrics in the order given by ``scoring``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(scoring, str):
        scoring = (scoring,)
    metric_names = tuple(scoring)

    cross_result = cross_validate(model, value_X, value_y, scoring=metric_names, cv=cv)

    # Timings first, then one entry per requested metric.
    result_keys = ['fit_time', 'score_time']
    result_keys.extend('test_' + name for name in metric_names)

    return [cross_result[key].mean() for key in result_keys]



In [324]:
# Fifth text of the combined corpus, selected by position.
df_final_dataset.Text.iloc[4]

'The mic is great.'

In [200]:
# Bag of words over every row of the combined corpus, wrapped in a DataFrame
# whose columns are the vocabulary terms.
# NOTE(review): this uses all rows, while the exercise text asks for
# cross-validation on the training split only - confirm this is intended.
count_vect = CountVectorizer()
count_matrix = count_vect.fit_transform(df_final_dataset.Text.tolist())
count_array = count_matrix.toarray()

df = pd.DataFrame(count_array, columns=count_vect.get_feature_names_out())
df

Unnamed: 0,00,10,100,11,12,13,15,15g,15pm,17,...,yucky,yukon,yum,yummy,yun,z500a,zero,zillion,zombie,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [150]:
# New classifier fitted on the full-corpus word counts (df) and all labels.
mult_naive_bayes = MultinomialNB()
mult_naive_bayes.fit(df, df_final_dataset.Sentiment)

MultinomialNB()

In [172]:
# Predictions for the same rows that were used to fit the model.
mult_naive_bayes.predict(df)

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [146]:
# Score on the same rows the model was fitted on (no held-out data involved).
mult_naive_bayes.score(df, df_final_dataset.Sentiment)

0.9406841339155749

In [185]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

# Reuses the mult_naive_bayes object created in an earlier cell; building a
# fresh one here is left disabled:
# mult_naive_bayes = MultinomialNB()
cross_validate_model(mult_naive_bayes, df, df_final_dataset.Sentiment)

[0.20549969673156737,
 0.031550264358520506,
 0.7969524755754265,
 0.8183897864579229,
 0.7691816222112562,
 0.7914751013987691]

Multinomial Naive Bayes

In [194]:
# Fresh, unfitted classifier for the cross-validation runs below.
mult_naive_bayes = MultinomialNB()

In [195]:
# Per-fold results (not averaged) for the same 5-fold, four-metric setup that
# cross_validate_model uses.
scores = cross_validate(mult_naive_bayes, df, df_final_dataset.Sentiment,
                        cv=5, scoring=('accuracy', 'precision', 'recall', 'f1'))
scores

{'fit_time': array([0.21041417, 0.26814747, 0.18886781, 0.17976069, 0.179739  ]),
 'score_time': array([0.03114295, 0.03314376, 0.03105354, 0.03205204, 0.0329783 ]),
 'test_accuracy': array([0.82727273, 0.78      , 0.75818182, 0.81785064, 0.80145719]),
 'test_precision': array([0.80536913, 0.80952381, 0.80851064, 0.86419753, 0.80434783]),
 'test_recall': array([0.86642599, 0.73646209, 0.68345324, 0.75812274, 0.80144404]),
 'test_f1': array([0.83478261, 0.77126654, 0.74074074, 0.80769231, 0.80289331])}

In [201]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

# Reuses the mult_naive_bayes object created in an earlier cell; building a
# fresh one here is left disabled:
# mult_naive_bayes = MultinomialNB()
cross_validate_model(mult_naive_bayes, df, df_final_dataset.Sentiment)

[0.1905799388885498,
 0.029819393157958986,
 0.7969524755754265,
 0.8183897864579229,
 0.7691816222112562,
 0.7914751013987691]

Vizinhos mais próximos

In [177]:
from sklearn.neighbors import KNeighborsClassifier

# One result row per neighbourhood size k = 1..5.
all_knn_results = []
for k in range(1, 6):
    # Cross-validate a k-nearest-neighbours classifier with this k
    knn_ = KNeighborsClassifier(n_neighbors=k)
    results_ = cross_validate_model(knn_, df, df_final_dataset.Sentiment)

    # Row layout: k first, then the averaged timings and metrics
    temp_list = [k, *results_]
    all_knn_results.append(temp_list)

In [178]:
# Render the KNN rows as a text table; the headers follow the column order of
# each row (k, then the values returned by cross_validate_model).
print(tabulate(all_knn_results, headers=['k','Tempo Treino','Tempo Teste','Acurácia','Precisão','Recall','F1']))

  k    Tempo Treino    Tempo Teste    Acurácia    Precisão    Recall        F1
---  --------------  -------------  ----------  ----------  --------  --------
  1       0.0583294       0.196257    0.66521     0.646301  0.741057  0.689618
  2       0.0515591       0.182186    0.65903     0.727584  0.513786  0.601225
  3       0.0571264       0.184617    0.660845    0.653605  0.694886  0.672261
  4       0.0502259       0.184425    0.657199    0.711074  0.536862  0.610899
  5       0.0469357       0.180596    0.654653    0.648312  0.68764   0.666962


Árvores de decisão

In [182]:
from sklearn.tree import DecisionTreeClassifier

# One result row per maximum tree depth 1..5.
all_tree_results = []
for depth_ in range(1, 6):
    # random_state is fixed so repeated runs build the same tree
    tree_ = DecisionTreeClassifier(max_depth=depth_, random_state=42)
    results_ = cross_validate_model(tree_, df, df_final_dataset.Sentiment)

    # Row layout: depth first, then the averaged timings and metrics
    temp_list = [depth_, *results_]
    all_tree_results.append(temp_list)

In [183]:
# Render the decision-tree rows as a text table; the headers follow the column
# order of each row (depth, then the values returned by cross_validate_model).
print(tabulate(all_tree_results, headers=['Profundidade','Tempo Treino','Tempo Teste','Acurácia','Precisão','Recall','F1']))

  Profundidade    Tempo Treino    Tempo Teste    Acurácia    Precisão    Recall        F1
--------------  --------------  -------------  ----------  ----------  --------  --------
             1        0.195301      0.028561     0.557132    0.932323  0.129917  0.22463
             2        0.304166      0.0287807    0.570239    0.541576  0.968254  0.694548
             3        0.384535      0.0310672    0.598618    0.884275  0.232387  0.365428
             4        0.423998      0.0312823    0.591346    0.81517   0.369553  0.418206
             5        0.465517      0.0312393    0.615715    0.85278   0.306727  0.436035


## Atividade 2

Além dos parâmetros dos algoritmos em si, podemos ajustar parâmetros do pré-processamento. Quando utilizamos a classe `CountVectorizer`, podemos utilizar uma série de técnicas de pré-processamento para melhorar os dados de entrada do modelo.

Pesquise sobre o `CountVectorizer` e modifique os parâmetros `default` para gerar dados melhores e, consequentemente, um modelo melhor do que o construído na Atividade 1. Reporte seus resultados na validação cruzada e nos testes. Reavalie os algoritmos e parâmetros testados na etapa 1 para verificar o que muda ao mudar a vetorização.

### Ajustando

In [309]:
# Start again from a default vectorizer; the next cell adjusts its parameters.
count_vect = CountVectorizer()

In [320]:
# palavras de parada
# Stop words: terms the vectorizer leaves out of the vocabulary.
# set_params is the estimator API for changing a parameter after construction;
# unlike a plain attribute assignment it raises on an invalid parameter name,
# so a typo cannot silently create an unused attribute.
count_vect.set_params(
    stop_words=['is', 'to', 'my', 'in', 'are', 'a', 'and', 'it', 'for', 'the',
                'movie', 'but', 'or', 'when']
)

# Other settings that were tried and left disabled:
# max_df
# count_vect.max_df = 0.90

# made the results worse: max_features
# count_vect.max_features = 3

# made the results worse: min_df
# count_vect.min_df = 1

# Rebuild the full-corpus word-count frame with the adjusted vectorizer.
count_matrix = count_vect.fit_transform(list(df_final_dataset.Text))
count_array = count_matrix.toarray()

df = pd.DataFrame(data=count_array, columns=count_vect.get_feature_names_out())
df

Unnamed: 0,00,10,100,11,12,13,15,15g,15pm,17,...,yucky,yukon,yum,yummy,yun,z500a,zero,zillion,zombie,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [321]:
# Averaged cross-validation results for Naive Bayes on the rebuilt df.
mult_naive_bayes = MultinomialNB()
cross_validate_model(mult_naive_bayes, df, df_final_dataset.Sentiment)

[0.2145707130432129,
 0.03179211616516113,
 0.8009584368272893,
 0.8246797892242368,
 0.7699062410721244,
 0.7948433579592666]

In [198]:
# Per-fold results (not averaged) for the same 5-fold, four-metric setup that
# cross_validate_model uses.
scores = cross_validate(mult_naive_bayes, df, df_final_dataset.Sentiment,
                        cv=5, scoring=('accuracy', 'precision', 'recall', 'f1'))
scores

{'fit_time': array([0.21456194, 0.25610662, 0.1740284 , 0.17234135, 0.18745542]),
 'score_time': array([0.03052974, 0.03100443, 0.03209257, 0.02405047, 0.03124332]),
 'test_accuracy': array([0.83454545, 0.78727273, 0.76181818, 0.81420765, 0.7996357 ]),
 'test_precision': array([0.81418919, 0.81746032, 0.8209607 , 0.85714286, 0.79928315]),
 'test_recall': array([0.8700361 , 0.74368231, 0.67625899, 0.75812274, 0.80505415]),
 'test_f1': array([0.84118674, 0.77882798, 0.74161736, 0.8045977 , 0.80215827])}

In [228]:
# KNN with 15 neighbours, a value outside the 1..5 range looped over earlier,
# cross-validated on the current df.
knn_ = KNeighborsClassifier(n_neighbors=15)
cross_validate_model(knn_, df, df_final_dataset.Sentiment)

[0.04374442100524902,
 0.1912259578704834,
 0.6699605895015731,
 0.6595761021908914,
 0.7222761862711996,
 0.6875606786453362]