<a href="https://colab.research.google.com/github/leolellisr/npl_natural_language_processing_projects/blob/main/02_Bag_of_Words_Sentiment_Analysis/02_Bag_of_Words_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aula 2: Análise de Sentimentos usando Bag of Words e TF-IDF
Nome: 

Neste notebook iremos treinar um modelo para fazer análise de sentimento usando o dataset IMDB.

# Preparando Dados

Primeiro, fazemos download do dataset:

In [None]:
# Download the IMDB sample archive; -nc (no-clobber) skips the download if the file already exists.
!wget -nc http://files.fast.ai/data/examples/imdb_sample.tgz
# Unpack the archive; the CSV loaded below lives at imdb_sample/texts.csv.
!tar -xzf imdb_sample.tgz

--2021-08-23 23:03:52--  http://files.fast.ai/data/examples/imdb_sample.tgz
Resolving files.fast.ai (files.fast.ai)... 104.26.2.19, 104.26.3.19, 172.67.69.159, ...
Connecting to files.fast.ai (files.fast.ai)|104.26.2.19|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.fast.ai/data/examples/imdb_sample.tgz [following]
--2021-08-23 23:03:52--  https://files.fast.ai/data/examples/imdb_sample.tgz
Connecting to files.fast.ai (files.fast.ai)|104.26.2.19|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 571827 (558K) [application/x-gtar-compressed]
Saving to: ‘imdb_sample.tgz’


2021-08-23 23:03:52 (11.4 MB/s) - ‘imdb_sample.tgz’ saved [571827/571827]



Carregamos o dataset .csv usando o pandas:

In [None]:
import pandas as pd

# Read the IMDB sample into a DataFrame, report its dimensions,
# and leave the first rows as the cell's rich output.
df = pd.read_csv('imdb_sample/texts.csv')
print(df.shape)
df.head()

(1000, 3)


Unnamed: 0,label,text,is_valid
0,negative,Un-bleeping-believable! Meg Ryan doesn't even ...,False
1,positive,This is a extremely well-made film. The acting...,False
2,negative,Every once in a long while a movie will come a...,False
3,positive,Name just says it all. I watched this movie wi...,False
4,negative,This movie succeeds at being one of the most u...,False


Iremos agora dividir o dataset em conjuntos de treino e validação, usando a coluna `is_valid`:

In [None]:
# Rows flagged is_valid == False form the training set; the rest are held out.
treino = df.loc[df['is_valid'].eq(False)]
valid = df.loc[df['is_valid'].eq(True)]

print('treino.shape:', treino.shape)
print('valid.shape:', valid.shape)

treino.shape: (800, 3)
valid.shape: (200, 3)


E iremos dividir estes dois conjuntos em entrada (X) e saída desejada (Y, ground-truth) do modelo:

In [None]:
# Inputs are the raw review texts, targets are the string sentiment labels.
X_treino, Y_treino = treino['text'], treino['label']
X_valid, Y_valid = valid['text'], valid['label']

print('X_treino.head():', X_treino.head())
print('Y_treino.head():', Y_treino.head())

X_treino.head(): 0    Un-bleeping-believable! Meg Ryan doesn't even ...
1    This is a extremely well-made film. The acting...
2    Every once in a long while a movie will come a...
3    Name just says it all. I watched this movie wi...
4    This movie succeeds at being one of the most u...
Name: text, dtype: object
Y_treino.head(): 0    negative
1    positive
2    negative
3    positive
4    negative
Name: label, dtype: object


Ainda falta converter as strings "positive" e "negative" do ground-truth para valores booleanos:

In [None]:
# Translate the string labels into booleans (True = positive review).
mapeamento = dict(positive=True, negative=False)
Y_treino_bool, Y_valid_bool = Y_treino.map(mapeamento), Y_valid.map(mapeamento)
print(Y_treino_bool.head())

0    False
1     True
2    False
3     True
4    False
Name: label, dtype: bool


In [None]:
import re
from collections import Counter

In [None]:
# Creating vocabulary with training inputs

# Global switch read by the helpers and demo cells to enable diagnostic prints.
debug = True


def get_vocabulary(corpus, k):
    """Tokenize every text in `corpus` and keep the `k` most frequent tokens.

    Each text is lower-cased and split into word tokens (runs of word
    characters and apostrophes) and single punctuation tokens.

    Args:
        corpus: iterable of strings.
        k: maximum number of distinct tokens to keep (passed to
            `Counter.most_common`).

    Returns:
        A tuple `(array_corpus, vocabulary)` where `array_corpus` is the flat
        list of all tokens in corpus order and `vocabulary` is a dict mapping
        each kept token to its occurrence count, most frequent first.
    """
    array_corpus = []
    for text in corpus:
        # Getting list of words and punctuation with re.findall
        array_corpus.extend(re.findall(r"[\w']+|[.,!?><:()@*~#]", text.lower()))
    if debug:
        print(f"num. words: {len(array_corpus)}")
        # Only preview the tokens: dumping the whole corpus floods the cell output.
        print(f"words (first 20): {array_corpus[:20]}")
    return array_corpus, dict(Counter(array_corpus).most_common(k))

def vocabulary_to_index(vocabulary):
    """Build a token -> column-index mapping from a vocabulary.

    Tokens receive contiguous indices 0, 1, 2, ... in iteration order. The
    key 'unknown' is reserved for out-of-vocabulary tokens and is mapped to
    -1, i.e. the last position of an array of length `len(result)`.

    The input mapping is not modified; a new dict is returned. If the literal
    word 'unknown' is itself part of `vocabulary`, it is folded into the
    reserved slot rather than given a regular index, so the reserved slot
    never collides with another token's column.

    Args:
        vocabulary: iterable of tokens (e.g. a dict keyed by token).

    Returns:
        dict mapping every token to its index, plus 'unknown' -> -1.
    """
    new_dict = {}
    for word in vocabulary:
        if word != 'unknown':
            # len(new_dict) is the next free contiguous index.
            new_dict[word] = len(new_dict)
    new_dict['unknown'] = -1
    return new_dict
#stop_words = ["but", "or", "and", "if", "now", "the ", "the", "a", "an", "is", "are", "am", "i", "you", "he", "she", "it", "we", "they", "of", "at", "as", "just", "that", "this", "these", ",", ".", "to", "me", "my", "myself", "our", "ours", "ourselves", "your", "yours", "yourself", "yourselves", "him", "his", "himself", "her", "hers", "herself", "its", "itself", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "those", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "because", "until", "while", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

# Upper bound on the vocabulary size (most frequent tokens are kept).
k = 30000

# Tokenize the training texts, then assign a column index to every kept token.
array_corpus, vocabulary = get_vocabulary(X_treino, k)
index_voc = vocabulary_to_index(vocabulary)
if debug:
    print(index_voc)

num. words: 230822


In [None]:
# Number of entries in `vocabulary`; compare against k to see whether the cap was reached.
len(vocabulary)

17363

In [None]:
import numpy as np

In [None]:
def bool_bag_of_words(phrase, index_voc):
    """Encode `phrase` as a binary bag-of-words vector.

    The phrase is lower-cased and tokenized into words and punctuation marks.
    The entry of every token present in the phrase is set to 1, regardless of
    how often it occurs. Tokens missing from `index_voc` are counted in the
    slot addressed by `index_voc['unknown']`.

    Args:
        phrase: text to encode.
        index_voc: dict mapping token -> array index; must contain the key
            'unknown' if the phrase can contain out-of-vocabulary tokens.

    Returns:
        1-D float numpy array of length `len(index_voc)` holding 0/1 values.
    """
    bow = np.zeros(len(index_voc))
    tokens = re.findall(r"[\w']+|[.,!?><:()@*~#]", phrase.lower())
    # A set visits each distinct token once (constant-time dedupe).
    for token in set(tokens):
        ind = index_voc[token] if token in index_voc else index_voc['unknown']
        bow[ind] = 1
    return bow

# Quick smoke test of the binary encoding on a phrase containing unknown tokens.
if debug:
    print(bool_bag_of_words("This is a br test phrase desconheco about the movie and the film.", index_voc))

[1. 1. 0. ... 0. 0. 1.]


In [None]:
def hist_bag_of_words(phrase, index_voc):
    """Encode `phrase` as a bag-of-words count (histogram) vector.

    The phrase is lower-cased and tokenized into words and punctuation marks.
    Each occurrence of a token increments its entry by one. Tokens missing
    from `index_voc` are counted in the slot addressed by
    `index_voc['unknown']`.

    Args:
        phrase: text to encode.
        index_voc: dict mapping token -> array index; must contain the key
            'unknown' if the phrase can contain out-of-vocabulary tokens.

    Returns:
        1-D float numpy array of length `len(index_voc)` with token counts.
    """
    bow = np.zeros(len(index_voc))
    for token in re.findall(r"[\w']+|[.,!?><:()@*~#]", phrase.lower()):
        ind = index_voc[token] if token in index_voc else index_voc['unknown']
        bow[ind] += 1
    return bow

# Quick smoke test of the count encoding ("the" appears twice in the phrase).
if debug:
    print(hist_bag_of_words("This is a br test phrase about desconheco the movie and the film.", index_voc))

[2. 1. 0. ... 0. 0. 1.]


In [None]:
def _tokenize(text):
    """Lower-case `text` and split it into word and punctuation tokens."""
    return re.findall(r"[\w']+|[.,!?><:()@*~#]", text.lower())


def _document_frequencies(corpus):
    """Count, for every token, how many documents of `corpus` contain it."""
    doc_freq = Counter()
    for doc in corpus:
        # set(): a document contributes at most once per token.
        doc_freq.update(set(_tokenize(doc)))
    return doc_freq


def tf_idf(phrase, corpus, index_voc, doc_freq=None):
    """Encode `phrase` as a TF-IDF vector.

    For each distinct token of the phrase:
        tf  = occurrences of the token in the phrase / number of phrase tokens
        idf = log10(len(corpus) / (1 + number of corpus documents containing
              the token))
    Document membership is decided on tokens (documents are tokenized and
    lower-cased exactly like the phrase), not on raw substrings, so "the"
    does not match "there" and does match "The".

    Tokens missing from `index_voc` contribute to the slot addressed by
    `index_voc['unknown']`; contributions of several such tokens are summed.
    Values are rounded to 3 decimals. Note that idf is slightly negative for
    a token present in every document, because of the +1 in the denominator.

    Args:
        phrase: text to encode.
        corpus: sized iterable of document strings used for the idf term.
        index_voc: dict mapping token -> array index; must contain the key
            'unknown' if the phrase can contain out-of-vocabulary tokens.
        doc_freq: optional mapping token -> number of corpus documents
            containing it. When omitted it is computed from `corpus` on every
            call; pass `_document_frequencies(corpus)` to avoid re-scanning
            the corpus when encoding many phrases.

    Returns:
        1-D float numpy array of length `len(index_voc)`.
    """
    tf_idf_array = np.zeros(len(index_voc))
    tokens = _tokenize(phrase)
    if not tokens:
        # Nothing to score; also avoids a pointless corpus scan.
        return tf_idf_array
    if doc_freq is None:
        doc_freq = _document_frequencies(corpus)
    n_docs = len(corpus)
    # Iterate over distinct tokens so each tf/idf pair is computed once.
    for word, count in Counter(tokens).items():
        ind = index_voc[word] if word in index_voc else index_voc['unknown']
        tf = count / len(tokens)
        idf = np.log10(n_docs / (1 + doc_freq.get(word, 0)))
        # += so that tokens sharing the 'unknown' slot do not overwrite each other.
        tf_idf_array[ind] += tf * idf
    return np.round(tf_idf_array, 3)


# Quick smoke test of the TF-IDF encoding against the training corpus.
tf_idf_array = tf_idf(
    "This is a br test phrase about desconheco the movie and the film.",
    X_treino,
    index_voc,
)
if debug:
    print(tf_idf_array)

[ 0.    -0.     0.    ...  0.     0.     0.207]


0.0

In [None]:
# Import the SVM implementation
from sklearn import svm

# One independent linear-kernel SVM per feature representation:
# binary bag of words, count bag of words, and TF-IDF.
clf_bow_b, clf_bow_h, clf_tfidf = (svm.SVC(kernel='linear') for _ in range(3))

In [None]:
# Train with bool BoW  (14s)
# Encode every review of both splits as a binary bag-of-words vector.
X_treino_bool = [bool_bag_of_words(review, index_voc) for review in X_treino]
X_valid_bool = [bool_bag_of_words(review, index_voc) for review in X_valid]

# Fit the dedicated classifier; the fitted estimator is the cell output.
clf_bow_b.fit(X_treino_bool, Y_treino_bool)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Score the binary-BoW classifier one validation sample at a time.
# NOTE: despite its name, `error` collects a 1 for every *correct* prediction.
error = []
x = 0  # ends up equal to the number of validation samples processed
for x, y in enumerate(Y_valid_bool, start=1):
    # predict expects a 2-D array, hence the reshape to a single row
    y_pred = clf_bow_b.predict(X_valid_bool[x - 1].reshape(1, -1))
    if y_pred == y:
        error.append(1)
print(f"Acc bow_bool: {100*sum(error)/len(X_valid_bool)}%")

Acc bow_bool: 80.0%


In [None]:
# Train with hist BoW (13s)
# NOTE: the *_bool names are reused here although they now hold count vectors;
# the evaluation cell below reads these same names.
X_treino_bool = [hist_bag_of_words(review, index_voc) for review in X_treino]
X_valid_bool = [hist_bag_of_words(review, index_voc) for review in X_valid]

# Fit the dedicated classifier; the fitted estimator is the cell output.
clf_bow_h.fit(X_treino_bool, Y_treino_bool)



SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Score the count-BoW classifier one validation sample at a time.
# NOTE: despite its name, `error` collects a 1 for every *correct* prediction.
error = []
x = 0  # ends up equal to the number of validation samples processed
for x, y in enumerate(Y_valid_bool, start=1):
    # predict expects a 2-D array, hence the reshape to a single row
    y_pred = clf_bow_h.predict(X_valid_bool[x - 1].reshape(1, -1))
    if y_pred == y:
        error.append(1)
print(f"Acc bow_hist: {100*sum(error)/len(X_valid_bool)}%")

Acc bow_hist: 78.0%


In [None]:
#Train with tf_idf  (3 min 34s)
X_treino_bool = [tf_idf(phrase, X_treino, index_voc) for phrase in X_treino]
#dfX_treino_bool = pd.Series(X_treino_bool)
X_valid_bool = [tf_idf(phrase, X_treino, index_voc) for phrase in X_valid]
#df_valid_bool = pd.Series( X_treino_bool)

clf_tfidf.fit(X_treino_bool, Y_treino_bool)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Score the TF-IDF classifier one validation sample at a time.
# NOTE: despite its name, `error` collects a 1 for every *correct* prediction.
error = []
x = 0  # ends up equal to the number of validation samples processed
for x, y in enumerate(Y_valid_bool, start=1):
    # predict expects a 2-D array, hence the reshape to a single row
    y_pred = clf_tfidf.predict(X_valid_bool[x - 1].reshape(1, -1))
    if y_pred == y:
        error.append(1)
print(f"Acc tf_idf: {100*sum(error)/len(X_valid_bool)}%")

Acc tf_idf: 53.5%


In [None]:
# Inspect the TF-IDF vector of the last validation sample as a single-row 2-D array.
# Relies on `x` left over from the preceding evaluation loop (one increment per validation label).
X_valid_bool[x-1].reshape(1, -1)

array([[ 0.   , -0.   ,  0.002, ...,  0.   ,  0.   ,  0.009]])