# Classificador de Twitter

### Autor: Ricardo Robson ~rrms at cin.ufpe.br
### Baseado em: http://minerandodados.com.br/index.php/2017/03/15/analise-de-sentimentos-twitter-como-fazer/

O objetivo deste notebook é construir um classificador de sentimento de tweets, utilizando o Algoritmo Naive Bayes, classificando textos de tweets em Positivo ou Negativo.

Faremos uso de um corpus contendo pouco mais de 4000 tweets já classificados.

In [76]:
#Imports que serão usados ao longo da construção e análise do classificador
import csv
import re

import nltk
import pandas as pd
from googletrans import Translator
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import MultinomialNB

In [77]:
# Read the corpus of already-labelled tweets.
dataset = pd.read_csv('twitter_corpus.csv')
# Display the number of non-null values in each column.
dataset.count()

Text         4088
Sentiment    4088
dtype: int64

In [78]:
# Split the corpus into the tweet texts and their classes (sentiments).
tweets = dataset['Text'].values
classes = dataset['Sentiment'].values
# Display the label array.
classes

array(['neg', 'pos', 'pos', ..., 'pos', 'pos', 'pos'], dtype=object)

#### Treinando o modelo

In [79]:
# Create a CountVectorizer named vectorizer that works on word tokens.
vectorizer = CountVectorizer(analyzer="word")

# Use the vectorizer to count the frequency of every word in the list of
# tweets and store the result in freq_tweets.
freq_tweets = vectorizer.fit_transform(tweets)

# Create a Multinomial Naive Bayes object named modelo.
# This is the classification algorithm used throughout the notebook.
modelo = MultinomialNB()

# Finally, train the model on the word frequencies (freq_tweets)
# and the class of each instance.
modelo.fit(freq_tweets,classes)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Treinando o modelo com stop words

In [80]:
# Same steps as the baseline model, but this vectorizer is configured with
# stop_words='english' before the vocabulary is built.
stopwords_vectorizer = CountVectorizer(analyzer="word", stop_words='english')
stopwords_freq_tweets = stopwords_vectorizer.fit_transform(tweets)
# Train a separate Multinomial Naive Bayes model on those counts.
stopwords_modelo = MultinomialNB()
stopwords_modelo.fit(stopwords_freq_tweets,classes)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Testando modelo simples

In [81]:
# Build a small list of sample sentences for a quick manual check.
testes = ['Feeeling like shit right now. I really want to sleep, but nooo I have 3 hours of dancing and an art assignment to finish.',
         'goodbye exams, HELLO ALCOHOL TONIGHT',
         'trae is so sweet! He just bought me a new baithing suit!! Wove him']
# Vectorize the samples with the vocabulary fitted on the training tweets
# (transform, not fit_transform) and display the predicted classes.
freq_testes = vectorizer.transform(testes)
modelo.predict(freq_testes)

array(['neg', 'pos', 'pos'], 
      dtype='<U3')

#### Testando modelo com stopwords

In [82]:
# Vectorize the sample sentences with the stop-word vocabulary under a
# dedicated name, so the `freq_testes` features built for the baseline model
# are not silently replaced by features from a different vocabulary.
stopwords_freq_testes = stopwords_vectorizer.transform(testes)
# Display the classes predicted by the stop-word model.
stopwords_modelo.predict(stopwords_freq_testes)

array(['neg', 'neg', 'pos'], 
      dtype='<U3')

#### Treinando modelo com stemming

In [85]:
# Naive Bayes model trained on stemmed tweets.
stem_modelo = MultinomialNB()
stemmer = SnowballStemmer("english")
# The vectorizer is fed text that has already been stemmed below, so it needs
# no stemmer of its own. CountVectorizer's first parameter is `input`
# (keyword-only in recent scikit-learn releases), so passing the stemmer
# positionally never enabled stemming and fails on those releases.
stem_vectorizer = CountVectorizer(analyzer="word")
# Stem every whitespace-separated token and rebuild each tweet as one string.
stemeed_tweets = [
    ' '.join(stemmer.stem(word) for word in tweet.split())
    for tweet in tweets
]

# Count word frequencies over the stemmed tweets and train on them.
freq_steemed_tweets = stem_vectorizer.fit_transform(stemeed_tweets)
stem_modelo.fit(freq_steemed_tweets,classes)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Testando modelo com stemming

In [86]:
# Apply the same token-by-token stemming used for training to the samples.
stem_testes = [
    ' '.join(stemmer.stem(token) for token in sentence.split())
    for sentence in testes
]

# Vectorize with the vocabulary fitted on the stemmed tweets, then display
# the predicted classes.
stem_freq_testes = stem_vectorizer.transform(stem_testes)
stem_modelo.predict(stem_freq_testes)

array(['neg', 'pos', 'pos'], 
      dtype='<U3')

#### Treinando modelo com stopwords e stemming

In [87]:
# Combine both variations: the vectorizer is configured with
# stop_words='english' and is fitted on the already-stemmed tweets.
stem_stopwords_vectorizer = CountVectorizer(analyzer="word", stop_words='english')
stem_stopwords_freq_tweets = stem_stopwords_vectorizer.fit_transform(stemeed_tweets)
# Train a separate Multinomial Naive Bayes model on those counts.
stem_stopwords_modelo = MultinomialNB()
stem_stopwords_modelo.fit(stem_stopwords_freq_tweets,classes)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Testando modelo com stopwords e stemming

In [88]:
# Vectorize the stemmed sample sentences with the stem + stop-word vocabulary
# and display the classes predicted by the matching model.
stem_stopwords_freq_testes = stem_stopwords_vectorizer.transform(stem_testes)
stem_stopwords_modelo.predict(stem_stopwords_freq_testes)

array(['neg', 'pos', 'pos'], 
      dtype='<U3')

In [36]:
def translateTweers(file_name, sep_name):
    """Translate every tweet of a delimited file to English.

    Reads ``file_name`` with pandas using ``sep_name`` as the field separator.
    The file must provide a ``tweet`` column and a ``date`` column. Each tweet
    is sent to the googletrans ``Translator`` with ``dest='en'``, and one dot
    is printed per tweet as a progress indicator.

    A tweet whose translation raises ``ValueError`` is reported and skipped,
    so the result may be shorter than the input. A date is only stored when
    its tweet was translated, which keeps the two returned lists aligned.

    Args:
        file_name: Path of the delimited file to read.
        sep_name: Field separator passed to ``pd.read_csv``.

    Returns:
        A ``(dates, tweets_list)`` tuple of equal-length lists holding the
        ``date`` value and the translated text of each translated tweet.
    """
    print("Traduzindo '"+file_name+"'")
    tweets_csv = pd.read_csv(file_name, sep=sep_name)
    translator = Translator()
    tweets_list = []
    dates = []
    # Walk both columns in lockstep rather than looking the date up by the
    # enumerate counter, which is only correct for a default 0..n-1 index.
    for tweet, date in zip(tweets_csv['tweet'], tweets_csv['date']):
        print('.', end="")
        try:
            translated = translator.translate(tweet, dest='en')
        except ValueError:
            # Best effort: report the failure and keep going with the rest.
            print("ERRO ao tentar traduzir em '"+file_name+"'")
            continue
        tweets_list.append(translated.text)
        dates.append(date)
    return dates, tweets_list

In [37]:
# Translate the tweets read from "C9" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_C9_dates, tweets_list_C9 = translateTweers("C9",";")
tweets_list_C9_vector = vectorizer.transform(tweets_list_C9)
tweets_result_C9 = modelo.predict(tweets_list_C9_vector)

Traduzindo 'C9'
............................................................................................................................................................................................................................................................................................................

In [40]:
# Translate the tweets read from "FNC" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_FNC_dates, tweets_list_FNC = translateTweers("FNC",";")
tweets_list_FNC_vector = vectorizer.transform(tweets_list_FNC)
tweets_result_FNC = modelo.predict(tweets_list_FNC_vector)

Traduzindo 'FNC'
............................................................................................................................................................................................................................................................................................................

In [41]:
# Translate the tweets read from "LZ" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_LZ_dates, tweets_list_LZ = translateTweers("LZ",";")
tweets_list_LZ_vector = vectorizer.transform(tweets_list_LZ)
tweets_result_LZ = modelo.predict(tweets_list_LZ_vector)

Traduzindo 'LZ'
............................................................................................................................................................................................................................................................................................................

In [42]:
# Translate the tweets read from "MSF" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_MSF_dates, tweets_list_MSF = translateTweers("MSF",";")
tweets_list_MSF_vector = vectorizer.transform(tweets_list_MSF)
tweets_result_MSF = modelo.predict(tweets_list_MSF_vector)

Traduzindo 'MSF'
............................................................................................................................................................................................................................................................................................................

In [43]:
# Translate the tweets read from "RNG" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_RNG_dates, tweets_list_RNG = translateTweers("RNG",";")
tweets_list_RNG_vector = vectorizer.transform(tweets_list_RNG)
tweets_result_RNG = modelo.predict(tweets_list_RNG_vector)

Traduzindo 'RNG'
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [44]:
# Translate the tweets read from "SKT" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_SKT_dates, tweets_list_SKT = translateTweers("SKT",";")
tweets_list_SKT_vector = vectorizer.transform(tweets_list_SKT)
tweets_result_SKT = modelo.predict(tweets_list_SKT_vector)

Traduzindo 'SKT'
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [45]:
# Translate the tweets read from "SSG" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_SSG_dates, tweets_list_SSG = translateTweers("SSG",";")
tweets_list_SSG_vector = vectorizer.transform(tweets_list_SSG)
tweets_result_SSG = modelo.predict(tweets_list_SSG_vector)

Traduzindo 'SSG'
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ERRO ao tentar traduzir em 'SSG
.......................................................................................................................

In [46]:
# Translate the tweets read from "WE" (fields separated by ';'), vectorize
# them with the baseline vocabulary and predict their classes with `modelo`.
tweets_WE_dates, tweets_list_WE = translateTweers("WE",";")
tweets_list_WE_vector = vectorizer.transform(tweets_list_WE)
tweets_result_WE = modelo.predict(tweets_list_WE_vector)

Traduzindo 'WE'
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [47]:
# Sanity check: print, for each tweet set, the number of translated texts
# next to the number of predictions; the two should match.
for texts, labels in (
    (tweets_list_C9, tweets_result_C9),
    (tweets_list_FNC, tweets_result_FNC),
    (tweets_list_LZ, tweets_result_LZ),
    (tweets_list_MSF, tweets_result_MSF),
    (tweets_list_RNG, tweets_result_RNG),
    (tweets_list_SKT, tweets_result_SKT),
    (tweets_list_SSG, tweets_result_SSG),
    (tweets_list_WE, tweets_result_WE),
):
    print("\t-\t".join([str(len(texts)), str(len(labels))]))


300	-	300
300	-	300
300	-	300
300	-	300
1200	-	1200
1900	-	1900
1898	-	1898
1200	-	1200


In [48]:
# For each tweet set, print how many tweets were classified and then the
# fraction predicted 'pos' alongside its complement.
for predictions in (
    tweets_result_C9,
    tweets_result_FNC,
    tweets_result_LZ,
    tweets_result_MSF,
    tweets_result_RNG,
    tweets_result_SKT,
    tweets_result_SSG,
    tweets_result_WE,
):
    total = len(predictions)
    print(total)
    # Boolean-mask the array to keep only the positive predictions.
    positives = predictions[predictions == 'pos']
    percentual_positives = len(positives)/total
    percentual_negatives = 1-percentual_positives
    print("Pos: "+str(percentual_positives)+" Neg:"+str(percentual_negatives))

300
Pos: 0.24666666666666667 Neg:0.7533333333333333
300
Pos: 0.19333333333333333 Neg:0.8066666666666666
300
Pos: 0.25333333333333335 Neg:0.7466666666666666
300
Pos: 0.26 Neg:0.74
1200
Pos: 0.21083333333333334 Neg:0.7891666666666667
1900
Pos: 0.2231578947368421 Neg:0.7768421052631579
1898
Pos: 0.24868282402528977 Neg:0.7513171759747103
1200
Pos: 0.16833333333333333 Neg:0.8316666666666667


#### Escrevendo CSV com os resultados

In [50]:
# Write one row per classified tweet: date, translated text, predicted class.
# csv.writer quotes any field containing a comma, a quote or a line break;
# joining the fields by hand with ',' produced malformed rows for such tweets.
classified_sets = [
    (tweets_C9_dates, tweets_list_C9, tweets_result_C9),
    (tweets_FNC_dates, tweets_list_FNC, tweets_result_FNC),
    (tweets_LZ_dates, tweets_list_LZ, tweets_result_LZ),
    (tweets_MSF_dates, tweets_list_MSF, tweets_result_MSF),
    (tweets_RNG_dates, tweets_list_RNG, tweets_result_RNG),
    (tweets_SKT_dates, tweets_list_SKT, tweets_result_SKT),
    (tweets_SSG_dates, tweets_list_SSG, tweets_result_SSG),
    (tweets_WE_dates, tweets_list_WE, tweets_result_WE),
]
# The `with` block closes the file even if a write fails. newline='' leaves
# line endings to the csv module, and utf-8 keeps non-ASCII characters
# writable regardless of the platform's default encoding.
with open('tweets_classifieds.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    for set_dates, set_texts, set_labels in classified_sets:
        writer.writerows(zip(set_dates, set_texts, set_labels))
print("Finished")

Finished
