## Bibliotecas

In [4]:
import fasttext
import pandas as pd
import os
import numpy as np
import re
from unidecode import unidecode
from string import punctuation

## Leitura e tratamento da base Emocoes

In [5]:
# Load the raw Emocoes dataset (semicolon-delimited, ISO-8859-1 encoded).
df = pd.read_csv(
    '../dataset/Emocoes.csv',
    sep=";",
    encoding='iso8859-1',
)

In [6]:
# Strip the square brackets from every column name, then discard the
# unnamed columns 15 to 20.
df.rename(
    columns=lambda name: name.replace('[', '').replace(']', ''),
    inplace=True,
)
df.drop(
    columns=[
        'Unnamed: 15',
        'Unnamed: 16',
        'Unnamed: 17',
        'Unnamed: 18',
        'Unnamed: 19',
        'Unnamed: 20',
    ],
    inplace=True,
)

In [7]:
# Discard noise in hashtagsearch: keep only rows whose hashtag value occurs
# more than 4 times in the column (rows with a missing hashtag map to NaN and
# are dropped as well).
# .copy() makes the filtered frame independent of the original one, so the
# column assignments in the following cells do not raise
# SettingWithCopyWarning.
df = df[df['hashtagsearch'].map(df['hashtagsearch'].value_counts()) > 4].copy()

In [8]:
# Clean the hashtagsearch values: remove '#', ',' and '"' characters.
df['hashtagsearch'] = df['hashtagsearch'].map(
    lambda tag: tag.replace('#', '').replace(',', '').replace('"', '')
)

In [9]:
# Sanity check: distribution of the cleaned hashtagsearch values.
df.loc[:, 'hashtagsearch'].value_counts()

Triste      2731
Amor        2720
Feliz       1948
Chateado    1676
Inveja       858
Ironia       749
Raiva        510
Name: hashtagsearch, dtype: int64

In [10]:
# Prefix every class with "__label__" (the marker fastText uses to recognise
# labels). The startswith guard keeps the cell idempotent: re-running it no
# longer produces "__label____label__<class>".
df['hashtagsearch'] = df['hashtagsearch'].apply(
    lambda x: x if x.startswith("__label__") else "__label__" + x
)

In [11]:
# Class distribution after adding the "__label__" prefix.
df.loc[:, 'hashtagsearch'].value_counts()

__label__Triste      2731
__label__Amor        2720
__label__Feliz       1948
__label__Chateado    1676
__label__Inveja       858
__label__Ironia       749
__label__Raiva        510
Name: hashtagsearch, dtype: int64

In [12]:
def clean_text(text):
    """Normalize a raw tweet into lowercase ASCII text without noise tokens.

    Steps, in order: lowercase, drop @mentions, transliterate to ASCII with
    ``unidecode``, drop HTML-like tags, drop curly braces, drop digits, drop
    URLs, drop ASCII punctuation, and finally collapse every run of
    whitespace (including newlines and tabs) into a single space.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for empty CSV cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, with no leading/trailing whitespace and no line
        breaks; ``''`` for non-string input.
    """
    # Missing cells arrive as float NaN / None and have no .lower(); treat
    # them as empty instead of raising AttributeError in the middle of .apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r'@[^\s]+', '', text)
    text = unidecode(text)
    text = re.sub(r'<[^<]+?>', '', text)
    text = text.replace('{', '').replace('}', '')
    text = ''.join(c for c in text if not c.isdigit())
    # URLs must be removed before punctuation, otherwise '://' and '.' are
    # stripped first and the URL can no longer be recognised.
    # NOTE(review): the third alternative only adds a match for "htt://";
    # it is kept unchanged to preserve the original matching behaviour.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', text)
    text = ''.join(c for c in text if c not in punctuation)
    # The cleaned text is later written one example per line, so embedded
    # newlines/tabs and the gaps left by removed tokens are collapsed.
    text = ' '.join(text.split())
    return text

In [13]:
# Run every tweet in the Emocoes dataset through clean_text.
df['text'] = df['text'].map(clean_text)

In [14]:
# Number of rows corresponding to 70% of the Emocoes dataset.
len(df) * 0.7

7834.4

In [15]:
# Number of rows corresponding to 30% of the Emocoes dataset.
len(df) * 0.3

3357.6

# Testando outra base

In [36]:
# Load the NoThemeTweets dataset (comma-delimited).
df_binary = pd.read_csv(
    '../dataset/NoThemeTweets.csv',
    sep=",",
)

In [37]:
# Clean every tweet and turn the sentiment column into fastText labels.
df_binary['tweet_text'] = df_binary['tweet_text'].apply(clean_text)
# The startswith guard keeps the cell idempotent: re-running it no longer
# produces "__label____label__<sentiment>".
df_binary['sentiment'] = df_binary['sentiment'].apply(
    lambda x: x if x.startswith("__label__") else "__label__" + x
)

In [38]:
# Number of rows corresponding to 70% of the NoThemeTweets dataset.
len(df_binary) * 0.7

550069.7999999999

In [39]:
# Number of rows corresponding to 30% of the NoThemeTweets dataset.
len(df_binary) * 0.3

235744.19999999998

# Separando em treino e teste

In [40]:
# Write the dataset in fastText's supervised format: one example per line,
# "<label> <text>".
# DataFrame.to_string is a display formatter: it pads every row to the width
# of the longest tweet (inflating the file), escapes control characters, and
# (NOTE(review): on old pandas versions, to be confirmed) can truncate long
# cells. Plain "label text" lines are built here instead.
with open('twitter_p.txt', 'w', encoding='utf-8', newline='\n') as f:
    f.writelines(
        # split()/join collapses any whitespace run, so an embedded newline
        # can never break the one-example-per-line layout.
        str(label) + ' ' + ' '.join(str(text).split()) + '\n'
        for label, text in zip(df_binary['sentiment'], df_binary['tweet_text'])
    )

In [41]:
# Sequential 70/30 split of twitter_p.txt: the first 70% of the lines go to
# the training file and the last 30% to the test file.
# The sizes are derived from the file itself instead of hard-coded row counts
# (for the 785,814-row dataset they evaluate to the same 550069 / 235744), and
# the split runs in Python, so it does not depend on the `head`/`tail` shell
# tools and cannot fail silently through an ignored exit code.
# NOTE(review): rows are not shuffled (same as before). If the CSV is ordered
# by sentiment, the two files will have different label distributions.
# Binary mode copies the lines byte-for-byte, independent of the encoding.
with open('twitter_p.txt', 'rb') as source:
    n_lines = sum(1 for _ in source)

n_train = int(n_lines * 0.7)
n_test = int(n_lines * 0.3)

with open('twitter_p.txt', 'rb') as source, \
        open('twitter_p_train.txt', 'wb') as train_file, \
        open('twitter_p_test.txt', 'wb') as test_file:
    for i, line in enumerate(source):
        if i < n_train:
            train_file.write(line)
        if i >= n_lines - n_test:
            test_file.write(line)

0

## Treinando modelo

In [42]:
%%time
model = fasttext.train_supervised(input="twitter_p_train.txt", epoch=50, lr=0.3, wordNgrams=2, bucket=200000, dim=50, loss='hs')
model.save_model('model_twitter.bin')

CPU times: user 3min 36s, sys: 23.7 s, total: 4min
Wall time: 24.9 s


## Testando modelo

In [43]:
%%time
model.test('twitter_p_test.txt')

CPU times: user 797 ms, sys: 93.8 ms, total: 891 ms
Wall time: 926 ms


(235744, 0.7794726482964571, 0.7794726482964571)