# Importações

In [1]:
# Preparando ambiente (importando bibliotecas e downloads...)

!pip install nltk
import nltk
nltk.download('rslp')
nltk.download('stopwords')
nltk.download('punkt')
import re
import pandas as pd 
import numpy as np
import spacy
nlp = spacy.load('pt')



[nltk_data] Downloading package rslp to /home/karine/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /home/karine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/karine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Carregamento e tratamento dos léxicos e datasets necessários

In [2]:
# Combined polarity lexicon (word -> polarity string); populated by the
# parsing loops in the following cells.
dicionario_lexico = dict()
# Both lexicon sources are read as UTF-8 text; the handles stay open until the
# cell that closes all lexicon files.
sentilexpt = open('lexicos/SentiLex-lem-PT02.txt', mode='r', encoding='utf8')
ontolp = open('lexicos/lexico_v3.0.txt', mode='r', encoding='utf8')

In [3]:
# Parse the SentiLex file: the key is the text before the first '.', the value
# is the two characters starting 7 positions after the 'POL' marker, with any
# ';' removed.
# NOTE(review): the 7/9 offsets presumably skip a "POL:N0=" prefix and capture
# a one- or two-character polarity ("1;", "0;", "-1") -- confirm against the file.
for linha in sentilexpt:
    palavra = linha[:linha.find('.')]
    inicio_pol = linha.find('POL') + 7
    polaridade = linha[inicio_pol:inicio_pol + 2].replace(';', '')
    dicionario_lexico[palavra] = polaridade

In [4]:
# Complement the lexicon with the comma-separated file: field 0 is the word and
# field 2 is stored as its polarity. Words already present are left untouched,
# so earlier entries take precedence.
for linha in ontolp:
    campos = linha.split(',')
    if campos[0] in dicionario_lexico:
        continue
    dicionario_lexico[campos[0]] = campos[2]

In [5]:
# Word -> emotion label mapping, filled from the tab-separated file opened here.
dic_sentimentos = dict()
sentidic = open('dic_sentimento_3_niveis_balanceado.txt', mode='r', encoding='utf8')

In [6]:
# Fill dic_sentimentos from "<word>\t<label>" lines.
for linha in sentidic:
    # Strip only the line terminator. Unconditionally dropping the last
    # character truncates the label when the final line has no trailing
    # newline or when the line carries extra fields.
    campos = linha.rstrip('\n').split('\t')
    if len(campos) < 2:
        # Blank or malformed line: nothing to store.
        continue
    dic_sentimentos[campos[0]] = campos[1]

In [7]:
# Degree-word source file (UTF-8 text) and the mapping it fills.
# Despite its name, degree_words_set is a dict: degree word -> weight string.
degree_words = open('lexicos/degree-words.txt', mode='r', encoding='utf8')
degree_words_set = dict()

In [8]:
# Fill degree_words_set from "<word>;<weight>" lines.
for linha in degree_words:
    # Strip only the line terminator. Dropping the last character blindly would
    # turn a final "word;-1" without a trailing newline into the weight "-",
    # which later fails when the weight is converted with int().
    campos = linha.rstrip('\n').split(';')
    if len(campos) < 2:
        # Blank or malformed line: nothing to store.
        continue
    degree_words_set[campos[0]] = campos[1]

In [9]:
# Every lexicon file has been consumed by now; release the handles in the same
# order they were closed originally.
for arquivo in (sentilexpt, ontolp, degree_words, sentidic):
    arquivo.close()

In [10]:
# Load the LIWC dictionary as a tab-separated table with twenty columns
# labelled '1'..'20'. The displayed result of the later apply() shows the
# words themselves as the row index.
nomes_colunas = [str(numero) for numero in range(1, 21)]
liwc = pd.read_csv(
    "lexicos/LIWC2007.txt",
    sep="\t",
    encoding="ISO-8859-1",
    names=nomes_colunas,
)

In [11]:
# Row name (word or wildcard pattern) -> 'pos' / 'neg'; filled by cria_dic_liwc.
liwc_set = {}


def cria_dic_liwc(x):
    """Record the polarity of one LIWC row in ``liwc_set``.

    Args:
        x: an iterable of category codes exposing a ``name`` attribute (a
            DataFrame row when used through ``DataFrame.apply(axis=1)``).

    Returns:
        None. ``liwc_set[x.name]`` is set to ``'pos'`` when code 126 is present,
        otherwise to ``'neg'`` when code 127 is present; rows with neither code
        are not recorded. When both codes occur, ``'pos'`` wins.

    NOTE(review): 126 / 127 are presumably the LIWC2007 positive / negative
    emotion category ids -- confirm against the dictionary header.
    """
    # Materialise the row once instead of once per membership test.
    categorias = list(x)
    if 126 in categorias:
        liwc_set[x.name] = 'pos'
    elif 127 in categorias:
        liwc_set[x.name] = 'neg'

In [12]:
# Run cria_dic_liwc on every row to fill liwc_set; the function returns None,
# so the resulting Series carries no information.
liwc.apply(cria_dic_liwc, axis=1)

a          None
aba        None
abafa      None
abafad*    None
abafada    None
           ... 
último     None
último*    None
út*        None
úteis      None
útil       None
Length: 127161, dtype: object

In [15]:
def taxa_sent_por_frase(frase):
    """Return the dominant emotion label of a sentence, or 'neutro'.

    The sentence is parsed with the module-level ``nlp`` pipeline. Every token
    found in ``dic_sentimentos`` adds 1 to the score of its emotion, plus the
    weight of a degree word when the token's head text equals the head text of
    a degree word listed in ``degree_words_set``.

    Args:
        frase: the sentence text.

    Returns:
        The emotion whose score has the largest absolute value. On ties the
        first emotion in the order alegria, tristeza, raiva, medo, nojo wins.
        Returns 'neutro' when every score is zero.

    Raises:
        KeyError: if ``dic_sentimentos`` maps a token to a label outside the
            five scored emotions.
        ValueError: if a degree-word weight is not an integer string.
    """
    doc = nlp(frase)
    score_frase = {'alegria': 0, 'tristeza': 0, 'raiva': 0, 'medo': 0, 'nojo': 0}

    # Head text of each degree word -> that degree word's weight. setdefault
    # keeps the first weight seen for a given head.
    peso_por_head = {}
    for token in doc:
        if token.text in degree_words_set:
            peso_por_head.setdefault(token.head.text, int(degree_words_set[token.text]))

    for token in doc:
        if token.text in dic_sentimentos:
            emocao = dic_sentimentos[token.text]
            # NOTE(review): the lookup uses the sentiment token's *head* text,
            # not its own text, so the weight applies when the token shares a
            # head with the degree word (or is its own head) -- confirm intent.
            score_frase[emocao] += 1 + peso_por_head.get(token.head.text, 0)

    melhor, maior = 'neutro', 0
    for sentimento, valor in score_frase.items():
        # Compare magnitudes on both sides. Comparing abs(valor) against the
        # signed stored score let any later zero score replace a negative one.
        if abs(valor) > abs(maior):
            melhor, maior = sentimento, valor

    return melhor

# Treinamento do Modelo

## Pré-processamento

In [16]:
# Portuguese stop-word list from the NLTK corpus; removeStopWords and
# realiza_stem use it to filter tokens.
stop_words = nltk.corpus.stopwords.words('portuguese')

In [17]:
def removeStopWords():
    """Tokenise every sentence in ``logs`` and drop its stop words.

    Reads the module-level ``logs`` object (its ``Frase`` and ``Sentimento``
    sequences are paired element by element) and the ``stop_words`` list.

    Returns:
        A list of ``(tokens, sentimento)`` tuples, where ``tokens`` holds the
        whitespace-separated words of the sentence that are not stop words.
    """
    return [
        ([p for p in texto.split() if p not in stop_words], rotulo)
        for texto, rotulo in zip(logs.Frase, logs.Sentimento)
    ]

In [18]:
def realiza_stem(dados):
    """Stem every non-stop-word of each labelled sentence.

    Args:
        dados: iterable of ``(palavras, sentimento)`` pairs. ``palavras`` may be
            a raw string (split on whitespace) or an already tokenised list,
            such as the output of ``removeStopWords``.

    Returns:
        pandas.DataFrame with one row per pair: column 0 holds the list of
        stems, column 1 the sentiment label.
    """
    stemmer = nltk.stem.RSLPStemmer()
    frases_com_stemming = []
    for palavras, sentimento in dados:
        # removeStopWords() yields token lists; calling .split() on a list
        # raises AttributeError, so only split when given a string.
        tokens = palavras.split() if isinstance(palavras, str) else palavras
        stems = [str(stemmer.stem(p)) for p in tokens if p not in stop_words]
        frases_com_stemming.append((stems, sentimento))
    return pd.DataFrame(frases_com_stemming)

In [19]:
def preprocessamento():
    """Run the pre-processing pipeline: stop-word removal, then stemming.

    Returns:
        Whatever ``realiza_stem`` returns for the output of ``removeStopWords``.
    """
    sem_stopwords = removeStopWords()
    # The original passed the undefined name `sem_stop_words`, which raised
    # NameError on every call.
    return realiza_stem(sem_stopwords)

## Treinamento

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words (unigram) counts for every sentence in `logs`: learn the
# vocabulary and build the document-term matrix in a single pass.
vect = CountVectorizer(ngram_range=(1, 1))
text_vect = vect.fit_transform(logs.Frase)

NameError: name 'logs' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the vectorised sentences for evaluation; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    text_vect,
    logs.Sentimento,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split; fit() returns
# the estimator itself, so `clf` is the trained model.
clf = LogisticRegression(solver='newton-cg', random_state=0).fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score

y_prediction = clf.predict(X_test)

# f1_score takes (y_true, y_pred). With average='weighted' the per-class scores
# are weighted by each class's support in y_true, so swapping the arguments
# weights by the predicted distribution and reports a different number.
f1 = f1_score(y_test, y_prediction, average='weighted')

print(f1)

# Sentimentos por transação

In [21]:
# Emotion labels counted per transaction. The order matters: sentimento_transacao
# scans this list with a strict ">" comparison, so on equal counts the label
# listed first wins.
sentimentos = ['alegria','medo','nojo','raiva','tristeza']

In [22]:
def sentimento_transacao(transacao):
    """Return the most frequent emotion among the sentences of a transaction.

    Args:
        transacao: iterable of tab-separated lines; the second field of each
            line is the sentence passed to ``taxa_sent_por_frase``.

    Returns:
        The label from ``sentimentos`` that was assigned to the most sentences.
        On ties the label listed first in ``sentimentos`` wins. Returns ``''``
        when no sentence received any of those labels.
    """
    rotulos = [taxa_sent_por_frase(linha.split('\t')[1]) for linha in transacao]
    # max() returns the first maximal element, matching a strict ">" scan.
    vencedor = max(sentimentos, key=rotulos.count, default='')
    return vencedor if rotulos.count(vencedor) > 0 else ''

In [24]:
# Print the dominant emotion of every transaction listed in the registry file.
# Both the registry and each transaction log are opened with `with`, so their
# handles are released even if a sentence fails to parse.
with open('logs/Logs_transacoes/Registros.txt', 'r', encoding='utf8') as registros:
    for i in registros:
        # Strip only the line terminator: dropping the last character blindly
        # would truncate the final name when the file lacks a trailing newline.
        nome = i.rstrip('\n')
        if not nome:
            # A blank registry line does not name a log file.
            continue
        aux = 'logs/Logs_transacoes/Corrigidos/' + nome + ".txt"
        with open(aux, 'r', encoding='utf8') as log_transacao:
            print(sentimento_transacao(log_transacao))

alegria
raiva
raiva
raiva
alegria
alegria
medo
raiva
raiva
raiva
alegria
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
nojo
raiva
alegria
raiva
raiva
raiva
alegria
raiva
tristeza
tristeza
tristeza
medo
tristeza
alegria
medo
raiva
medo
alegria
medo

nojo
medo
raiva
raiva
raiva
alegria
raiva
raiva
raiva
raiva
medo
alegria
alegria
raiva
raiva
raiva
medo
raiva
alegria
tristeza
raiva
raiva
raiva
raiva
medo
raiva
medo
alegria
alegria
raiva
raiva
alegria
alegria
raiva
alegria
medo
raiva
raiva
raiva

alegria
alegria
medo
medo
alegria

nojo
alegria
nojo

alegria
raiva
raiva
raiva
raiva
alegria
alegria
alegria
alegria
alegria
medo
tristeza
tristeza
alegria
tristeza
medo
medo
medo
medo
medo
nojo
alegria
raiva
alegria
alegria
alegria
raiva
alegria
raiva
raiva
alegria
raiva
medo
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
raiva
