#Métodos do Pré-processamento

In [None]:
import pandas as pd

class FileToDataframe:
    """Load a tab-separated sentiment dataset into a pandas DataFrame."""

    # Label values (numeric or textual) that are normalised to ``int``.
    _ROTULOS_VALIDOS = (1, -1, 0, "1", "-1", "0")

    def __init__(self):
        pass

    def ConvertToDataframe(self, dataset):
        """Read *dataset* and return a DataFrame with columns ``text`` and ``sent``.

        The file is parsed as header-less TSV. ``text`` is coerced to ``str``;
        every ``sent`` value found in ``_ROTULOS_VALIDOS`` is converted to
        ``int`` and any other value is left untouched.

        Args:
            dataset: path or file-like object accepted by ``pandas.read_csv``.

        Returns:
            The loaded DataFrame, or ``None`` when *dataset* is a directory
            (the only I/O error that is reported but not re-raised).

        Raises:
            OSError: for any other I/O failure (e.g. the file does not exist).
        """
        import errno

        # Only the read can fail with an I/O error, so keep the try body minimal.
        try:
            df = pd.read_csv(dataset, sep="\t", header=None, names=['text', 'sent'])
        except OSError as exc:
            print("Erro ao abrir arquivo")
            if exc.errno != errno.EISDIR:
                raise
            return None

        df['text'] = df['text'].astype(str)

        # Normalise labels in one pass instead of one ``.loc`` write per row.
        validos = self._ROTULOS_VALIDOS
        df['sent'] = [int(v) if v in validos else v for v in df['sent']]
        return df

In [None]:
class RemocaoStopwords:
    """Remove Portuguese stopwords from the ``text`` column of a DataFrame."""

    # Words with negative meaning are never treated as stopwords, so they stay
    # in the text.
    _NEGACOES = frozenset(["não", "nenhum", "nada", "jamais", "nunca", "nem"])

    def __init__(self):
        pass

    @classmethod
    def _remover(cls, dataset, stopwords):
        """Drop every word of *stopwords* (minus the negations) from ``dataset['text']``.

        Texts are split on single spaces and re-joined with single spaces.
        The column is rewritten on *dataset* itself; nothing is returned.
        Rows are processed by position, so any index is accepted.
        """
        # A frozenset gives O(1) membership tests for every word of every text.
        descartadas = frozenset(stopwords) - cls._NEGACOES
        dataset['text'] = [
            ' '.join(p for p in texto.split(' ') if p not in descartadas)
            for texto in dataset['text']
        ]

    def stopwordsNLTK(self, dataset):
        """Remove stopwords using NLTK's Portuguese list (downloaded on demand).

        Args:
            dataset: DataFrame with a string ``text`` column; modified in place.
        """
        import nltk

        nltk.download('stopwords')
        self._remover(dataset, nltk.corpus.stopwords.words('portuguese'))

    def stopwordsSpacy(self, dataset):
        """Remove stopwords using spaCy's Portuguese list.

        The list is imported directly from spaCy's language data, so the
        ``pt_core_news_sm`` model is no longer downloaded or loaded here
        (the loaded pipeline was never used).

        Args:
            dataset: DataFrame with a string ``text`` column; modified in place.
        """
        from spacy.lang.pt.stop_words import STOP_WORDS

        self._remover(dataset, STOP_WORDS)

In [None]:
class Stemming:
    """Reduce the words of the ``text`` column of a DataFrame to their stems."""

    def __init__(self):
        pass

    @staticmethod
    def _aplicar(dataset, stemmer):
        """Stem every word of ``dataset['text']`` with ``stemmer.stem``.

        Texts are split on single spaces; tokens with one character or fewer
        (including the empty tokens produced by repeated spaces) are dropped.
        The column is rewritten on *dataset* itself; nothing is returned.
        Rows are processed by position, so any index is accepted.
        """
        dataset['text'] = [
            ' '.join(stemmer.stem(p) for p in texto.split(' ') if len(p) > 1)
            for texto in dataset['text']
        ]

    def RSLP(self, dataset):
        """Stem with NLTK's RSLP stemmer (its data is downloaded on demand).

        Args:
            dataset: DataFrame with a string ``text`` column; modified in place.
        """
        import nltk

        nltk.download('rslp')
        self._aplicar(dataset, nltk.stem.RSLPStemmer())

    def Snowball(self, dataset):
        """Stem with NLTK's Snowball stemmer configured for Portuguese.

        Args:
            dataset: DataFrame with a string ``text`` column; modified in place.
        """
        import nltk

        self._aplicar(dataset, nltk.stem.SnowballStemmer("portuguese"))

In [None]:
!pip install symspellpy

from symspellpy import SymSpell, Verbosity

class CorrecaoOrtografica:
    """Spell-correct the ``text`` column of a DataFrame with SymSpell."""

    def __init__(self, dicionario="pt_br_50k.txt"):
        """Build the SymSpell index from a frequency dictionary.

        Args:
            dicionario: path to the frequency dictionary (term in column 0,
                count in column 1). Defaults to the file used originally.

        Raises:
            FileNotFoundError: if SymSpell reports that the dictionary could
                not be loaded. Previously this was ignored, which left an
                empty index and silently disabled the correction.
        """
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        carregado = self.sym_spell.load_dictionary(dicionario, term_index=0, count_index=1)
        if not carregado:
            raise FileNotFoundError("Dicionario nao encontrado: {}".format(dicionario))

    def corrigir(self, dataset, max_edit_distance=2):
        """Replace each text with SymSpell's compound-lookup suggestion.

        Args:
            dataset: DataFrame with a string ``text`` column; modified in place.
            max_edit_distance: maximum edit distance passed to
                ``lookup_compound`` (default 2, the original fixed value).

        A text is left unchanged when no suggestion is returned. When several
        are returned the last one wins, as in the original loop. Rows are
        processed by position, so any index is accepted.
        """
        corrigidos = []
        for texto in dataset['text']:
            sugestoes = self.sym_spell.lookup_compound(texto, max_edit_distance)
            corrigidos.append(sugestoes[-1].term if sugestoes else texto)
        dataset['text'] = corrigidos

In [None]:
class MetricasClassificacao:
    """Print classification metrics for hold-out or cross-validated predictions."""

    def __init__(self):
        pass

    def treinoTeste(self, y_test, pred, algoritmo=None):
        """Report metrics for a train/test split.

        Args:
            y_test: true labels. With ``algoritmo == "lstm"`` a 2-D array whose
                row-wise argmax is the class index.
            pred: predictions, in the same layout as *y_test*.
            algoritmo: pass ``"lstm"`` to reduce both inputs with
                ``argmax(axis=1)`` before reporting.
        """
        if algoritmo == "lstm":
            import numpy as np
            y_test = np.argmax(y_test, axis=1)
            pred = np.argmax(pred, axis=1)

        # Use the current instance so subclasses can customise the report.
        self.imprimindo(y_test, pred)

    def validacaoCruzada(self, model, x, y, folds, y_original=None, algoritmo=None):
        """Run ``cross_val_predict`` and report metrics on the predictions.

        Args:
            model: estimator passed to ``cross_val_predict``.
            x: feature matrix.
            y: targets used for fitting.
            folds: value passed as ``cv``.
            y_original: labels to compare the predictions against when
                ``algoritmo == "lstm"``; ignored otherwise.
            algoritmo: pass ``"lstm"`` to report against *y_original*
                instead of *y*.
        """
        from sklearn.model_selection import cross_val_predict

        pred = cross_val_predict(model, x, y, cv=folds)

        if algoritmo == "lstm":
            y = y_original

        self.imprimindo(y, pred)

    def imprimindo(self, y, pred):
        """Print the classification report and the confusion matrix.

        Args:
            y: true labels.
            pred: predicted labels, same length as *y*.
        """
        from sklearn import metrics

        print("\n_____________________________________________________________")
        print("\t\tRelatorio de Classificação")
        print("-------------------------------------------------------------\n")
        print(metrics.classification_report(y, pred))
        print("_____________________________________________________________")
        print("\t\t  Matriz de Confusao")
        print("-------------------------------------------------------------\n")
        print(pd.crosstab(y, pred, rownames=['Real'], colnames=['Predito'], margins=True))

#Pré-processamento

In [None]:
# Load the dataset: replace the placeholder string below with the path to the
# tab-separated dataset file (columns: text, sentiment).
df = FileToDataframe().ConvertToDataframe("arquivo do dataset")

In [None]:
# Discard the neutral rows (sentiment 0) and renumber the remaining rows
# from 0 so the index is contiguous again.
df = df.loc[df['sent'] != 0].reset_index(drop=True)

In [None]:
!pip install emoji
import re, unicodedata, string, emoji

# Patterns are compiled once instead of on every row, and written as raw
# strings so the backslashes are not treated as (invalid) string escapes.
_RE_EMAIL = re.compile(r"\S*@\S*\s?")        # any token containing "@" (so @mentions match too)
_RE_SITE = re.compile(r"\S+\.com\S*")
_RE_HASHTAG = re.compile(r" ?#([^\s]+)")
_RE_NUMERO = re.compile(r"\d*\.\d+|\d+")     # decimals first, so "3.14" is a single number
_RE_PONTUACAO = re.compile('[%s]' % re.escape(string.punctuation))
_RE_ESPACOS = re.compile(r"\s+")


def _trocar_emojis(texto):
    """Replace every emoji in *texto* with the `` EMOJIS `` placeholder."""
    # emoji 2.x removed get_emoji_regexp(); prefer replace_emoji() when present
    # and fall back to the old regexp API on older releases.
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(texto, replace=" EMOJIS ")
    return emoji.get_emoji_regexp().sub(" EMOJIS ", texto)


def limpar_texto(texto):
    """Normalise one raw text by replacing noisy tokens with placeholders.

    Steps, in order: e-mails -> `` EMAIL ``, sites -> `` SITE ``, hashtags ->
    `` HASHTAG ``, emojis -> `` EMOJIS ``, numbers -> `` NUMBER ``, accents
    stripped, punctuation replaced by spaces, whitespace runs collapsed.

    E-mails and sites are handled first: both patterns depend on "@" and "."
    and on the token being intact, so they must run before the number and
    punctuation steps (the site step used to run after punctuation removal
    and therefore could never match).
    """
    texto = _RE_EMAIL.sub(" EMAIL ", texto)
    texto = _RE_SITE.sub(" SITE ", texto)
    texto = _RE_HASHTAG.sub(" HASHTAG ", texto)
    texto = _trocar_emojis(texto)
    texto = _RE_NUMERO.sub(" NUMBER ", texto)
    # Decompose accented characters and drop the non-ASCII combining marks.
    texto = unicodedata.normalize('NFKD', texto).encode('ASCII', 'ignore').decode("utf-8")
    texto = _RE_PONTUACAO.sub(' ', texto)
    return _RE_ESPACOS.sub(" ", texto)


df['text'] = df['text'].map(limpar_texto)

In [None]:
# Spell-correct every text of df in place using SymSpell's compound lookup.
CorrecaoOrtografica().corrigir(df)

In [None]:
# Remove stopwords in place. The NLTK list is active; the spaCy variant is
# kept commented out as the alternative.
RemocaoStopwords().stopwordsNLTK(df)
#RemocaoStopwords().stopwordsSpacy(df)

# Reduce each word to its stem in place. RSLP is active; Snowball is the
# alternative.
Stemming().RSLP(df)
#Stemming().Snowball(df)

In [None]:
# Stopword removal can leave some texts empty. Drop those rows with a single
# boolean filter (instead of one DataFrame.drop call per empty row) and
# renumber the remaining rows from 0.
df = df[df['text'] != ""].reset_index(drop=True)

#SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

# TF-IDF features over word 1- to 3-grams, ignoring terms present in more than
# 90% of the documents. min_df=1 keeps every remaining term, exactly as the
# former min_df=0 did, but an integer min_df below 1 is rejected by the
# parameter validation of recent scikit-learn releases.
vectorizer = TfidfVectorizer(min_df=1, max_df=0.9, sublinear_tf=True, use_idf=True, ngram_range=(1, 3))
# NOTE: despite its name, `classificador` holds the TF-IDF feature matrix,
# not a classifier; the name is kept because later cells use it.
classificador = vectorizer.fit_transform(df['text'])

model = svm.LinearSVC()

In [None]:
# 10-fold cross-validated predictions for the linear SVM, followed by the
# classification report and confusion matrix. The last argument (y_original)
# is only read when algoritmo == "lstm", so it has no effect on this call.
MetricasClassificacao().validacaoCruzada(model, classificador, df['sent'], 10, df['sent'])

#LSTM

In [None]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional, Conv1D, MaxPooling1D, BatchNormalization
from keras.models import Sequential
from keras.regularizers import l2
from keras.optimizers.schedules import ExponentialDecay
from keras.optimizers import RMSprop

class LSTM_m:
    """Factory for the LSTM sentiment classifier used in the experiments."""

    def __init__(self):
        pass

    def lstm_artigo(self, x, vocab_size, d):
        """Build and compile the LSTM model.

        Architecture: Embedding(vocab_size, 64) -> SpatialDropout1D(0.5) ->
        LSTM(32, dropout 0.5, recurrent dropout 0.5) -> Dense(d) -> sigmoid,
        compiled with binary cross-entropy, Adam and the accuracy metric.

        Args:
            x: padded input matrix; only ``x.shape[1]`` (sequence length) is read.
            vocab_size: size of the embedding vocabulary.
            d: number of output units.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        # NOTE(review): sigmoid + binary_crossentropy is also used when d > 2;
        # confirm this is intended for the ternary setup.
        model = Sequential()
        model.add(Embedding(vocab_size, 64, input_length=x.shape[1]))
        model.add(SpatialDropout1D(0.5))
        model.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5))
        model.add(Dense(d))
        model.add(Activation("sigmoid"))
        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
        return model

    def lstm_artigo_vc(self, x, vocab_size, d):   # cross-validation
        """Return a zero-argument builder for ``lstm_artigo``.

        Used for cross-validation, where the model is passed as a ``build_fn``
        callable taking no arguments and returning a freshly compiled model.
        """
        def bm():
            # Build through the current instance rather than a new LSTM_m(),
            # so subclasses that override lstm_artigo are honoured.
            return self.lstm_artigo(x, vocab_size, d)
        return bm

In [None]:
# Run this cell for ternary polarity: relabel the sentiments as class ids
# (neutral 0 -> 0, positive 1 -> 1, negative -1 -> 2). A label missing from
# the table raises KeyError.
encoding = {0: 0, 1: 1, -1: 2}
df['sent'] = list(map(encoding.__getitem__, df['sent']))

In [None]:
# Run this cell for binary polarity: relabel the sentiments as class ids
# (positive 1 -> 1, negative -1 -> 0). A label missing from the table
# raises KeyError.
encoding = {1: 1, -1: 0}
df['sent'] = list(map(encoding.__getitem__, df['sent']))

In [None]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np

# One-hot encode the labels: one column per distinct value of df['sent'].
Y = pd.get_dummies(df['sent']).values

# Fit the word index on the corpus, then turn each text into a sequence of
# word ids padded to a common length.
tokenizer = Tokenizer(split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = pad_sequences(tokenizer.texts_to_sequences(df['text'].values))

# One more than the number of indexed words.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# LSTM com validacao cruzada

from keras.wrappers.scikit_learn import KerasClassifier
# The number of output units follows the one-hot label matrix: 2 columns for
# binary polarity, 3 for ternary. Deriving it from Y removes the need to swap
# between two hand-edited lines whenever the encoding cell changes.
n_classes = Y.shape[1]
kmodel = KerasClassifier(build_fn=LSTM_m().lstm_artigo_vc(X, vocab_size, n_classes), epochs=5, batch_size=32)

# Predictions are compared against the integer labels in df['sent']
# (y_original) rather than the one-hot matrix Y used for fitting.
MetricasClassificacao().validacaoCruzada(kmodel, X, Y, 10, df['sent'], "lstm")