In [None]:
import pandas as pd

# Pré-Processamento
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Fluxo de Pré-Processamento + Extração de Features
from pre_processing import preprocess_nlp

# Pipeline de modelos a serem executados
from classification import classification_models

# Modelos que serão executados
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

## Leitura da base

In [None]:
# Alternative dataset, currently disabled:
# df = pd.read_csv('../dataset/Emocoes.csv', sep = ";", encoding = 'iso8859-1')

# Load the comma-separated tweets dataset into a DataFrame.
df = pd.read_csv(
    "../dataset/NoThemeTweets.csv",
    sep=",",
)

In [None]:
# Keep a reproducible 10% subsample of the rows (presumably to keep the
# runtime of the following steps manageable).
# Rows are drawn WITHOUT replacement: sampling with replacement duplicates
# tweets, and the later train/validation split could then place copies of the
# same tweet on both sides, leaking training data into validation.
df = df.sample(frac=0.10, replace=False, random_state=1)

In [None]:
# Sanity check: show (rows, columns) of the DataFrame after subsampling.
df.shape

In [None]:
# # Disabled cell: strip square brackets from the column names and drop the
# # unnamed columns. The columns referenced here presumably belong to the
# # alternative Emocoes.csv dataset - TODO confirm before re-enabling.
# df.rename(columns = dict(zip(df.columns, [i.replace('[', '').replace(']', '') for i in df.columns])), inplace = True) 
# df.drop(['Unnamed: 15', 'Unnamed: 16','Unnamed: 17','Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20'], axis = 1, inplace = True)

In [None]:
# # Disabled cell: discard noise in `hashtagsearch` by keeping only rows whose
# # value occurs more than 4 times.
# df = df[df['hashtagsearch'].map(df['hashtagsearch'].value_counts()) > 4]

In [None]:
# # Disabled cell: clean the `hashtagsearch` text by removing '#', ',' and '"'.
# df['hashtagsearch'] = df['hashtagsearch'].apply(lambda x: x.replace('#', '').replace(',','').replace('"', ''))

In [None]:
# # Disabled cell: verify the cleaning by listing the value frequencies.
# df['hashtagsearch'].value_counts()

## Chamando pré-processamento

In [None]:
# Instantiate the text-preprocessing helper on the `tweet_text` column.
# (Despite the `_imdb` suffix in the variable name, the input here is tweets.)
prepro_imdb = preprocess_nlp(
    df['tweet_text'],
    lemma=False,
    wordcloud=True,
    numeric='tfidf',
    ngram=3,
)

In [None]:
# Extra stopwords registered on the preprocessor one at a time.
stops = [
    'mim',
    'eh',
    'vamo',
    'deu',
    'tb',
    'pro',
    'oi',
    'oq',
]

for stopword in stops:
    prepro_imdb.add_stopword(stopword)

In [None]:
%%time
# Run the preprocessing and unpack its two results: the feature matrix and the
# processed texts. `%%time` must stay on the first line of the cell; it
# reports how long the cell took.
sparse_matrix_imdb, transform_Texts = prepro_imdb.preprocess()

In [None]:
# Encode the sentiment labels as integers, overwriting the column in place.
# `le` is kept so the integer codes can be mapped back to the original labels.
le = preprocessing.LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

In [None]:
# Hold out 30% of the rows for validation.
# random_state is pinned (same seed as the subsampling step) so the split,
# and therefore the reported metrics, are reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    sparse_matrix_imdb,
    df['sentiment'],
    test_size=0.3,
    random_state=1,
)

## Chamando os modelos

In [None]:
# (name, estimator) pairs for the models to be compared.
# RandomForest is stochastic, so it gets a fixed random_state to make its
# results reproducible between runs.
models = [
    ("RandomForest", RandomForestClassifier(random_state=1)),
    # max_iter is raised to 10000 (presumably to avoid convergence warnings).
    ("LogisticRegression", LogisticRegression(max_iter=10000)),
    ("SVC", SVC()),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("MultinomialNB", MultinomialNB()),
]

In [None]:
# Hand the train/validation splits and the list of candidate models to the
# project's `classification_models` helper (imported from classification).
mod = classification_models(X_train, y_train, X_valid, y_valid, models)

In [None]:
%%time
# Run the model pipeline and keep whatever it returns. Presumably this fits and
# evaluates every entry of `models` - confirm in classification.py.
# `%%time` must stay on the first line of the cell; it reports the elapsed time.
results = mod.apply_model()

In [None]:
# Display the object returned by mod.apply_model().
results