In [129]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [130]:
df = pd.read_csv('tash-pt.csv')
df

Unnamed: 0,id_twitter,sentiment,text
0,1067856049821155334,1,@tchaugip @n00bona História vence técnica faci...
1,1070902848957964288,1,@camandis aa obrigado perfeito o seu timing ti...
2,1073158858925838336,0,moldura artística favorita entre filistinos al...
3,1073220881344856064,0,Minha cama tá no localizada no meio nesse mome...
4,1073347140334833664,0,@eueduramos o mundo so vai mudar quando as pes...
...,...,...,...
2782,1068599890761129985,-1,@PastorMalafaia Concordo com Prof Olavo de Car...
2783,1071376463230840833,-1,@ClaudiaLeitte se o clipe de saudade não for g...
2784,1073978880787181568,1,agr o ar vai ficar ligado 24 por 48
2785,1074925265980145666,0,e eu que no meio de tantos problemas ainda con...


In [131]:
df.drop('id_twitter', inplace = True, axis = 1) #deletando coluna 'id_twitter'
df.head()

Unnamed: 0,sentiment,text
0,1,@tchaugip @n00bona História vence técnica faci...
1,1,@camandis aa obrigado perfeito o seu timing ti...
2,0,moldura artística favorita entre filistinos al...
3,0,Minha cama tá no localizada no meio nesse mome...
4,0,@eueduramos o mundo so vai mudar quando as pes...


In [132]:
df.isnull().sum()

sentiment    0
text         0
dtype: int64

In [133]:
df.drop_duplicates() #deletando também duplicatas, se existirem
df.shape

(2787, 2)

In [134]:
df['sentiment'].value_counts()

 0    1018
 1     888
-1     881
Name: sentiment, dtype: int64

In [135]:
print(df['text'][0])

@tchaugip @n00bona História vence técnica facilmente. A menos que seja algo experimental que traga algo muito diferente e novo, pois aí agrega ao panteao de técnicas e então é lembrado. Caso contrário é só execução.


## Pré-processamento do texto

**Limpeza do texto**

Removendo acentos e caracteres especiais

In [136]:
import unicodedata

def f_clean(df, text):
    df['text'] = df['text'].replace(regex='[!/,.-]',value='').apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii','ignore').decode("utf-8"))
    return df
                                                                    
f_clean(df, df['text'])
print('\n',df['text'][0])
print('\n',df['text'][9])
print('\n',df['text'][1575])
print('\n',df['text'][2019])
print('\n',df['text'][1981])


 @tchaugip @n00bona Historia vence tecnica facilmente A menos que seja algo experimental que traga algo muito diferente e novo pois ai agrega ao panteao de tecnicas e entao e lembrado Caso contrario e so execucao

 viado eu to apaixonado pelo @jaoromania  nao da amo um artista

 se eu tivesse um canal no youtube c um milhao de inscritos ja fazer pelo menos cada inscrito me mandar um real por mes

 To rindo mais to preocupado joguei dentro denovo pqp tava na onda do boldo 

 Nao se preocupe com passado q ele sempre foi ausente
Se tava tudo igual eu faco ficar diferente 


## Remoção de stopwords

In [137]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def f_stopwords(df, text):
    stop_words = stopwords.words('portuguese')
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    return df

f_stopwords(df, df['text'])
print('\n',df['text'][0])
print('\n',df['text'][9])
print('\n',df['text'][1575])
print('\n',df['text'][2019])
print('\n',df['text'][1981])

[nltk_data] Downloading package stopwords to C:\Users\Letícia
[nltk_data]     Sousa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



 @tchaugip @n00bona Historia vence tecnica facilmente A menos algo experimental traga algo diferente novo pois ai agrega panteao tecnicas entao lembrado Caso contrario so execucao

 viado to apaixonado @jaoromania nao amo artista

 canal youtube c milhao inscritos ja fazer menos cada inscrito mandar real mes

 To rindo to preocupado joguei dentro denovo pqp tava onda boldo

 Nao preocupe passado q sempre ausente Se tava tudo igual faco ficar diferente


## Aplicação de stemming

## Vetorização

In [138]:
sample = df['text']
target = df['sentiment']
count_vect = CountVectorizer()
X_sample = count_vect.fit_transform(sample)
tfidf_transformer = TfidfTransformer()
X_sample_transform = tfidf_transformer.fit_transform(X_sample)

In [139]:
X, y = X_sample_transform, target
X.shape, y.shape

((2787, 12576), (2787,))

## K-Fold SVM

In [140]:
kf = KFold(n_splits=10, shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    reg = svm.SVC(kernel = 'linear', C=1).fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print(reg.score(X_test, y_test))

0.44086021505376344
0.4336917562724014
0.5125448028673835
0.46236559139784944
0.4229390681003584
0.4874551971326165
0.4265232974910394
0.38489208633093525
0.4568345323741007
0.49640287769784175


## K-Fold NB 

In [141]:
kf = KFold(n_splits=10, shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    nb = MultinomialNB().fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    print(nb.score(X_test, y_test))

0.4014336917562724
0.5053763440860215
0.45878136200716846
0.43727598566308246
0.44802867383512546
0.44086021505376344
0.45878136200716846
0.44964028776978415
0.38848920863309355
0.46402877697841727


## K-Fold LR

In [142]:
kf = KFold(n_splits=10, shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr = LogisticRegression(max_iter=5000).fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print(lr.score(X_test, y_test))

0.3906810035842294
0.4336917562724014
0.46236559139784944
0.45161290322580644
0.4910394265232975
0.4121863799283154
0.5017921146953405
0.4568345323741007
0.45323741007194246
0.4712230215827338


## K-Fold AdaBoost

In [143]:
kf = KFold(n_splits=10, shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    adb = AdaBoostClassifier().fit(X_train, y_train)
    y_pred = adb.predict(X_test)
    print(adb.score(X_test, y_test))

0.36917562724014336
0.45878136200716846
0.4336917562724014
0.4157706093189964
0.4444444444444444
0.3978494623655914
0.3906810035842294
0.420863309352518
0.3776978417266187
0.4028776978417266


## K-Fold OneVsRest

In [144]:
kf = KFold(n_splits=10, shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    ovr = OneVsRestClassifier(LinearSVC(random_state=0, max_iter=5000)).fit(X_train, y_train)
    y_pred = ovr.predict(X_test)
    print(ovr.score(X_test, y_test))

0.43010752688172044
0.41935483870967744
0.45878136200716846
0.43010752688172044
0.5089605734767025
0.4229390681003584
0.44086021505376344
0.42805755395683454
0.42805755395683454
0.46402877697841727
