### 1. Carrega o arquivo com as sentenças, features e classificação

In [1]:
import pandas as pd

data_frame = pd.read_csv('dataset_sentencas_processadas.csv')

# sentenca_original = sentenças obtidas do trabalho de Gabriela
# sentenca_processada1 = remoção das anotações feitas por Gabriela
# sentenca_processada2 = sem pontuações
# sentenca_processada3_1 = sem acentos
# sentenca_processada3_2 = sem stopwords
# sentenca_processada3_3 = letras minúsculas
# sentenca_processada3_4 = apenas as raizes das palavras (stemmer)

In [2]:
data_frame.head(1)

Unnamed: 0,sentenca_original,sentenca_processada1,classificacao,sentenca_processada2,sentenca_processada3_1,sentenca_processada3_2,sentenca_processada3_3,sentenca_processada3_4
0,"A Folha, sempre [tão solícita]P6, só fez junta...","A Folha, sempre tão solícita, só fez juntar os...",1,A Folha sempre tão solícita só fez juntar os d...,A Folha sempre tao solicita so fez juntar os d...,A Folha sempre tao solicita fez juntar dois de...,a folha sempre tao solicita fez juntar dois de...,a folh sempr tao solicit fez junt doi desafet ...


In [3]:
print(data_frame['classificacao'].value_counts())

1    2000
0    2000
Name: classificacao, dtype: int64


### 2. TF-IDF (Term Frequency–Inverse Document Frequency)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vetorizar = TfidfVectorizer(lowercase=False)

tf_idf = vetorizar.fit_transform(data_frame['sentenca_processada3_4'].values.astype('U'))

vocabulario = vetorizar.get_feature_names()

In [5]:
print(tf_idf.shape)

(4000, 5776)


#### 2.1. Utiliza 10-Fold Cross Validation para realizar os testes

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
from sklearn import tree
classificador = tree.DecisionTreeClassifier()

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='f1_micro')
# print(scores)
print("F1: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='precision')
# print(scores)
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='accuracy')
# print(scores)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='recall')
# print(scores)
print("Recall: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1: 0.576 (+/- 0.08)
Precision: 0.590 (+/- 0.09)
Accuracy: 0.582 (+/- 0.07)
Recall: 0.487 (+/- 0.09)


In [9]:
from sklearn.linear_model import LogisticRegression
classificador = LogisticRegression(solver='lbfgs')

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='f1_micro')
# print(scores)
print("F1: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='precision')
# print(scores)
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='accuracy')
# print(scores)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='recall')
# print(scores)
print("Recall: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1: 0.668 (+/- 0.09)
Precision: 0.661 (+/- 0.10)
Accuracy: 0.668 (+/- 0.09)
Recall: 0.698 (+/- 0.10)


In [10]:
from sklearn import svm

classificador = svm.SVC(gamma='auto', C=1.0, kernel='linear', probability=True)

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='f1_micro')
# print(scores)
print("F1: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='precision')
# print(scores)
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='accuracy')
# print(scores)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='recall')
# print(scores)
print("Recall: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1: 0.657 (+/- 0.09)
Precision: 0.653 (+/- 0.09)
Accuracy: 0.657 (+/- 0.09)
Recall: 0.675 (+/- 0.09)


In [11]:
from sklearn.neural_network import MLPClassifier

classificador = MLPClassifier(activation='relu', max_iter=10000, solver='adam', alpha=1e-10)

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='f1_micro')
# print(scores)
print("F1: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='precision')
# print(scores)
print("Precision: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='accuracy')
# print(scores)
print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(classificador, tf_idf, data_frame['classificacao'], cv=10, scoring='recall')
# print(scores)
print("Recall: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1: 0.597 (+/- 0.06)
Precision: 0.613 (+/- 0.07)
Accuracy: 0.598 (+/- 0.06)
Recall: 0.538 (+/- 0.08)
