# EP1 de Processamento de Língua Natural

## Leitura e análise dos dados

In [102]:
import pandas as pd
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
treino_dataset = pd.read_excel(io='ep1_esic2023_clareza_TRAIN.xlsx')
treino_dataset

Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai Em atenção ao seu pe...,c5
1,"""A pedido do Pró-Reitor de Graduação, informa...",c5
2,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
3,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
4,"""Prezado Prof. Gilberto Tadeu Reis da Silva ...",c234
...,...,...
5995,Trata-se de solicitação com base na Lei de Ac...,c1
5996,Trata-se de uma solicitação repetida. As info...,c5
5997,Unidade:,c5
5998,Vale dizer que a gestão dos Telefones de Uso ...,c234


In [37]:
treino_dataset.describe()

Unnamed: 0,resp_text,clarity
count,6000,6000
unique,5626,3
top,"Prezado(a) Senhor(a), Sua manifestação foi a...",c5
freq,41,2000


In [38]:
counts = treino_dataset['clarity'].value_counts()
counts

clarity
c5      2000
c234    2000
c1      2000
Name: count, dtype: int64

## Vectorizers

### Count Vectorizer

In [39]:
vect = CountVectorizer()
count_X = vect.fit_transform(treino_dataset.resp_text)
count_Y = treino_dataset.clarity

print(count_X.shape)
print(count_Y.shape)

(6000, 28710)
(6000,)


### TF-IDF Vectorizer

In [40]:
tfidf_vect = TfidfVectorizer()
tfidf_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_Y = treino_dataset.clarity

print(tfidf_X.shape)
print(tfidf_Y.shape)

(6000, 28710)
(6000,)


### TF-IDF Vectorizer Bigramas

In [41]:
tfidf_vect = TfidfVectorizer(ngram_range=(2,2))
tfidf_bigram_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_bigram_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202965)
(6000,)


### TF-IDF Vectorizer 3-Char

In [42]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
tfidf_3_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_3_char_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202965)
(6000,)


### TF-IDF Vectorizer 5-Char

In [43]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(5,5))
tfidf_5_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_5_char_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202965)
(6000,)


## Base

### Funções

In [70]:
def timing_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f'{func.__name__} took {execution_time:.2f} seconds to run')

        return result
    
    return wrapper

In [71]:
@timing_decorator
def calculate_model_score(clf, X, Y):
    return cross_val_score(clf, X, Y, scoring='accuracy', cv=10).mean()

In [85]:
def train_all(clf):
    print('----------------------------------------------------')
    print('Training Count Vectorizer')
    result_logistic_count_vect = calculate_model_score(clf, count_X, count_Y)
    print(f'Accuracy score: {result_logistic_count_vect}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer')
    result_logistic_tfidf = calculate_model_score(clf, tfidf_X, tfidf_Y)
    print(f'Accuracy score: {result_logistic_tfidf}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer Bigramas')
    result_logistic_tfidf_bigram = calculate_model_score(clf, tfidf_bigram_X, tfidf_bigram_Y)
    print(f'Accuracy score: {result_logistic_tfidf_bigram}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer 3-Char')
    result_logistic_tfidf_3_char = calculate_model_score(clf, tfidf_3_char_X, tfidf_3_char_Y)
    print(f'Accuracy score: {result_logistic_tfidf_3_char}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer 5-Char')
    result_logistic_tfidf_5_char = calculate_model_score(clf, tfidf_5_char_X, tfidf_5_char_Y)
    print(f'Accuracy score: {result_logistic_tfidf_5_char}')
    print('----------------------------------------------------')

## Modelos

### Dummy

In [74]:
clf_dummy = DummyClassifier(strategy='most_frequent', random_state=100, constant=None).fit(count_X, count_Y)
result_dummy = clf_dummy.predict(count_X)
accuracy_dummy = accuracy_score(count_Y, result_dummy)
accuracy_dummy

0.3333333333333333

### Regressão Logística

In [89]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
train_all(clf_logistic)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 65.21 seconds to run
Accuracy score: 0.43600000000000005
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 9.51 seconds to run
Accuracy score: 0.4608333333333333
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 45.05 seconds to run
Accuracy score: 0.44783333333333336
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 24.10 seconds to run
Accuracy score: 0.46316666666666667
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 71.57 seconds to run
Accuracy score: 0.4616666666666667
----------------------------------------------------


### Naive Bayes

In [86]:
clf_multinomial_naive_bayes = MultinomialNB()
train_all(clf_multinomial_naive_bayes)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 0.11 seconds to run
Accuracy score: 0.4338333333333334
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 0.08 seconds to run
Accuracy score: 0.4371666666666667
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 0.17 seconds to run
Accuracy score: 0.44366666666666665
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 0.17 seconds to run
Accuracy score: 0.43100000000000005
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 0.31 seconds to run
Accuracy score: 0.417
----------------------------------------------------


### Random Forest

In [88]:
clf_random_forest = RandomForestClassifier(class_weight='balanced', random_state=100)
train_all(clf_random_forest)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 138.64 seconds to run
Accuracy score: 0.43950000000000006
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 129.75 seconds to run
Accuracy score: 0.4381666666666666
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 785.97 seconds to run
Accuracy score: 0.42833333333333334
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 140.89 seconds to run
Accuracy score: 0.44366666666666665
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 782.41 seconds to run
Accuracy score: 0.4261666666666667
----------------------------------------------------


### Support Vector Machine (SVM)

In [99]:
clf_svm = SVC(kernel='linear', class_weight='balanced')
train_all(clf_svm)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 158.93 seconds to run
Accuracy score: 0.43066666666666664
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 140.99 seconds to run
Accuracy score: 0.4548333333333333
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 217.96 seconds to run
Accuracy score: 0.45166666666666666
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 777.32 seconds to run
Accuracy score: 0.4648333333333333
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 1839.48 seconds to run
Accuracy score: 0.4629999999999999
----------------------------------------------------


### Multilayer Perceptron (MLP)

In [103]:
clf_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=100)
train_all(clf_mlp)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 578.38 seconds to run
Accuracy score: 0.438
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 835.56 seconds to run
Accuracy score: 0.43966666666666665
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas


