# EP1 de Processamento de Língua Natural - Lematização

## Leitura e análise dos dados

In [2]:
import pandas as pd
import time
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
treino_dataset = pd.read_excel(io='ep1_esic2023_clareza_TRAIN.xlsx')
treino_dataset

Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai Em atenção ao seu pe...,c5
1,"""A pedido do Pró-Reitor de Graduação, informa...",c5
2,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
3,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
4,"""Prezado Prof. Gilberto Tadeu Reis da Silva ...",c234
...,...,...
5995,Trata-se de solicitação com base na Lei de Ac...,c1
5996,Trata-se de uma solicitação repetida. As info...,c5
5997,Unidade:,c5
5998,Vale dizer que a gestão dos Telefones de Uso ...,c234


In [4]:
treino_dataset.describe()

Unnamed: 0,resp_text,clarity
count,6000,6000
unique,5626,3
top,"Prezado(a) Senhor(a), Sua manifestação foi a...",c5
freq,41,2000


In [5]:
counts = treino_dataset['clarity'].value_counts()
counts

clarity
c5      2000
c234    2000
c1      2000
Name: count, dtype: int64

In [33]:
nlp = spacy.load('pt_core_news_sm')

i = 0
for text in treino_dataset['resp_text']:
    doc = nlp(text)
    lemma = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    treino_dataset['resp_text'].iloc[i] = " ".join(x for x in lemma)
    i = i + 1

treino_dataset
# treino_dataset['resp_text'] = doc.apply(lambda x: " ".join(x.lemma_ for x in x.split())) # if token.pos_ == 'NOUN'
# treino_dataset


Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai em atenção a o seu pedi...,c5
1,a pedido de o Pró-Reitor de Graduação informar...,c5
2,Prezado o sr. o Agradecemos o contato e inform...,c234
3,Prezado o sr. o Agradecemos o contato e inform...,c234
4,Prezado Prof Gilberto Tadeu Reis de o Silva em...,c234
...,...,...
5995,tratar se de solicitação com base em o Lei de ...,c1
5996,tratar se de um solicitação repetir o informaç...,c5
5997,Unidade,c5
5998,Vale dizer que o gestão de o Telefones de Uso ...,c234


## Vectorizers

### Count Vectorizer

In [34]:
vect = CountVectorizer()
count_X = vect.fit_transform(treino_dataset.resp_text)
count_Y = treino_dataset.clarity

print(count_X.shape)
print(count_Y.shape)

(6000, 25371)
(6000,)


### TF-IDF Vectorizer

In [35]:
tfidf_vect = TfidfVectorizer()
tfidf_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_Y = treino_dataset.clarity

print(tfidf_X.shape)
print(tfidf_Y.shape)

(6000, 25371)
(6000,)


### TF-IDF Vectorizer Bigramas

In [36]:
tfidf_vect = TfidfVectorizer(ngram_range=(2,2))
tfidf_bigram_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_bigram_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 177972)
(6000,)


### TF-IDF Vectorizer 3-Char

In [37]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
tfidf_3_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_3_char_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 177972)
(6000,)


### TF-IDF Vectorizer 5-Char

In [38]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(5,5))
tfidf_5_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_5_char_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 177972)
(6000,)


## Base

### Funções

In [39]:
def timing_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f'{func.__name__} took {execution_time:.2f} seconds to run')

        return result
    
    return wrapper

In [40]:
@timing_decorator
def calculate_model_score(clf, X, Y):
    return cross_val_score(clf, X, Y, scoring='accuracy', cv=10).mean()

In [46]:
def train_all(clf):
    print('----------------------------------------------------')
    print('Training Count Vectorizer')
    result_logistic_count_vect = calculate_model_score(clf, count_X, count_Y)
    print(f'Accuracy score: {result_logistic_count_vect}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer')
    result_logistic_tfidf = calculate_model_score(clf, tfidf_X, tfidf_Y)
    print(f'Accuracy score: {result_logistic_tfidf}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer Bigramas')
    result_logistic_tfidf_bigram = calculate_model_score(clf, tfidf_bigram_X, tfidf_bigram_Y)
    print(f'Accuracy score: {result_logistic_tfidf_bigram}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer 3-Char')
    result_logistic_tfidf_3_char = calculate_model_score(clf, tfidf_3_char_X, tfidf_3_char_Y)
    print(f'Accuracy score: {result_logistic_tfidf_3_char}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer 5-Char')
    result_logistic_tfidf_5_char = calculate_model_score(clf, tfidf_5_char_X, tfidf_5_char_Y)
    print(f'Accuracy score: {result_logistic_tfidf_5_char}')
    print('----------------------------------------------------')

## Modelos

### Dummy

In [42]:
clf_dummy = DummyClassifier(strategy='most_frequent', random_state=100, constant=None).fit(count_X, count_Y)
result_dummy = clf_dummy.predict(count_X)
accuracy_dummy = accuracy_score(count_Y, result_dummy)
accuracy_dummy

0.3333333333333333

### Regressão Logística

In [47]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=5000)
train_all(clf_logistic)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 84.49 seconds to run
Accuracy score: 0.44433333333333336
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 9.41 seconds to run
Accuracy score: 0.4659999999999999
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 43.38 seconds to run
Accuracy score: 0.45166666666666666
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 23.47 seconds to run
Accuracy score: 0.46866666666666673
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 66.18 seconds to run
Accuracy score: 0.4680000000000001
----------------------------------------------------


### Naive Bayes

In [48]:
clf_multinomial_naive_bayes = MultinomialNB()
train_all(clf_multinomial_naive_bayes)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 0.11 seconds to run
Accuracy score: 0.4381666666666667
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 0.08 seconds to run
Accuracy score: 0.44716666666666666
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 0.14 seconds to run
Accuracy score: 0.4368333333333333
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 0.17 seconds to run
Accuracy score: 0.43616666666666665
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 0.28 seconds to run
Accuracy score: 0.42366666666666664
----------------------------------------------------


### Random Forest

In [49]:
clf_random_forest = RandomForestClassifier(class_weight='balanced', random_state=100)
train_all(clf_random_forest)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 126.52 seconds to run
Accuracy score: 0.44033333333333335
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 116.44 seconds to run
Accuracy score: 0.43533333333333324
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 700.06 seconds to run
Accuracy score: 0.43500000000000005
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 130.29 seconds to run
Accuracy score: 0.442
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 684.42 seconds to run
Accuracy score: 0.43233333333333335
----------------------------------------------------


### Support Vector Machine (SVM)

In [50]:
clf_svm = SVC(kernel='linear', class_weight='balanced')
train_all(clf_svm)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 165.08 seconds to run
Accuracy score: 0.43483333333333335
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 134.77 seconds to run
Accuracy score: 0.4630000000000001
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 223.18 seconds to run
Accuracy score: 0.44783333333333336
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 722.83 seconds to run
Accuracy score: 0.4715
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 1127.69 seconds to run
Accuracy score: 0.4611666666666666
----------------------------------------------------


### Multilayer Perceptron (MLP)

In [51]:
clf_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=100)
train_all(clf_mlp)

----------------------------------------------------
Training Count Vectorizer


