# EP1 de Processamento de Língua Natural

## Leitura e análise dos dados

In [8]:
import pandas as pd
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
treino_dataset = pd.read_excel(io='ep1_esic2023_clareza_TRAIN.xlsx')
treino_dataset

Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai Em atenção ao seu pe...,c5
1,"""A pedido do Pró-Reitor de Graduação, informa...",c5
2,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
3,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
4,"""Prezado Prof. Gilberto Tadeu Reis da Silva ...",c234
...,...,...
5995,Trata-se de solicitação com base na Lei de Ac...,c1
5996,Trata-se de uma solicitação repetida. As info...,c5
5997,Unidade:,c5
5998,Vale dizer que a gestão dos Telefones de Uso ...,c234


In [10]:
treino_dataset.describe()

Unnamed: 0,resp_text,clarity
count,6000,6000
unique,5626,3
top,"Prezado(a) Senhor(a), Sua manifestação foi a...",c5
freq,41,2000


In [11]:
counts = treino_dataset['clarity'].value_counts()
counts

clarity
c5      2000
c234    2000
c1      2000
Name: count, dtype: int64

## Remoção das aspas no inicio e fim de texto

In [49]:
def remove_aspas(text:str):
    text_stripped = text.strip()
    if text_stripped[0] == '"':
        trimmed = text_stripped[1:]
        trimmed = trimmed[:-1]
        return trimmed
    else:
        return text_stripped

treino_dataset_aspas = treino_dataset
treino_dataset_aspas = treino_dataset_aspas.applymap(remove_aspas)

treino_dataset = treino_dataset_aspas

treino_dataset

  treino_dataset_aspas = treino_dataset_aspas.applymap(remove_aspas)


Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai Em atenção ao seu pedi...,c5
1,"A pedido do Pró-Reitor de Graduação, informamo...",c5
2,"Prezado (a) Sr. (a), Agradecemos o contato e i...",c234
3,"Prezado (a) Sr. (a), Agradecemos o contato e i...",c234
4,Prezado Prof. Gilberto Tadeu Reis da Silva Em...,c234
...,...,...
5995,Trata-se de solicitação com base na Lei de Ace...,c1
5996,Trata-se de uma solicitação repetida. As infor...,c5
5997,Unidade:,c5
5998,Vale dizer que a gestão dos Telefones de Uso P...,c234


## Vectorizers

### Count Vectorizer

In [50]:
vect = CountVectorizer()
count_X = vect.fit_transform(treino_dataset.resp_text)
count_Y = treino_dataset.clarity

print(count_X.shape)
print(count_Y.shape)

(6000, 28711)
(6000,)


### TF-IDF Vectorizer

In [51]:
tfidf_vect = TfidfVectorizer()
tfidf_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_Y = treino_dataset.clarity

print(tfidf_X.shape)
print(tfidf_Y.shape)

(6000, 28711)
(6000,)


### TF-IDF Vectorizer Bigramas

In [52]:
tfidf_vect = TfidfVectorizer(ngram_range=(2,2))
tfidf_bigram_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_bigram_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202966)
(6000,)


### TF-IDF Vectorizer 3-Char

In [53]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
tfidf_3_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_3_char_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202966)
(6000,)


### TF-IDF Vectorizer 5-Char

In [54]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(5,5))
tfidf_5_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_5_char_Y = treino_dataset.clarity

print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202966)
(6000,)


## Base

### Funções

In [55]:
def timing_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f'{func.__name__} took {execution_time:.2f} seconds to run')

        return result
    
    return wrapper

In [56]:
@timing_decorator
def calculate_model_score(clf, X, Y):
    return cross_val_score(clf, X, Y, scoring='accuracy', cv=10).mean()

In [57]:
def train_all(clf):
    print('----------------------------------------------------')
    print('Training Count Vectorizer')
    result_logistic_count_vect = calculate_model_score(clf, count_X, count_Y)
    print(f'Accuracy score: {result_logistic_count_vect}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer')
    result_logistic_tfidf = calculate_model_score(clf, tfidf_X, tfidf_Y)
    print(f'Accuracy score: {result_logistic_tfidf}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer Bigramas')
    result_logistic_tfidf_bigram = calculate_model_score(clf, tfidf_bigram_X, tfidf_bigram_Y)
    print(f'Accuracy score: {result_logistic_tfidf_bigram}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer 3-Char')
    result_logistic_tfidf_3_char = calculate_model_score(clf, tfidf_3_char_X, tfidf_3_char_Y)
    print(f'Accuracy score: {result_logistic_tfidf_3_char}')

    print('----------------------------------------------------')
    print('Training TF-IDF Vectorizer 5-Char')
    result_logistic_tfidf_5_char = calculate_model_score(clf, tfidf_5_char_X, tfidf_5_char_Y)
    print(f'Accuracy score: {result_logistic_tfidf_5_char}')
    print('----------------------------------------------------')

## Modelos

### Dummy

In [58]:
clf_dummy = DummyClassifier(strategy='most_frequent', random_state=100, constant=None).fit(count_X, count_Y)
result_dummy = clf_dummy.predict(count_X)
accuracy_dummy = accuracy_score(count_Y, result_dummy)
accuracy_dummy

0.3333333333333333

### Regressão Logística

In [59]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
train_all(clf_logistic)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 120.17 seconds to run
Accuracy score: 0.43583333333333335
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 14.85 seconds to run
Accuracy score: 0.4608333333333333
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas


KeyboardInterrupt: 

### Naive Bayes

In [None]:
clf_multinomial_naive_bayes = MultinomialNB()
train_all(clf_multinomial_naive_bayes)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 0.24 seconds to run
Accuracy score: 0.4338333333333334
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 0.24 seconds to run
Accuracy score: 0.4371666666666667
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 0.48 seconds to run
Accuracy score: 0.44366666666666665
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 0.57 seconds to run
Accuracy score: 0.43100000000000005
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char
calculate_model_score took 1.16 seconds to run
Accuracy score: 0.417
----------------------------------------------------


### Random Forest

In [None]:
clf_random_forest = RandomForestClassifier(class_weight='balanced', random_state=100)
train_all(clf_random_forest)

----------------------------------------------------
Training Count Vectorizer
calculate_model_score took 247.43 seconds to run
Accuracy score: 0.43950000000000006
----------------------------------------------------
Training TF-IDF Vectorizer
calculate_model_score took 220.58 seconds to run
Accuracy score: 0.4381666666666666
----------------------------------------------------
Training TF-IDF Vectorizer Bigramas
calculate_model_score took 1310.05 seconds to run
Accuracy score: 0.42833333333333334
----------------------------------------------------
Training TF-IDF Vectorizer 3-Char
calculate_model_score took 351.10 seconds to run
Accuracy score: 0.44366666666666665
----------------------------------------------------
Training TF-IDF Vectorizer 5-Char


KeyboardInterrupt: 

### Support Vector Machine (SVM)

In [None]:
clf_svm = SVC(kernel='linear', class_weight='balanced')
train_all(clf_svm)

----------------------------------------------------
Training Count Vectorizer


KeyboardInterrupt: 

### Multilayer Perceptron (MLP)

In [None]:
clf_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=100)
train_all(clf_mlp)

----------------------------------------------------
Training Count Vectorizer


