# EP1 de Processamento de Língua Natural

## Leitura e análise dos dados

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
treino_dataset = pd.read_excel(io='ep1_esic2023_clareza_TRAIN.xlsx')
treino_dataset

Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai Em atenção ao seu pe...,c5
1,"""A pedido do Pró-Reitor de Graduação, informa...",c5
2,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
3,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
4,"""Prezado Prof. Gilberto Tadeu Reis da Silva ...",c234
...,...,...
5995,Trata-se de solicitação com base na Lei de Ac...,c1
5996,Trata-se de uma solicitação repetida. As info...,c5
5997,Unidade:,c5
5998,Vale dizer que a gestão dos Telefones de Uso ...,c234


In [14]:
treino_dataset.describe()

Unnamed: 0,resp_text,clarity
count,6000,6000
unique,5626,3
top,"Prezado(a) Senhor(a), Sua manifestação foi a...",c5
freq,41,2000


In [15]:
counts = treino_dataset['clarity'].value_counts()

## Vectorizer

In [16]:
vect = CountVectorizer()
X = vect.fit_transform(treino_dataset.resp_text)
print(X.shape)

(6000, 28710)


In [17]:
Y = treino_dataset.clarity
print(Y.shape)

(6000,)


## Tfidf Vectorizer

In [18]:
tfidf_vect = TfidfVectorizer()
tfidf_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_Y = treino_dataset.clarity
print(tfidf_X.shape)
print(tfidf_Y.shape)

(6000, 28710)
(6000,)


## Tfidf Vectorizer Bigramas

In [22]:
tfidf_vect = TfidfVectorizer(ngram_range=(2,2))
tfidf_bigram_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_bigram_Y = treino_dataset.clarity
print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202965)
(6000,)


## Tfidf Vectorizer 3-Char

In [34]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
tfidf_3_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_3_char_Y = treino_dataset.clarity
print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

[' ! ' ' " ' ' ""' ... 'üll' 'ütt' 'üên']
(6000, 202965)
(6000,)


## Tfidf Vectorizer 5-Char

In [30]:
tfidf_vect = TfidfVectorizer(analyzer='char', ngram_range=(5,5))
tfidf_5_char_X = tfidf_vect.fit_transform(treino_dataset.resp_text)
tfidf_5_char_Y = treino_dataset.clarity
print(tfidf_bigram_X.shape)
print(tfidf_bigram_Y.shape)

(6000, 202965)
(6000,)


## Modelos

### Dummy

In [19]:
clf_dummy = DummyClassifier(strategy='most_frequent', random_state = 100,constant = None).fit(X, Y)
result_dummy = clf_dummy.predict(X)
accuracy_dummy = accuracy_score(Y, result_dummy)
accuracy_dummy

0.3333333333333333

### Regressão Logística

#### Vectorizer

In [20]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
result = cross_val_score(clf_logistic, X, Y, scoring='accuracy', cv=10).mean()
result

0.43600000000000005

#### Tf-Idf Vectorizer

In [21]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
result = cross_val_score(clf_logistic, tfidf_X, tfidf_Y, scoring='accuracy', cv=10).mean()
result

0.4608333333333333

#### Tf-Idf Vectorizer Bigramas

In [35]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
result = cross_val_score(clf_logistic, tfidf_bigram_X, tfidf_bigram_Y, scoring='accuracy', cv=10).mean()
result

KeyboardInterrupt: 

#### Tf-Idf Vectorizer 3 char

In [None]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
result = cross_val_score(clf_logistic, tfidf_3_char_X, tfidf_3_char_Y, scoring='accuracy', cv=10).mean()
result

KeyboardInterrupt: 

#### Tf-Idf Vectorizer 5 char

In [None]:
clf_logistic = LogisticRegression(class_weight='balanced', max_iter=1000)
result = cross_val_score(clf_logistic, tfidf_5_char_X, tfidf_5_char_Y, scoring='accuracy', cv=10).mean()
result

KeyboardInterrupt: 