# EP1 de Processamento de Língua Natural - Grid Search

## Leitura e análise dos dados

In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


In [3]:
treino_dataset = pd.read_excel(io='../data/ep1_esic2023_clareza_TRAIN.xlsx')
treino_dataset

Unnamed: 0,resp_text,clarity
0,Prezado Sr Jose Taunai Em atenção ao seu pe...,c5
1,"""A pedido do Pró-Reitor de Graduação, informa...",c5
2,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
3,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234
4,"""Prezado Prof. Gilberto Tadeu Reis da Silva ...",c234
...,...,...
5995,Trata-se de solicitação com base na Lei de Ac...,c1
5996,Trata-se de uma solicitação repetida. As info...,c5
5997,Unidade:,c5
5998,Vale dizer que a gestão dos Telefones de Uso ...,c234


## Modelos

In [4]:
X = treino_dataset.resp_text
Y = treino_dataset.clarity

In [5]:
def print_results(grid_search):
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    mean_accuracy = cross_val_score(best_estimator, X, Y, scoring='accuracy', cv=10).mean()

    print("Best Parameters:", best_params)
    print("Best Mean Accuracy:", mean_accuracy)

### Regressão Logística

#### Count Vectorizer

In [8]:
pipeline_logistic_count = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid_logistic_count = {
    'vectorizer__max_features': [5000, None],
    'vectorizer__lowercase': [True, False],
    'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__class_weight': ['balanced', None],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear']
}

grid_search = GridSearchCV(pipeline_logistic_count, param_grid_logistic_count, cv=10, scoring='accuracy', n_jobs=3, verbose=10)

grid_search.fit(X, Y)

print_results(grid_search)

Fitting 10 folds for each of 224 candidates, totalling 2240 fits
[CV 2/10; 1/224] START clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=lbfgs, vectorizer__lowercase=True, vectorizer__max_features=5000
[CV 3/10; 1/224] START clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=lbfgs, vectorizer__lowercase=True, vectorizer__max_features=5000
[CV 1/10; 1/224] START clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=lbfgs, vectorizer__lowercase=True, vectorizer__max_features=5000
[CV 1/10; 1/224] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=lbfgs, vectorizer__lowercase=True, vectorizer__max_features=5000;, score=nan total time=   0.3s
[CV 3/10; 1/224] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, clf__solver=lbfgs, vectorizer__lowercase=True, vectorizer__max_features=5000;, score=nan total time=   0.3s
[CV 2/10; 1/224] END clf__C=0.001, clf__class_weight=balanced, clf__penalty=l1, cl

560 fits failed out of a total of 2240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
560 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", l

Best Parameters: {'clf__C': 0.01, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'vectorizer__lowercase': False, 'vectorizer__max_features': None}
Best Mean Accuracy: 0.4493333333333333


#### TF-IDF Vectorizer

In [None]:
pipeline_logistic_tfidf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid_logistic_tfidf = {
    'vectorizer__max_features': [5000, None],
    'vectorizer__lowercase': [True, False],
    'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__class_weight': ['balanced', None],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear']
}

grid_search_logistic_tfidf = GridSearchCV(pipeline_logistic_tfidf, param_grid_logistic_tfidf, cv=5, scoring='accuracy', n_jobs=1, verbose=10)

grid_search_logistic_tfidf.fit(X, Y)

print_results(grid_search_logistic_tfidf)

#### TF-IDF Vectorizer Bigramas

In [None]:
pipeline_logistic_tfidf_bigram = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(2,2))),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid_logistic_tfidf_bigram = {
    'vectorizer__max_features': [5000, None],
    'vectorizer__lowercase': [True, False],
    'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__class_weight': ['balanced', None],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear']
}

grid_search_logistic_tfidf_bigram = GridSearchCV(pipeline_logistic_tfidf_bigram, param_grid_logistic_tfidf_bigram, cv=5, scoring='accuracy', n_jobs=1, verbose=10)

grid_search_logistic_tfidf_bigram.fit(X, Y)

print_results(grid_search_logistic_tfidf_bigram)

#### TF-IDF Vectorizer 3-Char

In [None]:
pipeline_logistic_tfidf_3char = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(3,3))),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid_logistic_tfidf_3char = {
    'vectorizer__max_features': [5000, None],
    'vectorizer__lowercase': [True, False],
    'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__class_weight': ['balanced', None],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear']
}

grid_search_logistic_tfidf_3char = GridSearchCV(pipeline_logistic_tfidf_3char, param_grid_logistic_tfidf_3char, cv=5, scoring='accuracy', n_jobs=1, verbose=10)

grid_search_logistic_tfidf_3char.fit(X, Y)

print_results(grid_search_logistic_tfidf_3char)

#### TF-IDF Vectorizer 5-Char

In [None]:
pipeline_logistic_tfidf_5char = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(5,5))),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid_logistic_tfidf_5char = {
    'vectorizer__max_features': [5000, None],
    'vectorizer__lowercase': [True, False],
    'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__class_weight': ['balanced', None],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear']
}

grid_search_logistic_tfidf_5char = GridSearchCV(pipeline_logistic_tfidf_5char, param_grid_logistic_tfidf_5char, cv=5, scoring='accuracy', n_jobs=1, verbose=10)

grid_search_logistic_tfidf_5char.fit(X, Y)

print_results(grid_search_logistic_tfidf_5char)

# Rascunho

In [14]:
c_list = np.geomspace(0.01, 1.0, 20)

pipeline = Pipeline([
    ('vectorizer', CountVectorizer(max_features=None)),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid = {
    'clf__C': c_list,
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=4, verbose=10)

grid_search.fit(X, Y)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Calculate the mean accuracy using cross-validation
mean_accuracy = cross_val_score(best_estimator, X, Y, scoring='accuracy', cv=10).mean()

print("Best Parameters:", best_params)
print("Best Mean Accuracy:", mean_accuracy)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 3/10; 1/20] START clf__C=0.01...............................................
[CV 4/10; 1/20] START clf__C=0.01...............................................
[CV 2/10; 1/20] START clf__C=0.01...............................................
[CV 1/10; 1/20] START clf__C=0.01...............................................
[CV 4/10; 1/20] END ................clf__C=0.01;, score=0.453 total time=   2.0s
[CV 5/10; 1/20] START clf__C=0.01...............................................
[CV 2/10; 1/20] END ................clf__C=0.01;, score=0.518 total time=   2.1s
[CV 6/10; 1/20] START clf__C=0.01...............................................
[CV 3/10; 1/20] END ................clf__C=0.01;, score=0.427 total time=   2.1s
[CV 7/10; 1/20] START clf__C=0.01...............................................
[CV 1/10; 1/20] END ................clf__C=0.01;, score=0.450 total time=   2.3s
[CV 8/10; 1/20] START clf__C=0.01.............

In [11]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid = {
    'vectorizer__max_features': [5000, 6000, None],
    'clf__C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear', 'saga']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=4, verbose=10)

grid_search.fit(X, Y)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Calculate the mean accuracy using cross-validation
mean_accuracy = cross_val_score(best_estimator, X, Y, scoring='accuracy', cv=10).mean()

print("Best Parameters:", best_params)
print("Best Mean Accuracy:", mean_accuracy)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits
[CV 2/10; 1/180] START clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500
[CV 3/10; 1/180] START clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500
[CV 1/10; 1/180] START clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500
[CV 4/10; 1/180] START clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500
[CV 2/10; 1/180] END clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500;, score=nan total time=   0.3s
[CV 3/10; 1/180] END clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500;, score=nan total time=   0.3s
[CV 4/10; 1/180] END clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500;, score=nan total time=   0.3s
[CV 1/10; 1/180] END clf__C=0.01, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=500;, score=nan total time=   0.3s
[CV 5/10; 1

KeyboardInterrupt: 

In [15]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=9999))
])

param_grid = {
    'vectorizer__max_features': [5000, None],
    'vectorizer__lowercase': [True, False],
    'clf__C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
    'clf__class_weight': ['balanced', None],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['lbfgs', 'liblinear', 'saga']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=1, verbose=10)

grid_search.fit(X, Y)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Calculate the mean accuracy using cross-validation
mean_accuracy = cross_val_score(best_estimator, X, Y, scoring='accuracy', cv=10).mean()

print("Best Parameters:", best_params)
print("Best Mean Accuracy:", mean_accuracy)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
[CV 1/10; 1/60] START clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000
[CV 2/10; 1/60] START clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000
[CV 2/10; 1/60] END clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000;, score=nan total time=   0.3s
[CV 1/10; 1/60] END clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000;, score=nan total time=   0.3s
[CV 3/10; 1/60] START clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000
[CV 4/10; 1/60] START clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000
[CV 4/10; 1/60] END clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000;, score=nan total time=   0.3s
[CV 3/10; 1/60] END clf__C=0.001, clf__penalty=l1, clf__solver=lbfgs, vectorizer__max_features=5000;, score=nan total time=   0.3s
[CV 5

100 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", li

Best Parameters: {'clf__C': 0.05, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'vectorizer__max_features': None}
Best Mean Accuracy: 0.4505
