# EP1
## Classificação da clareza das respostas na plataforma eSIC

ACH2118 - Introdução ao Processamento de Língua Natural

Professor Ivandré Paraboni

Integrantes:
* Luiza Borghi de Mello - 11796037
* Raphael Nobuaki Iwamoto - 11882986


### Base

In [None]:
# Install the third-party packages imported by this notebook
# (pandas, numpy, spacy, nltk, scikit-learn).
%pip install pandas
%pip install numpy
%pip install spacy
%pip install nltk
%pip install scikit-learn

# Download the 'pt_core_news_lg' spaCy model that is loaded later with
# spacy.load. The same download is issued through both the `python` and the
# `python3` launcher names.
!python -m spacy download pt_core_news_lg
!python3 -m spacy download pt_core_news_lg

In [None]:
import pandas as pd
import numpy as np
import spacy
import nltk
import time
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [None]:
# spaCy pipeline loaded once at cell level and reused by aplica_lematizacao.
nlp = spacy.load('pt_core_news_lg')

def aplica_lematizacao(df: pd.DataFrame, coluna: str) -> None:
    """Add a ``<coluna>_lematizada`` column with the lemmatized text.

    Every value of ``df[coluna]`` is processed by the module-level spaCy
    pipeline ``nlp``. Punctuation and whitespace tokens are discarded and the
    lemmas of the remaining tokens are joined with single spaces.

    Args:
        df: DataFrame to modify in place.
        coluna: Name of the text column to lemmatize.
    """
    # Build the whole column and assign it in a single step. Writing row by
    # row through ``df[col].iloc[i] = ...`` is chained assignment: it raises
    # SettingWithCopyWarning and does not write to ``df`` under pandas
    # copy-on-write.
    df[coluna + '_lematizada'] = [
        " ".join(
            token.lemma_
            for token in nlp(text)
            if not token.is_punct and not token.is_space
        )
        for text in df[coluna]
    ]

In [None]:
# Fetch the 'rslp' resource needed by nltk.stem.RSLPStemmer.
nltk.download('rslp')

def aplica_stemming(df: pd.DataFrame, coluna: str):
    """Add a ``<coluna>_stemming`` column with every word reduced to its stem.

    Each text in ``df[coluna]`` is split on whitespace, each piece is stemmed
    with NLTK's RSLPStemmer, and the stems are joined with single spaces.
    The DataFrame is modified in place.
    """
    rslp = nltk.stem.RSLPStemmer()

    def _stem_text(texto):
        # Stem word by word, then rebuild the sentence.
        palavras = texto.split()
        return " ".join(rslp.stem(palavra) for palavra in palavras)

    df[coluna + '_stemming'] = df[coluna].apply(_stem_text)

In [None]:
# Fetch the 'stopwords' corpus read by remove_stopwords.
nltk.download('stopwords')

def remove_stopwords(df: pd.DataFrame, coluna: str) -> None:
    """Add a ``<coluna>_stopwords`` column with Portuguese stopwords removed.

    Each text in ``df[coluna]`` is split on whitespace; words found in NLTK's
    'portuguese' stopword list are dropped and the rest are joined with
    single spaces. The comparison is exact (no case folding is applied).

    Args:
        df: DataFrame to modify in place.
        coluna: Name of the text column to filter.
    """
    # A frozenset gives O(1) membership tests; the original list was scanned
    # linearly for every single word of every document.
    stop = frozenset(nltk.corpus.stopwords.words('portuguese'))
    df[coluna + '_stopwords'] = df[coluna].apply(
        lambda texto: " ".join(palavra for palavra in texto.split() if palavra not in stop)
    )

### Leitura e análise do dataset

In [None]:
# Read the training spreadsheet into a DataFrame; the bare expression on the
# last line displays it as the cell output.
treino_dataset = pd.read_excel('data/ep1_esic2023_clareza_TRAIN.xlsx')
treino_dataset

In [None]:
# Show pandas' descriptive statistics for the training DataFrame.
treino_dataset.describe()

In [None]:
# Count the rows per distinct value of the 'clarity' column (the column used
# as the target Y in the grid-search cells) and display the result.
counts = treino_dataset['clarity'].value_counts()
counts

In [None]:
# Derive the pre-processed variants of 'resp_text'. Each helper adds one new
# column to treino_dataset in place: 'resp_text_lematizada',
# 'resp_text_stemming' and 'resp_text_stopwords' respectively.
aplica_lematizacao(treino_dataset, 'resp_text')
aplica_stemming(treino_dataset, 'resp_text')
remove_stopwords(treino_dataset, 'resp_text')

# Display the DataFrame with the new columns.
treino_dataset

### Grid Search

#### 1. Logistic Regression | sem pré-processamento | Count Vectorizer

In [None]:
# Exhaustive search over CountVectorizer and LogisticRegression settings on
# the raw response text, scored by mean 10-fold cross-validated accuracy.
# NOTE(review): the vectorizer is fitted on the full dataset before
# cross-validation, so its vocabulary also covers the validation folds.
X = treino_dataset.resp_text
Y = treino_dataset.clarity

lowercase_list = [True, False]
analyzer_list = ['word', 'char']
ngram_range_list = [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
C_list = [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
class_weight_list = ['balanced', None]
penalty_list = ['l1', 'l2']
solver_list = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

best_accuracy = 0
best_hyperparameters = {}

# Outer loop: vectorizer settings. The feature matrix depends only on these,
# so it is built once here instead of once per classifier combination.
for lowercase, analyzer, ngram_range in itertools.product(lowercase_list, analyzer_list, ngram_range_list):
    vectorizer = CountVectorizer(analyzer=analyzer, lowercase=lowercase, ngram_range=ngram_range)
    X_count = vectorizer.fit_transform(X)

    # Inner loop: classifier settings (same overall order as a single product).
    for C, class_weight, penalty, solver in itertools.product(C_list, class_weight_list, penalty_list, solver_list):
        clf = LogisticRegression(max_iter=9999, C=C, class_weight=class_weight, penalty=penalty, solver=solver, random_state=100)

        try:
            # error_score='raise' makes failing fits (for example a
            # solver/penalty pair the solver does not support) raise here.
            accuracy = cross_val_score(clf, X_count, Y, scoring='accuracy', cv=10, error_score='raise').mean()
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt can still stop the search; report the reason.
            print(f'Combination skipped ({type(err).__name__}: {err})\n')
            continue

        print(f"Hyperparameters: lowercase={lowercase}, analyzer={analyzer}, ngram_range={ngram_range}, C={C}, class_weight={class_weight}, penalty={penalty}, solver={solver}")
        print(f"Mean Accuracy: {accuracy}\n")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparameters = {
                'lowercase': lowercase,
                'analyzer': analyzer,
                'ngram_range': ngram_range,
                'C': C,
                'class_weight': class_weight,
                'penalty': penalty,
                'solver': solver
            }

print("Best Hyperparameters:", best_hyperparameters)
print("Best Mean Accuracy:", best_accuracy)

#### 2. Logistic Regression | Lematização | TF-IDF Vectorizer

In [None]:
# Exhaustive search over TfidfVectorizer and LogisticRegression settings on
# the lemmatized response text, scored by mean 10-fold cross-validated
# accuracy.
# NOTE(review): the vectorizer is fitted on the full dataset before
# cross-validation, so its vocabulary and IDF weights also cover the
# validation folds.
X = treino_dataset.resp_text_lematizada
Y = treino_dataset.clarity

lowercase_list = [True, False]
analyzer_list = ['word', 'char']
ngram_range_list = [(2, 2), (3, 3), (4, 4), (5, 5)]
C_list = np.geomspace(3.0, 5.0, num=20)
class_weight_list = ['balanced', None]
penalty_list = ['l1', 'l2']
solver_list = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

best_accuracy = 0
best_hyperparameters = {}

# Outer loop: vectorizer settings. The feature matrix depends only on these,
# so it is built once here instead of once per classifier combination.
for lowercase, analyzer, ngram_range in itertools.product(lowercase_list, analyzer_list, ngram_range_list):
    vectorizer = TfidfVectorizer(analyzer=analyzer, lowercase=lowercase, ngram_range=ngram_range)
    X_tfidf = vectorizer.fit_transform(X)

    # Inner loop: classifier settings (same overall order as a single product).
    for C, class_weight, penalty, solver in itertools.product(C_list, class_weight_list, penalty_list, solver_list):
        # The classifier is LogisticRegression; the previous code instantiated
        # TfidfVectorizer with classifier keyword arguments, which raised
        # TypeError outside the try block on the first iteration.
        clf = LogisticRegression(max_iter=9999, C=C, class_weight=class_weight, penalty=penalty, solver=solver, random_state=100)

        try:
            # error_score='raise' makes failing fits (for example a
            # solver/penalty pair the solver does not support) raise here.
            accuracy = cross_val_score(clf, X_tfidf, Y, scoring='accuracy', cv=10, error_score='raise').mean()
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt can still stop the search; report the reason.
            print(f'Combination skipped ({type(err).__name__}: {err})\n')
            continue

        print(f"Hyperparameters: lowercase={lowercase}, analyzer={analyzer}, ngram_range={ngram_range}, C={C}, class_weight={class_weight}, penalty={penalty}, solver={solver}")
        print(f"Mean Accuracy: {accuracy}\n")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparameters = {
                'lowercase': lowercase,
                'analyzer': analyzer,
                'ngram_range': ngram_range,
                'C': C,
                'class_weight': class_weight,
                'penalty': penalty,
                'solver': solver
            }

print("Best Hyperparameters:", best_hyperparameters)
print("Best Mean Accuracy:", best_accuracy)

#### 3. Naive Bayes | sem pré-processamento | Count Vectorizer

In [None]:
# Search over CountVectorizer settings and the MultinomialNB smoothing
# parameter alpha on the raw response text, scored by mean 10-fold
# cross-validated accuracy.
# NOTE(review): the vectorizer is fitted on the full dataset before
# cross-validation, so its vocabulary also covers the validation folds.
X = treino_dataset.resp_text
Y = treino_dataset.clarity

analyzer_list = ['word', 'char']
ngram_range_list = [(1, 1), (2, 2), (3, 3), (5, 5)]
alpha_list = np.geomspace(1e-3, 1, num=50)

best_accuracy = 0
best_hyperparameters = {}

# The feature matrix does not depend on alpha, so it is built once per
# vectorizer configuration instead of once per alpha value.
for analyzer, ngram_range in itertools.product(analyzer_list, ngram_range_list):
    vectorizer = CountVectorizer(ngram_range=ngram_range, analyzer=analyzer)
    X_count = vectorizer.fit_transform(X)

    for alpha in alpha_list:
        clf = MultinomialNB(alpha=alpha)

        accuracy = cross_val_score(clf, X_count, Y, scoring='accuracy', cv=10).mean()

        print(f"Hyperparameters: ngram_range={ngram_range}, analyzer={analyzer}, alpha={alpha}")
        print(f"Mean Accuracy: {accuracy}\n")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparameters = {
                'ngram_range': ngram_range,
                'analyzer': analyzer,
                'alpha': alpha
            }

print("Best Hyperparameters:", best_hyperparameters)
print("Best Mean Accuracy:", best_accuracy)

#### 4. Naive Bayes | Lematização | TF-IDF Vectorizer

In [None]:
# Search over TfidfVectorizer settings and the MultinomialNB smoothing
# parameter alpha on the lemmatized response text, scored by mean 10-fold
# cross-validated accuracy.
# NOTE(review): the vectorizer is fitted on the full dataset before
# cross-validation, so its vocabulary and IDF weights also cover the
# validation folds.
X = treino_dataset.resp_text_lematizada
Y = treino_dataset.clarity

analyzer_list = ['word', 'char']
ngram_range_list = [(1, 1), (2, 2), (3, 3), (5, 5)]
alpha_list = np.geomspace(1e-3, 1000, num=100)

best_accuracy = 0
best_hyperparameters = {}

# The feature matrix does not depend on alpha, so it is built once per
# vectorizer configuration instead of once per alpha value.
for analyzer, ngram_range in itertools.product(analyzer_list, ngram_range_list):
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, analyzer=analyzer)
    X_tfidf = vectorizer.fit_transform(X)

    for alpha in alpha_list:
        clf = MultinomialNB(alpha=alpha)

        accuracy = cross_val_score(clf, X_tfidf, Y, scoring='accuracy', cv=10).mean()

        print(f"Hyperparameters: ngram_range={ngram_range}, analyzer={analyzer}, alpha={alpha}")
        print(f"Mean Accuracy: {accuracy}\n")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparameters = {
                'ngram_range': ngram_range,
                'analyzer': analyzer,
                'alpha': alpha
            }

print("Best Hyperparameters:", best_hyperparameters)
print("Best Mean Accuracy:", best_accuracy)

#### 5. Random Forest | Lematização | Count Vectorizer

In [None]:
# Single Random Forest run on word-unigram counts of the lemmatized text,
# scored by mean 10-fold cross-validated accuracy.
X = treino_dataset.resp_text_lematizada
Y = treino_dataset.clarity

analyzer = 'word'
ngram_range = (1, 1)

vectorizer = CountVectorizer(ngram_range=ngram_range, analyzer=analyzer)
X_count = vectorizer.fit_transform(X)

clf = RandomForestClassifier(random_state=100)

# Score on the vectorized matrix. The previous code passed the raw text
# column X, which the classifier cannot be fitted on.
accuracy = cross_val_score(clf, X_count, Y, scoring='accuracy', cv=10, n_jobs=3).mean()

print(f"Hyperparameters: ngram_range={ngram_range}, analyzer={analyzer}")
print(f"Mean Accuracy: {accuracy}\n")

#### 6. Random Forest | Lematização | TF-IDF Vectorizer

In [None]:
# Search over TfidfVectorizer settings for a Random Forest on the lemmatized
# text, scored by mean 10-fold cross-validated accuracy.
X = treino_dataset.resp_text_lematizada
Y = treino_dataset.clarity

analyzer_list = ['word', 'char']
ngram_range_list = [(1, 1), (2, 2)]

best_accuracy = 0
best_hyperparameters = {}

for analyzer, ngram_range in itertools.product(analyzer_list, ngram_range_list):
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, analyzer=analyzer)
    X_tfidf = vectorizer.fit_transform(X)

    clf = RandomForestClassifier(random_state=100)

    # Score on the TF-IDF matrix built above. The previous code passed the
    # raw text column X, so the vectorizer settings never influenced the
    # score.
    accuracy = cross_val_score(clf, X_tfidf, Y, scoring='accuracy', cv=10).mean()

    print(f"Hyperparameters: ngram_range={ngram_range}, analyzer={analyzer}")
    print(f"Mean Accuracy: {accuracy}\n")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_hyperparameters = {
            'ngram_range': ngram_range,
            'analyzer': analyzer,
        }

print("Best Hyperparameters:", best_hyperparameters)
print("Best Mean Accuracy:", best_accuracy)

#### 7. SVC | Lematização | Count Vectorizer

#### 8. SVC | Lematização | TF-IDF Vectorizer

#### 9. SGD Classifier | Lematização | TF-IDF Vectorizer