# Inicializando

## Carregando bibliotecas e funções de auxílio

In [134]:
from sklearn.base import BaseEstimator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import xml.etree.ElementTree as et


def parse_xml_to_df(xml_root):
    # Cria um dataframe vazio em que as linhas serão concatenadas.
    df = pd.DataFrame(columns=['similarity', 't', 'h'])
    # Para cada par na root.
    for pair in xml_root:
        # Recupera o valor de t.
        t = pair[0].text
        # Recupera o valor de h.
        h = pair[1].text
        # Recupera o valor da variável target.
        is_entailment = pair.attrib['entailment'] == 'Entailment'
        # Recupera o valor de similaridade atribuído.
        similarity = float(pair.attrib['similarity'])
        # Constroi a nova linha.
        new_line = pd.DataFrame([{'t': t, 'h': h, 'similarity': similarity, 'is_entailment': is_entailment}])
        # Adiciona ao dataframe.
        df = pd.concat([df, new_line], ignore_index=True)
    return df

    
class GenericEstimator(BaseEstimator):
    
    def fit(self): pass
    
    def score(self): pass

## Carregando conjuntos de dados

In [77]:
# Recupera o arquivo de entrada de treinamento.
train_xml_root = et.parse('../data/assin2-train.xml').getroot()
# Cria o dataframe de treinamento.
df_train = parse_xml_to_df(train_xml_root)

# Recupera o arquivo de entrada de validação.
dev_xml_root = et.parse('../data/assin2-dev.xml').getroot()
# Cria o dataframe de validação.
df_dev = parse_xml_to_df(dev_xml_root)

# Recupera o arquivo de entrada de validação.
test_xml_root = et.parse('../data/assin2-test.xml').getroot()
df_test = parse_xml_to_df(test_xml_root)

In [78]:
print('Shape de treinamento:', df_train.shape)
print('Shape de validação:', df_dev.shape)
print('Shape de teste:', df_test.shape)

Shape de treinamento: (6500, 4)
Shape de validação: (500, 4)
Shape de teste: (2448, 4)


## Transformação dos dados

In [93]:
# Criando uma coluna que concatena as duas frases em uma nas bases de dados.
df_train['t_h'] = df_train['t'] + ' ' + df_train['h']  
df_dev['t_h'] = df_dev['t'] + ' ' + df_dev['h']  
df_test['t_h'] = df_test['t'] + ' ' + df_test['h']  

# Modelagem

## Abordagem 1: concatenando ambas as frases em uma única frase

In [143]:
from sklearn.feature_extraction.text import CountVectorizer

# Instancia um objeto para realizar o bag of words.
vectorizer = CountVectorizer()
# Ajusta o BoW no conjunto de treinamento.
vectorizer.fit(df_train['t_h'])

# Vetorizando o conjunto de treinamento.
df_train_vec = pd.DataFrame(vectorizer.transform(df_train['t_h']).toarray(), columns=vectorizer.get_feature_names_out())
df_train_vec['is_entailment'] = df_train['is_entailment']

# Vetorizando o conjunto de validação.
df_dev_vec = pd.DataFrame(vectorizer.transform(df_dev['t_h']).toarray(), columns=vectorizer.get_feature_names_out())
df_dev_vec['is_entailment'] = df_dev['is_entailment']

# Vetorizando o conjunto de validação.
df_test_vec = pd.DataFrame(vectorizer.transform(df_test['t_h']).toarray(), columns=vectorizer.get_feature_names_out())
df_test_vec['is_entailment'] = df_test['is_entailment']

X_train, y_train = df_train_vec.drop(columns='is_entailment'), df_train_vec['is_entailment'].astype(int)
X_dev, y_dev = df_dev_vec.drop(columns='is_entailment'), df_dev_vec['is_entailment'].astype(int)
X_test, y_test = df_test_vec.drop(columns='is_entailment'), df_test_vec['is_entailment'].astype(int)

print('Shape de treinamento:', X_train.shape, y_train.shape)
print('Shape de validação:', X_dev.shape, y_dev.shape)
print('Shape de teste:', X_test.shape, y_test.shape)

Shape de treinamento: (6500, 2310) (6500,)
Shape de validação: (500, 2310) (500,)
Shape de teste: (2448, 2310) (2448,)


In [169]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

        
# Cria o pipeline de transformação.
pipe = Pipeline([('clf', GenericEstimator())])

# Define um espaço de busca com diferentes algoritmos.
search_space = [

    {'clf': [LogisticRegression()],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': np.logspace(0, 4, 10)},
    
    {'clf': [RandomForestClassifier()],
     'clf__n_estimators': [150],
     'clf__max_depth': [2, 20, 10]},

]
    
# Cria um objeto de busca em grid com semente setada para reprodutibilidade.    
grid = GridSearchCV(
    pipe,
    search_space, 
    n_jobs=10,
    cv=5,
    scoring='f1',
    verbose=10
)

# Ajusta no conjunto de treinamento.
grid.fit(X_train, y_train)

df_training_results = pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'std_test_score']]

print('-' * 60)
print('Resultados')
print('-' * 60)
print('Treinamento:')
print(classification_report(y_train, grid.predict(X_train)))
print('-' * 60)
print('Validação:')
print(classification_report(y_dev, grid.predict(X_dev)))
print('-' * 60)
print('Teste:')
print(classification_report(y_test, grid.predict(X_test)))
print('-' * 60)

Fitting 5 folds for each of 23 candidates, totalling 115 fits


50 fits failed out of a total of 115.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Windows\Documents\virtualenvs\phd-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Windows\Documents\virtualenvs\phd-env\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Windows\Documents\virtualenvs\phd-env\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^

------------------------------------------------------------
Resultados
------------------------------------------------------------
Treinamento:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      3250
           1       0.88      0.89      0.88      3250

    accuracy                           0.88      6500
   macro avg       0.88      0.88      0.88      6500
weighted avg       0.88      0.88      0.88      6500

------------------------------------------------------------
Validação:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       250
           1       0.81      0.79      0.80       250

    accuracy                           0.80       500
   macro avg       0.80      0.80      0.80       500
weighted avg       0.80      0.80      0.80       500

------------------------------------------------------------
Teste:
              precision    recall  f1-score   support

     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
