# Análise comparativa de resultados


## 1. Preparação dos dados

### 1.1 Configurações iniciais

In [None]:
import pandas as pd
import seaborn as sns

from pathlib import Path

%pip install -U scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    OneHotEncoder, StandardScaler, RobustScaler, OrdinalEncoder, LabelEncoder
)

### 1.2 Obtendo os dados

In [599]:
data_path = Path('../data/raw/data.csv')
dict_path = Path('../data/external/dicionario.csv')

In [600]:
# Ler conjunto de dados
df = (
    pd.read_csv(data_path)
)
# Ler dicionário de dados
df_dict = pd.read_csv(dict_path)
df_dict

Unnamed: 0,variavel,significado,tipo,subtipo,resposta
0,gender,Gênero do aluno,Qualitativa,Nominal,False
1,NationalITy,Nacionalidade do aluno,Qualitativa,Nominal,False
2,PlaceofBirth,Local de nascimento do aluno,Qualitativa,Nominal,False
3,StageID,Nível de escolaridade a que o aluno pertence,Qualitativa,Ordinal,False
4,GradeID,A série em que o aluno está matriculado,Qualitativa,Ordinal,False
5,SectionID,Sala de aula à qual o aluno pertence,Qualitativa,Nominal,False
6,Topic,Tópico do curso,Qualitativa,Nominal,False
7,Semester,Semestre do ano letivo,Qualitativa,Ordinal,False
8,Relation,Progenitor responsável pelo aluno,Qualitativa,Nominal,False
9,raisedhands,Número de vezes que o aluno levantou a mão,Quantitativa,Discreta,False


In [601]:
# Separar variáveis

variaveis_interesse = df_dict.query('resposta == False').variavel.to_list()
variavel_resposta = df_dict.query('resposta == True').variavel.to_list()


df_X = df[variaveis_interesse]
df_y = df[variavel_resposta]

# Separar variáveis de interesse por tipo
nominal_columns = df_dict.query('variavel in @variaveis_interesse and subtipo == "Nominal"').variavel.to_list()
continuos_columns = df_dict.query('variavel in @variaveis_interesse and subtipo == "Discreta"').variavel.to_list()

### 1.3 Tratando dados discrepantes

In [602]:
# criando pipelines para tratar os dados

nominal_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes ()
    ('encoding', OneHotEncoder(sparse=False, handle_unknown='ignore')), # Codificação de variáveis
    # Seleção de variáveis
     ('normalization', StandardScaler()) # Normalização de variáveis
])
continuous_preprocessor = Pipeline(steps=[
    # Tratamento de dados discrepantes
    ('missing', KNNImputer(n_neighbors=5)), # Tratamento de dados faltantes
    # Seleção de variáveis
    ('normalization', RobustScaler()) # Normalização
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuos', continuous_preprocessor, continuos_columns)
])

preprocessor

In [603]:
preprocessor.fit(df_X)

X = preprocessor.transform(df_X)

X

array([[-0.7574764 ,  0.7574764 , -0.1382327 , ..., -0.765625  ,
        -0.70454545, -0.38      ],
       [-0.7574764 ,  0.7574764 , -0.1382327 , ..., -0.703125  ,
        -0.68181818, -0.28      ],
       [-0.7574764 ,  0.7574764 , -0.1382327 , ..., -0.90625   ,
        -0.75      , -0.18      ],
       ...,
       [ 1.32017315, -1.32017315, -0.1382327 , ...,  0.140625  ,
        -0.18181818, -0.2       ],
       [ 1.32017315, -1.32017315, -0.1382327 , ..., -0.75      ,
        -0.43181818,  0.36      ],
       [ 1.32017315, -1.32017315, -0.1382327 , ..., -0.796875  ,
        -0.22727273,  0.46      ]])

In [604]:
y = df_y['Class'].replace({'L': 0, 'M': 1, 'H':2}).to_numpy().ravel()

y

array([1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 1, 1, 2, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 2, 2, 1, 0, 0, 1, 2, 0, 0, 0, 0, 1, 1, 0, 1, 2, 1, 0, 0,
       1, 2, 2, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 2, 0, 2, 0,
       0, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 1, 1, 2, 0, 0, 1, 0, 1, 2, 1, 1,
       2, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 2, 2, 2, 1, 2, 1, 0, 0, 1, 2, 0, 1, 2, 1, 1, 2, 2, 1, 2, 0,
       1, 2, 1, 1, 0, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 0, 0, 1, 0,
       2, 1, 2, 1, 2, 0, 2, 1, 0, 2, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 2,
       0, 1, 2, 2, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 2, 0, 0, 0, 1, 1, 2, 1,
       1, 1, 1, 2, 2, 1, 0, 0, 2, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 2, 2, 1,
       0, 1, 2, 1, 2, 1, 0, 1, 2, 0, 1, 0, 2, 2, 2, 1, 1, 0, 0, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1,
       2, 2, 1, 1, 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 0,

## 2. Escolha do modelo

### 2.1 Escolha do modelo

Iremos análisar quatro modelos, que serão testados utilizando um método de validação cruzada por holdout, os modelos que serão testados serão:

* Linear Regression (LR)
* K-Nearest-Neighbors (KNN)
* Random Forest (RF)
* Logistic Regression (NB)

#### Linear Regression 

In [605]:
# Dividir o dataset em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(
    X ,y, test_size = 0.3, random_state = 42)

In [606]:
model = LinearRegression()
model.fit(X_test,y_test)


In [607]:
y_test_hat = model.predict(X_test)
y_test_hat

array([ 0.9599762 ,  0.10049438,  1.58410645,  1.53533936,  1.87893677,
        0.42593384,  0.62542725,  0.20857239,  1.0287323 ,  1.21176147,
        1.4080658 ,  1.82693481,  1.49339294,  0.26939392,  1.14529419,
        1.67216492,  0.34005737,  1.06152344,  0.55361938,  0.79986572,
        1.85978699,  2.0843811 ,  0.54943848,  1.00253296,  0.20262146,
        0.27201843,  1.67826843,  1.09542847,  0.22499084,  0.24743652,
        1.5824585 ,  0.4415741 ,  0.56991577,  1.02192688,  1.4813385 ,
        1.16862488,  0.58770752, -0.02853394,  1.01950073,  1.23445129,
        1.05204773,  1.97494507,  0.42279053,  1.18006897,  1.754776  ,
        0.04299927,  2.04167175,  1.09645081,  1.55207825,  0.31588745,
        1.48565674,  0.87367249,  0.1461792 ,  1.84082031,  1.00721741,
        1.22305298,  0.40579224,  1.70645142,  0.53459167,  0.43995667,
        1.21722412,  0.11276245, -0.39060974,  0.98562622,  1.23435974,
        0.82000732, -0.0039978 ,  1.75271606,  0.04779053,  1.43

In [608]:
#generate a classification report
class_report = classification_report(y_test,y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.27      0.27        45
           1       0.48      0.47      0.47        68
           2       0.15      0.16      0.15        31

    accuracy                           0.34       144
   macro avg       0.30      0.30      0.30       144
weighted avg       0.34      0.34      0.34       144



### K-Nearest-Neighbors 

In [610]:
#Model Building KNN
X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size = 0.30, random_state=0
)

In [611]:
#model building
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [614]:
#generate a classification report
class_report = classification_report(y_test,y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.76      0.68        34
           1       0.61      0.58      0.59        71
           2       0.59      0.51      0.55        39

    accuracy                           0.60       144
   macro avg       0.60      0.62      0.61       144
weighted avg       0.60      0.60      0.60       144



### Random Forest

In [616]:
#Model Building 
X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size = 0.20, random_state=0
)

In [617]:
#initialize and train classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [618]:
#predict on the test data
y_pred = clf.predict(X_test)

In [621]:
#generate a classification report
class_report = classification_report(y_test,y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.74      0.79        23
           1       0.64      0.80      0.71        45
           2       0.70      0.50      0.58        28

    accuracy                           0.70        96
   macro avg       0.73      0.68      0.70        96
weighted avg       0.71      0.70      0.69        96



### Logistic Regression

In [622]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [623]:
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train,y_train)

y_pred = logistic_regression.predict(X_test)

In [625]:
#generate a classification report
class_report = classification_report(y_test,y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.76      0.68        34
           1       0.61      0.58      0.59        71
           2       0.59      0.51      0.55        39

    accuracy                           0.60       144
   macro avg       0.60      0.62      0.61       144
weighted avg       0.60      0.60      0.60       144



## Resultados

Por meio das métricas análisadas (precision, recall, f1-score), foi possível observar que o **Random Forest** obteve maior número de predições corretas, sendo, nesta análise comparativa, o modelo mais adequado. Enquanto o **Linear Regression** obteve o menor número de predições corretas, sendo o modelo menos indicato para nossas predições.