In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('lpor_explorer.csv')

# One-hot encode columns in columns with 2 categories
df = pd.get_dummies(df, columns=['Escola', 'Sexo', 'TP_Moradia', 'Tamanho_Familia', 'Situacao_Pais', 'Apoio_Educacao_Extra', 'Apoio_Educacao_Pais', 'Aulas_Particulares', 'Tem_Internet', 'Quer_Fazer_Graduacao', 'Frequentou_Creche', 'Ativ_Extracurricular', 'Esta_Namorando'])
# One-hot encode columns in columns with more than 2 categories
df = pd.get_dummies(df, columns=['Motivo_Escolha_Escolar', 'Responsavel_Legal', 'Trabalho_Mae', 'Trabalho_Pai'])
# Drop columns: 'Frequentou_Creche_Não', 'Escola_Mousinho da Silveira' and 11 other columns
df = df.drop(columns=['Trabalho_Mae_other', 'Trabalho_Pai_other', 'Responsavel_Legal_Outro', 'Motivo_Escolha_Escolar_Outro', 'Frequentou_Creche_Não', 'Escola_Mousinho da Silveira', 'Sexo_Mulher', 'TP_Moradia_Rural', 'Tamanho_Familia_Até 3', 'Situacao_Pais_Morando Juntos', 'Apoio_Educacao_Extra_Não', 'Apoio_Educacao_Pais_Não', 'Aulas_Particulares_Não', 'Tem_Internet_Não', 'Quer_Fazer_Graduacao_Não', 'Ativ_Extracurricular_Não', 'Esta_Namorando_Não'])
df.columns = df.columns.str.replace('_Sim', '')    

# Put numbers instead categories in a column
def encoding(column, categories):
    for i, category in enumerate(categories):
        column = column.replace(category, i)
    df[column.name] = column

encoding(df.Estado_Saude, ['Muito Ruim', 'Ruim', 'Razoavel', 'Bom', 'Muito Bom'])
encoding(df.Boa_Convivencia_Familia, ['Muito Ruim', 'Ruim', 'Razoavel', 'Bom', 'Excelente']) 
encoding(df.Tempo_Estudo_Semanal, ['Até 2h,', '2 a 5h', '5 a 10h', 'Mais de 10h'])
encoding(df.Tempo_ida_Escola, ['Até 15 min', '15 a 30 min', '30 min a 1h', 'Mais de 1h'])
encoding(df.Educacao_Pai, ['Nenhuma', 'Ensino Fundamental 1', 'Ensino Fundamental 2', 'Ensino Médio', 'Ensino Superior'])
encoding(df.Educacao_Mae, ['Nenhuma', 'Ensino Fundamental 1', 'Ensino Fundamental 2', 'Ensino Médio', 'Ensino Superior'])
cat = ['Muito Baixo', 'Baixo', 'Moderado', 'Alto', 'Muito Alto']
encoding(df.Tempo_Livre_Apos_Escola, cat)
encoding(df.Tempo_com_Amigos, cat)
encoding(df.Alcool_Dia_Util, cat)
encoding(df.Alcool_Fim_Semana, cat)

# Replace both grades with the average of the two semesters
df['Media'] = df. Nota_1Semestre + df.Nota_2Semestre / 2
df = df.drop(columns=['Nota_1Semestre', 'Nota_2Semestre']) 

###############################################################################
target = ['Boa_Convivencia_Familia']

# Split features and target
X = df.drop(columns=target)
y = df[target].values.ravel()

# Normalization of DF
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

###############################################################################
def ML(model, modelName):
    # Model train
    model.fit(X_train, y_train)

    # Model evaluation
    print(modelName)
    print(f'Train: {model.score(X_train, y_train):.2%}')
    print(f'Test: {model.score(X_test, y_test):.2%}')
    print('')

# Caso Boa_Convivencia_Familia

In [2]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

ML(SVC(), 'SVC')
ML(RandomForestClassifier(), 'RandomForestClassifier')
ML(KNeighborsClassifier(), 'KNeighborsClassifier')
ML(XGBClassifier(), 'XGBClassifier')
ML(DecisionTreeClassifier(), 'DecisionTreeClassifier')
ML(AdaBoostClassifier(), 'AdaBoostClassifier')
ML(GaussianNB(), 'GaussianNB')

SVC
Train: 53.56%
Test: 48.46%

RandomForestClassifier
Train: 100.00%
Test: 45.38%

KNeighborsClassifier
Train: 58.19%
Test: 43.08%

XGBClassifier
Train: 100.00%
Test: 41.54%

DecisionTreeClassifier
Train: 100.00%
Test: 32.31%

AdaBoostClassifier
Train: 46.63%
Test: 32.31%

GaussianNB
Train: 20.62%
Test: 15.38%



In [3]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

ML(LogisticRegression(), 'LogisticRegression')
ML(LGBMClassifier(), 'LGBMClassifier')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
Train: 56.26%
Test: 40.00%

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 156
[LightGBM] [Info] Number of data points in the train set: 519, number of used features: 37
[LightGBM] [Info] Start training from score -3.307465
[LightGBM] [Info] Start training from score -3.160861
[LightGBM] [Info] Start training from score -1.895195
[LightGBM] [Info] Start training from score -0.710640
[LightGBM] [Info] Start training from score -1.275170
LGBMClassifier
Train: 100.00%
Test: 40.00%



In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score


# Criar uma rede neural simples com TensorFlow/Keras
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Treinar o modelo
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1fc15d33f50>