In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE, SMOTENC

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [29]:
#Celda para importar datos

main_df = pd.read_csv('dropout_students_data.csv', sep=';')
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [30]:
#Celda para funcion de preprocesamiento
def preprocess_data(dataf):
    
    #Definimos listas con atributos para hot y para normalizar
    attributes_hot = ['Marital status', 'Application mode','Application order', 'Previous qualification',
                  'Nacionality',"Mother's qualification","Father's qualification","Mother's occupation",
                  "Father's occupation"]
    
    attributes_norm = ["Previous qualification (grade)","Curricular units 1st sem (credited)",
                    "Curricular units 1st sem (enrolled)", "Curricular units 1st sem (evaluations)",
                    "Curricular units 1st sem (approved)", "Curricular units 1st sem (grade)",
                    "Curricular units 1st sem (without evaluations)", "Curricular units 2nd sem (credited)",
                    "Curricular units 2nd sem (enrolled)", "Curricular units 2nd sem (evaluations)", 
                    "Curricular units 2nd sem (approved)", "Curricular units 2nd sem (grade)",
                    "Curricular units 2nd sem (without evaluations)","Admission grade", "Age at enrollment"]
     
    def normalize_cols(data, feature):

        data_backup = data.copy()

        for feature_item in feature:
            max_val = data[feature_item].max()
            min_val = data[feature_item].min()
            data_backup[feature_item] = (data[feature_item] - min_val) / (max_val - min_val)

        return data_backup
    
    #Eliminamos atributos innecesarios
    dataf.drop(columns=['Unemployment rate','Inflation rate','Course','GDP'],inplace=True)
    
    #Hacemos hot sobre los atributos categoricos
    hot_df = pd.get_dummies(data=dataf, columns=attributes_hot)
    
    #Normalizamos las columnas que nos interesan
    norm_df = normalize_cols(hot_df, attributes_norm)
    
    #------------------------------------------------------
    # Sección para muestreo sobre el dataframe
    class_0, class_1, class_2 = norm_df['Target'].value_counts() #número de clases del target

    df_class_0 = norm_df[norm_df['Target']=='Dropout'] #Separamos el dataframe en clases
    df_class_1 = norm_df[norm_df['Target']=='Enrolled']
    df_class_2 = norm_df[norm_df['Target']=='Graduate']
    
    #Sub-muestreo de la clase más grande
    df_class_2_under = df_class_2.sample(class_1)
    data_under_df = pd.concat([df_class_0,df_class_1,df_class_2_under],axis=0)
    
    X_pre = data_under_df.drop(columns=['Target'])
    y_pre = data_under_df['Target']
    """
    #Sobre-muestreo SMOTE para la menor clase
    sm = SMOTE(random_state = 12)
    X, y = sm.fit_resample(X_pre, y_pre)
    
    #Sobre-muestreo naive-random
    ros = RandomOverSampler(random_state=12)
    X_ros, y_ros = ros.fit_resample(X_pre, y_pre)
    
    #Sobre-muestro SMONTEC
    smote_nc = SMOTENC(categorical_features=["Dropout", "Enrolled","Graduate"], random_state=12)
    X_smoc, y_smoc = smote_nc.fit_resample(X_pre, y_pre)
    
    #Sobre-muestreo BorderlineSMOTE
    blsmote = BorderlineSMOTE()
    X_bls, y_bls = blsmote.fit_resample(X_pre, y_pre)
    """
    #aplicamos encoding del Target
    label_deco = LabelEncoder()
    y_deco = label_deco.fit_transform(y_pre)
    y_deco = pd.DataFrame(y_deco)
    
    #Sobre-muestreo ADASYN
    ada = ADASYN()
    X_ada, y_ada = ada.fit_resample(X_pre, y_deco)
    """
    #Sobre-muestreo SMOTE para la menor clase
    sm = SMOTE(random_state = 12)
    X_sm, y_sm = sm.fit_resample(X_pre, y_deco)
    """
    return X_ada, y_ada, label_deco

In [31]:
X, y, label_deco = preprocess_data(main_df)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [33]:
gb_model = GaussianNB()
gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_test)

a0 = label_deco.inverse_transform(y_test)
a1 = label_deco.inverse_transform(y_pred)

print(classification_report(a0,a1))

              precision    recall  f1-score   support

     Dropout       0.73      0.13      0.23       419
    Enrolled       0.38      0.97      0.54       436
    Graduate       0.44      0.08      0.13       424

    accuracy                           0.40      1279
   macro avg       0.51      0.39      0.30      1279
weighted avg       0.51      0.40      0.30      1279



  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [27]:
gb_model = GaussianNB()
gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_test)

a0 = label_deco.inverse_transform(y_test)
a1 = label_deco.inverse_transform(y_pred)

print(classification_report(a0,a1))

              precision    recall  f1-score   support

     Dropout       0.75      0.18      0.29       414
    Enrolled       0.24      0.96      0.38       237
    Graduate       0.60      0.04      0.08       440

    accuracy                           0.29      1091
   macro avg       0.53      0.39      0.25      1091
weighted avg       0.58      0.29      0.22      1091



  return f(*args, **kwargs)
  return f(*args, **kwargs)
