# 1. Preparación de los datos


In [1]:
import pandas as pd


In [2]:
# Load the raw student-performance data. The CSV is expected under datasets/
# relative to the notebook's working directory.
df = pd.read_csv('datasets/StudentsPerformance.csv')
# Row count first, then the first five rows transposed so every column is visible.
print(df.shape[0])
df.head().transpose()

1000


Unnamed: 0,0,1,2,3,4
gender,female,female,female,male,male
race/ethnicity,group B,group C,group B,group A,group C
parental level of education,bachelor's degree,some college,master's degree,associate's degree,some college
lunch,standard,standard,standard,free/reduced,standard
test preparation course,none,completed,none,none,none
math score,72,69,90,47,76
reading score,72,90,95,57,78
writing score,74,88,93,44,75


In [3]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [4]:
# Normalise text so it is easy to work with: lower-case it and turn spaces,
# slashes and apostrophes into underscores.
def replacer(accessor):
    """Return a lower-cased, underscore-separated copy of the accessor's strings.

    Parameters
    ----------
    accessor
        The ``.str`` accessor of a pandas Series or Index
        (for example ``df.columns.str`` or ``df[col].str``).

    Returns
    -------
    The Series/Index obtained by lower-casing every element and replacing each
    ``' '``, ``'/'`` and ``"'"`` with ``'_'``.
    """
    cleaned = accessor.lower()
    # regex=False: the patterns are literal characters, so the result must not
    # depend on the pandas version's default for the `regex` argument.
    for char in (' ', '/', "'"):
        cleaned = cleaned.str.replace(char, '_', regex=False)
    return cleaned
# Normalise the column labels first...
df.columns = replacer(df.columns.str)
# ...then do the same for the values inside every text (object-dtype) column,
# not just for the column titles.
text_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for name in text_columns:
    df[name] = replacer(df[name].str)
df.head().T


Unnamed: 0,0,1,2,3,4
gender,female,female,female,male,male
race_ethnicity,group_b,group_c,group_b,group_a,group_c
parental_level_of_education,bachelor_s_degree,some_college,master_s_degree,associate_s_degree,some_college
lunch,standard,standard,standard,free_reduced,standard
test_preparation_course,none,completed,none,none,none
math_score,72,69,90,47,76
reading_score,72,90,95,57,78
writing_score,74,88,93,44,75


In [5]:
# Build the binary target `passed` from the mean of the three exam scores.
# NOTE(review): the original comment said the pass mark was 50, but the code has
# always compared against 80 (which is what produces the ~0.2 pass rate reported
# further down). The code's value is kept here; confirm which one is intended.
PASS_THRESHOLD = 80
score_columns = ['math_score', 'reading_score', 'writing_score']

# Treat a missing score as 0. The previous version called fillna(0) without
# storing the result, so missing values were never actually replaced.
scores = df[score_columns].fillna(0)
df['passed'] = (scores.sum(axis=1) / len(score_columns)) >= PASS_THRESHOLD

# The raw scores define the target, so drop them: they are not used as features.
df = df.drop(columns=score_columns)
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
gender,female,female,female,male,male,female,female,male,male,female,...,male,female,female,female,male,female,male,female,female,female
race_ethnicity,group_b,group_c,group_b,group_a,group_c,group_b,group_b,group_b,group_d,group_b,...,group_e,group_b,group_d,group_d,group_a,group_e,group_c,group_c,group_d,group_d
parental_level_of_education,bachelor_s_degree,some_college,master_s_degree,associate_s_degree,some_college,associate_s_degree,some_college,some_college,high_school,high_school,...,high_school,some_high_school,associate_s_degree,bachelor_s_degree,high_school,master_s_degree,high_school,high_school,some_college,some_college
lunch,standard,standard,standard,free_reduced,standard,standard,standard,free_reduced,free_reduced,free_reduced,...,free_reduced,standard,free_reduced,free_reduced,standard,standard,free_reduced,free_reduced,standard,free_reduced
test_preparation_course,none,completed,none,none,none,none,completed,none,completed,none,...,completed,completed,none,none,none,completed,none,completed,completed,none
passed,False,True,True,False,False,False,True,False,False,False,...,True,False,False,False,False,True,False,False,False,True


In [6]:
# Re-apply the column-label normalisation (lower-case, separators -> underscores).
# NOTE(review): the labels were already normalised when replacer was first applied
# to df.columns, and the only column added since ('passed') is already lower-case,
# so this looks like a no-op — confirm and consider removing this cell.
df.columns = replacer(df.columns.str)

In [7]:
# Cast the boolean target to integers: True becomes 1 and False becomes 0.
df['passed'] = df['passed'].astype(int)

In [8]:
# Names of the categorical feature columns used in the rest of the notebook.
# (The numeric score columns were deleted after building the target, so there
# is no `numerical` list.)
categorical = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
# Number of distinct values in each categorical feature.
df[categorical].nunique()

gender                         2
race_ethnicity                 5
parental_level_of_education    6
lunch                          2
test_preparation_course        2
dtype: int64

In [9]:
# Split the data into train / validation / test sets.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set (same as passing train_size=0.8).
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the remaining rows again: 33% of them become the validation set.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1)

# Detach the label from the feature frames. pop() hands back the column and
# removes it from the frame in one step, so the label cannot leak into the
# features. df_train_full keeps `passed`: the exploratory cells that follow
# read it from there.
y_train = df_train.pop('passed').values
y_val = df_val.pop('passed').values

In [10]:
# Sanity check: the feature frame and the label vector should have the same length.
print(len(df_train), len(y_train), sep='\n')

536
536


In [11]:
# Overall pass rate in the full training set; the per-group rates computed
# afterwards can be compared against this value.
global_mean = df_train_full['passed'].mean()
round(global_mean, 3)

np.float64(0.199)

In [12]:
# Pass rate within each category of every categorical feature, to get a feel
# for which features separate the groups the most.
# A single loop over `categorical` replaces five copy-pasted groupby calls; the
# list holds the same columns in the same order, so the printed output is unchanged.
for feature in categorical:
    print(df_train_full.groupby(feature)['passed'].mean().round(3))

gender
female    0.234
male      0.162
Name: passed, dtype: float64
race_ethnicity
group_a    0.119
group_b    0.174
group_c    0.175
group_d    0.210
group_e    0.318
Name: passed, dtype: float64
parental_level_of_education
associate_s_degree    0.245
bachelor_s_degree     0.259
high_school           0.101
master_s_degree       0.300
some_college          0.192
some_high_school      0.186
Name: passed, dtype: float64
lunch
free_reduced    0.125
standard        0.239
Name: passed, dtype: float64
test_preparation_course
completed    0.294
none         0.143
Name: passed, dtype: float64


Esto nos está diciendo que, aunque en general solo aprueba alrededor del 20 % de los alumnos (media global de 0.199), hay variables más determinantes que otras: por ejemplo, en el almuerzo hay mucha distancia entre el gratuito/reducido y el estándar, el curso de preparación también marca una gran diferencia, y por etnia también hay un salto importante.

In [13]:
from sklearn.metrics import mutual_info_score


def calculate_mi(col):
    """Mutual information between one feature column and the `passed` target."""
    return mutual_info_score(col, df_train_full.passed)


# `categorical` lists the categorical column names. Applying calculate_mi to
# each of those columns yields one score per feature, describing how strongly
# that feature is related to the target `passed`. Rank them, highest first.
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
df_mi

Unnamed: 0,MI
test_preparation_course,0.016211
parental_level_of_education,0.011533
lunch,0.009924
race_ethnicity,0.008211
gender,0.004041


In [14]:
# print(df_train_full[numerical].corrwith(df_train_full.passed))

In [15]:
# orient='records' turns every row of the frame into its own
# {column: value} dictionary.
train_dict = df_train[categorical].to_dict(orient='records')
# Show the first record with its keys in alphabetical order.
first_record = train_dict[0]
{key: first_record[key] for key in sorted(first_record)}

{'gender': 'male',
 'lunch': 'standard',
 'parental_level_of_education': 'high_school',
 'race_ethnicity': 'group_d',
 'test_preparation_course': 'none'}

In [16]:
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer converts a list of feature dictionaries into a numeric matrix.
# It is created with sparse=False and fitted on the training records;
# fit() returns the vectorizer itself.
dv = DictVectorizer(sparse=False).fit(train_dict)
dv

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [17]:
# Encode the training records with the fitted vectorizer.
X_train = dv.transform(train_dict)

In [18]:
# First encoded training row.
X_train[0]

array([0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.])

In [19]:
# Feature names produced by the vectorizer (one per column of the encoded matrix).
dv.get_feature_names_out()

array(['gender=female', 'gender=male', 'lunch=free_reduced',
       'lunch=standard', 'parental_level_of_education=associate_s_degree',
       'parental_level_of_education=bachelor_s_degree',
       'parental_level_of_education=high_school',
       'parental_level_of_education=master_s_degree',
       'parental_level_of_education=some_college',
       'parental_level_of_education=some_high_school',
       'race_ethnicity=group_a', 'race_ethnicity=group_b',
       'race_ethnicity=group_c', 'race_ethnicity=group_d',
       'race_ethnicity=group_e', 'test_preparation_course=completed',
       'test_preparation_course=none'], dtype=object)

In [20]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (liblinear solver) on the encoded
# training data; fit() returns the fitted estimator.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [21]:
# Encode the validation rows with the vectorizer that was fitted on the
# training records (transform only, no re-fitting).
val_dict = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [22]:
# Predict class probabilities for the validation set (one column per class).
# NOTE(review): y_pred is reassigned in the following cell to a single column,
# so this 2-D array is only displayed here.
y_pred = model.predict_proba(X_val)
y_pred

array([[0.98276997, 0.01723003],
       [0.64043667, 0.35956333],
       [0.77201232, 0.22798768],
       [0.97250828, 0.02749172],
       [0.94311153, 0.05688847],
       [0.64043667, 0.35956333],
       [0.96346271, 0.03653729],
       [0.79451293, 0.20548707],
       [0.8627067 , 0.1372933 ],
       [0.8497671 , 0.1502329 ],
       [0.95121781, 0.04878219],
       [0.82289437, 0.17710563],
       [0.92362496, 0.07637504],
       [0.9129509 , 0.0870491 ],
       [0.94540288, 0.05459712],
       [0.83249955, 0.16750045],
       [0.82901873, 0.17098127],
       [0.65606624, 0.34393376],
       [0.94311153, 0.05688847],
       [0.76308125, 0.23691875],
       [0.50690395, 0.49309605],
       [0.91080576, 0.08919424],
       [0.76823287, 0.23176713],
       [0.51254652, 0.48745348],
       [0.69460084, 0.30539916],
       [0.6897902 , 0.3102098 ],
       [0.82597127, 0.17402873],
       [0.67897755, 0.32102245],
       [0.9101634 , 0.0898366 ],
       [0.47200701, 0.52799299],
       [0.

In [23]:
# Keep only the second column of predict_proba, i.e. the predicted probability
# of the positive class (passed == 1, since the labels are 0/1).
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.01723003, 0.35956333, 0.22798768, 0.02749172, 0.05688847,
       0.35956333, 0.03653729, 0.20548707, 0.1372933 , 0.1502329 ,
       0.04878219, 0.17710563, 0.07637504, 0.0870491 , 0.05459712,
       0.16750045, 0.17098127, 0.34393376, 0.05688847, 0.23691875,
       0.49309605, 0.08919424, 0.23176713, 0.48745348, 0.30539916,
       0.3102098 , 0.17402873, 0.32102245, 0.0898366 , 0.52799299,
       0.0281323 , 0.28208869, 0.28989858, 0.04878219, 0.08123458,
       0.22403798, 0.11541637, 0.15345438, 0.08115344, 0.15372935,
       0.11776066, 0.35082763, 0.30539916, 0.28811303, 0.08115344,
       0.24701795, 0.15085676, 0.11842104, 0.15085676, 0.07589898,
       0.09595577, 0.1372933 , 0.42025902, 0.31765697, 0.18152522,
       0.30323808, 0.28166053, 0.22267079, 0.13729936, 0.04459244,
       0.23176713, 0.24701795, 0.30539916, 0.1409412 , 0.02731135,
       0.15372935, 0.29055261, 0.48548453, 0.48548453, 0.48745348,
       0.37074136, 0.06853429, 0.04364031, 0.18152522, 0.07637

In [24]:
# Hard decision: predict "passed" when the predicted probability is at least 0.5.
passed = y_pred >=0.5
passed

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [25]:
# True validation labels, for comparison with the predictions.
y_val

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1])

In [26]:
# Validation accuracy: fraction of rows whose thresholded prediction equals the label.
# NOTE(review): the pass rate in df_train_full is about 0.2, so a model that
# always predicts 0 would presumably score roughly 0.8 — compare this accuracy
# against that baseline before trusting the model.
round((y_val == passed).mean(), 3)

np.float64(0.761)

In [27]:
# NOTE(review): same expression as the previous cell — this cell looks redundant.
round((y_val == passed).mean(), 3)

np.float64(0.761)

In [28]:
import pickle
from pathlib import Path

# Persist the fitted vectorizer together with the model as one tuple, so the
# encoder and the classifier are stored side by side.
model_path = Path('models/students-model.pck')
# Create models/ if it does not exist yet; opening the file for writing would
# otherwise raise FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump((dv, model), f)