# 1. PREPROCESAMIENTO


In [2]:
import pandas as pd


In [3]:
# Cargamos el dataset. 
df = pd.read_csv('datasets/student-mat.csv', sep=';')
print(len(df))
df.head().T

395


Unnamed: 0,0,1,2,3,4
school,GP,GP,GP,GP,GP
sex,F,F,F,F,F
age,18,17,15,15,16
address,U,U,U,U,U
famsize,GT3,GT3,LE3,GT3,GT3
Pstatus,A,T,T,T,T
Medu,4,1,1,4,3
Fedu,4,1,1,2,3
Mjob,at_home,at_home,at_home,health,other
Fjob,teacher,other,other,services,other


In [4]:
df.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [5]:
# calculamos la columna "passed", de 10 para arriba es aprobado (en este dataset va de 0 a 20).
# G3 es la nota final asi que nos centramos en esa.
df['passed'] = (df['G3'] >= 10)

# quitamos las notas G1, G2 y G3 una vez definido el aprobado porque son el resultado 
# y no las vamos a usar de dato de entrada (sería trampa).
del df['G1']
del df['G2']
del df['G3']

df.head().T

Unnamed: 0,0,1,2,3,4
school,GP,GP,GP,GP,GP
sex,F,F,F,F,F
age,18,17,15,15,16
address,U,U,U,U,U
famsize,GT3,GT3,LE3,GT3,GT3
Pstatus,A,T,T,T,T
Medu,4,1,1,4,3
Fedu,4,1,1,2,3
Mjob,at_home,at_home,at_home,health,other
Fjob,teacher,other,other,services,other


## 1.1 Normalización de los nombres de los atributos

In [6]:
# Primero limpiamos las columnas para quitar espacios y caracteres especiales para poder trabajar con ellas (si los hubiera).
replacer = lambda str: str.lower().str.replace(' ', '_').str.replace('/', '_').str.replace("'",'_')
df.columns = replacer(df.columns.str)

# hace lo mismo que la linea de arriba pero para el interior de las columnas y no para solo los titulos de las columnas
for col in list(df.dtypes[df.dtypes == 'object'].index):
    df[col] = replacer(df[col].str)
df.head().T

Unnamed: 0,0,1,2,3,4
school,gp,gp,gp,gp,gp
sex,f,f,f,f,f
age,18,17,15,15,16
address,u,u,u,u,u
famsize,gt3,gt3,le3,gt3,gt3
pstatus,a,t,t,t,t
medu,4,1,1,4,3
fedu,4,1,1,2,3
mjob,at_home,at_home,at_home,health,other
fjob,teacher,other,other,services,other


In [7]:
df.passed = (df.passed).astype(int) # convierte la columna passed a 0 y 1 y le dice que yes es el true, el resto es false

In [8]:
# Definimos las listas dependiendo de si son cuantitativas o cualitativas, es decir, si son valores numericos o no. 
categorical = ['school', 'sex', 'address', 'famsize', 'pstatus', 'mjob', 'fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
numerical = ['age', 'medu', 'fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'dalc', 'walc', 'health', 'absences']

df[categorical].nunique()

school        2
sex           2
address       2
famsize       2
pstatus       2
mjob          5
fjob          5
reason        4
guardian      3
schoolsup     2
famsup        2
paid          2
activities    2
nursery       2
higher        2
internet      2
romantic      2
dtype: int64

## 1.2 Limpieza de nulos. 

In [9]:
# buscamos los nulos, y podemos ver que en este caso no hay nulos porque el array que nos da esta vacio. 

nulos = df.isnull().sum()
print(nulos[nulos > 0])

Series([], dtype: int64)


In [10]:
# en el caso de valores "unknown" o "?" se podria hacer así, pero como se puede ver mas adelante no hay valores de este tipo en el dataset, lo dejo
# solo porque se vea como se haría para un dataset que si los tenga.

for col in df.select_dtypes(include=['object']):
    num_unknown = len(df[df[col] == '?']) 
    if num_unknown > 0:
        print(f"La columna {col} tiene {num_unknown} valores 'unknown'")

## 1.3 Separación de datos de entrenamiento


In [11]:
from sklearn.model_selection import train_test_split

#Dividimos en entrenamiento y test.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1) #es igual que poenr train_size=0.8

#Dividimos a su vez el conjunto df_train_full en entrenamiento y validación
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1) # normalmente se pone x_train, x_val, x_test

#Guarda las etiquetas de los ejemplos en una variable
y_train = df_train.passed.values
y_val = df_val.passed.values

# elimina la columna de etiquetas del conjunto de datos
del df_train['passed']
del df_val['passed']

In [12]:
# comprobamos que tienen la misma longitud
print(len(df_train))
print(len(y_train))

211
211


## 2. ANÁLISIS DE LAS PROPIEDADES

In [13]:
global_mean = df_train_full.passed.mean()
round(global_mean, 3)
#la media general de aprobadosk

np.float64(0.665)

In [14]:
# Calcula la media de aprobados para algunas de las categorical para hacernos una idea. 
print(df_train_full.groupby('school').passed.mean().round(3))
print(df_train_full.groupby('sex').passed.mean().round(3))
print(df_train_full.groupby('mjob').passed.mean().round(3))
print(df_train_full.groupby('higher').passed.mean().round(3))

school
gp    0.676
ms    0.571
Name: passed, dtype: float64
sex
f    0.627
m    0.707
Name: passed, dtype: float64
mjob
at_home     0.614
health      0.828
other       0.623
services    0.714
teacher     0.622
Name: passed, dtype: float64
higher
no     0.267
yes    0.684
Name: passed, dtype: float64


con estos datos nos podemos hacer una idea de que si pasan alrededor de 66% de los alumnos variables como que la madre trabaje en el sector de la salud
 o si tienen intención de cursar estudios superiores son muy influyentes. 

In [15]:
from sklearn.metrics import mutual_info_score

calculate_mi = lambda col: mutual_info_score(col, df_train_full.passed)

#calculamos la relación que tiene cada parametro con los aprobados
df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
higher,0.016517
mjob,0.010384
guardian,0.009977
romantic,0.006039
reason,0.004895
schoolsup,0.003973
paid,0.003627
sex,0.003611
famsup,0.002958
address,0.002942


In [16]:
# la correlación en el caso de los parametros numericos 
print(df_train_full[numerical].corrwith(df_train_full.passed))

age          -0.195289
medu          0.111574
fedu          0.094905
traveltime   -0.092451
studytime     0.103798
failures     -0.300776
famrel        0.040809
freetime      0.037061
goout        -0.145614
dalc         -0.025545
walc         -0.007662
health       -0.054127
absences     -0.069449
dtype: float64


# ***Antes de empezar voy a quitar columnas:
> Finalmente se quedan muchisimas variables que para manejar el formulario de Streamlit se hace muy engorroso para mirarlo todo, asi que vamos a dejar como mucho unas 10 o asi, trato de dejar las que parecen tener más relación con la tasa de aprobados. Es que hay algunas como el alcohol
que beben los fines de semana que la relación es casi nula y van a estorbar más que otra cosa.  Hay otras que si que tenial algo más de relación pero he tenido que cortar por algun sitio. 

In [17]:
categorical = ['higher', 'mjob']
numerical = ['failures', 'age', 'goout', 'medu', 'studytime', 'fedu', 'traveltime', 'absences']

## 3. INGENIERÍA DE PROPIEDADES

In [18]:
#orient='records' hace que cada fila del dataframe se convierta en un diccionario
train_dict = df_train[categorical + numerical].to_dict(orient='records')
dict(sorted(train_dict[0].items()))

{'absences': 8,
 'age': 16,
 'failures': 0,
 'fedu': 3,
 'goout': 2,
 'higher': 'yes',
 'medu': 3,
 'mjob': 'services',
 'studytime': 1,
 'traveltime': 1}

### Transformando los datos antes de poder lanzar el entrenamiento de los modelos

Para poder entrenar el modelo previamente los datos tienen que pasarse a un formato numerico y que podamos pasar como una matriz de vectores. 

In [19]:
from sklearn.feature_extraction import DictVectorizer
# crea el objeto DictVectorizer, que convierte listas de diccionarios en matrices numéricas
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

0,1,2
,"dtype  dtype: dtype, default=np.float64 The type of feature values. Passed to Numpy array/scipy.sparse matrix constructors as the dtype argument.",<class 'numpy.float64'>
,"separator  separator: str, default=""="" Separator string used when constructing new features for one-hot coding.",'='
,"sparse  sparse: bool, default=True Whether transform should produce scipy.sparse matrices.",False
,"sort  sort: bool, default=True Whether ``feature_names_`` and ``vocabulary_`` should be sorted when fitting.",True


In [20]:
X_train = dv.transform(train_dict)

In [21]:
#vemos como ahora son numeros
X_train[0]

array([ 8., 16.,  0.,  3.,  2.,  0.,  1.,  3.,  0.,  0.,  0.,  1.,  0.,
        1.,  1.])

In [22]:
dv.get_feature_names_out()

array(['absences', 'age', 'failures', 'fedu', 'goout', 'higher=no',
       'higher=yes', 'medu', 'mjob=at_home', 'mjob=health', 'mjob=other',
       'mjob=services', 'mjob=teacher', 'studytime', 'traveltime'],
      dtype=object)

### Nota 
>para agilizar más tarde la comprobación en los modelos vamos a crea antes una funcion para calcular la precisión y la accuracy de forma agil.

In [23]:
def calcular_matriz_confusion(y_real, y_pred):
    tp = ((y_pred==1) & (y_real==1)).sum()
    fp = ((y_pred==1) & (y_real==0)).sum()
    tn = ((y_pred==0) & (y_real==0)).sum()
    fn = ((y_pred==0) & (y_real==1)).sum()
    
    precision = tp/(tp+fp)
    accuracy = (tp+tn)/(tp+tn+fp+fn)

    return f"precisión: {round(precision, 3)}", f"accuracy: {round(accuracy, 3)}"

## 4. ENTRENAMIENTO DE LOS MODELOS

## 4.1 Regresión lineal

In [24]:
#importamos y seleccionamos el modelo
from sklearn.linear_model import LogisticRegression
model_regresion = LogisticRegression(solver='liblinear')

#este es el entrenamiento 
model_regresion.fit(X_train, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'liblinear'


In [25]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [26]:
# Lanzamos predicciones sobre el conjunto de validación
y_pred = model_regresion.predict_proba(X_val)[:, 1]
passed = y_pred >=0.5
passed

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True])

In [27]:
#evaluamos su precisión y accuracy sobre la predicción
print(calcular_matriz_confusion(y_val, passed))

('precisión: 0.793', 'accuracy: 0.781')


## 4.2 SVM

In [28]:
from sklearn.svm import SVC

# Primero probamos con un kernel lineal
model_svm = SVC(kernel='linear', C=1.0, random_state=1)
model_svm.fit(X_train, y_train)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'linear'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [29]:
# Lanzamos predicciones sobre las validaciones
y_pred_svm = model_svm.predict(X_val)

In [30]:
print(calcular_matriz_confusion(y_val, y_pred_svm))

('precisión: 0.789', 'accuracy: 0.79')


### 4.2.1 Pruebas
#### GridSearch

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Definimos los parámetros a probar.
parameters = {'kernel':('linear', 'rbf'),
              'C':[0.1, 0.2,0.3,1, 10],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]}

# Instanciamos el modelo base
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)

#Entrenamos
results = clf.fit(X_train, y_train)

# Imprimimos los mejores resultados
print(results.best_score_, results.best_params_)

0.6589147286821706 {'C': 0.1, 'gamma': 0.0001, 'kernel': 'linear'}


**gamma** al ser finalmente el kernel lineal no lo tenemos en cuenta porque realmente no es un parametro que usemos, es un parametro de rbf.
¿ Porque nos da un resultado diferente a nuestro calculo manual?
Porque GridSearchCV divide por defecto el X_train en 5 trozos, y los compara,
hace una validación cruzzada, realmente es un resultado más realista que como lo hemos calculado nosotros de forma manual si esa validación. 


#### Algunas pruebas más (manuales)

In [30]:
model_svm = SVC(kernel='linear', C=0.1, random_state=1)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_val)
print(calcular_matriz_confusion(y_val, y_pred_svm))

('precisión: 0.784', 'accuracy: 0.79')


In [31]:
model_svm = SVC(kernel='linear', C=0.3, random_state=1)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_val)
print(calcular_matriz_confusion(y_val, y_pred_svm))

('precisión: 0.789', 'accuracy: 0.79')


Curiosamente cuando pasamos de 0.3 hasta 10000 da igual siempre mantiene las mismas precisión y accuracy de ('precisión: 0.789', 'accuracy: 0.79')

en principio a mayor C más posibilidad de overfiting y a menor más posibilidad de underfiting. 

## 4.3 Arbol de Desisiones

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Aquí vamos a hacer un for directamente para medir los hiperparametros de profundidad del arbol 
for depth in [1, 2, 3, 4, 5, 8, 10, None]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, y_train)
    
    y_pred_temp = dt.predict(X_val)
    
    # Mostramos los resultados para cada profundidad
    metrics = calcular_matriz_confusion(y_val, y_pred_temp)
    print(f"Depth: {depth}:  {metrics}")


Depth: 1:  ('precisión: 0.733', 'accuracy: 0.733')
Depth: 2:  ('precisión: 0.758', 'accuracy: 0.571')
Depth: 3:  ('precisión: 0.756', 'accuracy: 0.686')
Depth: 4:  ('precisión: 0.743', 'accuracy: 0.61')
Depth: 5:  ('precisión: 0.754', 'accuracy: 0.562')
Depth: 8:  ('precisión: 0.789', 'accuracy: 0.657')
Depth: 10:  ('precisión: 0.803', 'accuracy: 0.676')
Depth: None:  ('precisión: 0.795', 'accuracy: 0.676')


Vamos a seleccionar la mejor profundidad, que de una precisión y accuracy mas compensadas. Creo que es mejor Depth 1, ya que aunque 8 10 y ninguna tienen más precisión, está mas equilibrado con la accuracy. 

In [33]:
#usamos depth 1 porque es la que mejores metricas tiene y entrenamos 
model_dt = DecisionTreeClassifier(max_depth=1, random_state=1)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_val)

print(calcular_matriz_confusion(y_val, y_pred_dt))

('precisión: 0.733', 'accuracy: 0.733')


# 5 Evaluación final con el conjunto de Test

In [35]:
# Preparamos de los datos de Test, al igual que hicimos con la validación, extraemos la variable objetivo y transformamos los parametros
y_test = df_test.passed.values
test_dict = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dict)




### Evaluamos cada modelo igual que con la validación

In [36]:
# Regresión Logística
y_pred_test_reg = model_regresion.predict_proba(X_test)[:, 1] >= 0.5
print(f"Test Regresión Logística: {calcular_matriz_confusion(y_test, y_pred_test_reg)}")

Test Regresión Logística: ('precisión: 0.77', 'accuracy: 0.722')


In [37]:
# SVM
y_pred_test_svm = model_svm.predict(X_test)
print(f"Test SVM: {calcular_matriz_confusion(y_test, y_pred_test_svm)}")


Test SVM: ('precisión: 0.773', 'accuracy: 0.759')


In [38]:
# Arbol de Decisiones
y_pred_test_dt = model_dt.predict(X_test)
print(f"Test Árbol de Decisión: {calcular_matriz_confusion(y_test, y_pred_test_dt)}")

Test Árbol de Decisión: ('precisión: 0.696', 'accuracy: 0.696')


# 6 Serialización de los modelos

guardamos cada modelo en un archivo dentro de la carpeta models, realmente al final solo vamos a usar uno. 

In [34]:
import pickle

# regresion lineal
with open('models/students-model-regresion.pck', 'wb') as f:
    pickle.dump((dv,model_regresion), f)

# SVM
with open('models/students-model-svm.pck', 'wb') as f:
    pickle.dump((dv, model_svm), f)

# Arbol de decision
with open('models/students-model-decision-tree.pck', 'wb') as f:
    pickle.dump((dv, model_dt), f)

# Conclusiones finales
---------------------------
[!Nota]
>Tal vez hemos quitado demasiadas metricas del estudio, en su mayoria eran atributos que 
>aportaban poco o nada al resutlado final, pero para el ejercicio y poder hacer pruebas en
>Streamlit era más facil con una decena de atributos, aún así he quitado los menos >significativos.

-------------------------
Metricas de cada modelo (con los hiperparametros ya ajustados):

Regresión Lineal : Accuracy de 0,781, precision de 0,791

SVM: Accuracy de 0,79 y precisión de 0'789

Arbol de Decisiones: Accuracy de 0'733 y precision de 0'733


Por metricas y con las pruebas que he hecho, se podria elegir tanto el de **Regresión Lineal  como el de SVM**, finalmente he elegido el de Regresión Lineal, que aunque tenga unas métricas ligeremante peores que el SVM me parece mas sencillo para implementar en el Streamlit, donde si queremos jugar con los atributos va a responder mejor que el Arbol de decisiones que además de tener peores metricas va a ser mas engañoso porque solo va a cambiar la probabilidad si vamos ajustando dentro de su arbol de decisiones (lo he probado en el streamlit), es decir, si considera que el trabajo de la madre es la más importante, ese va a ser el que determine mayormente exito o fracaso, y pdremos cambiar "tiempo de casa al colegio" y no apreciar un cambio en la probabilidad.


 
