# Trabajo práctico integrador - Análisis de datos

## Limpieza y preparación de datos / Ingeniería de features

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, KBinsDiscretizer, PowerTransformer
from sklearn.metrics import accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, confusion_matrix, precision_recall_curve, auc

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

import pickle

In [2]:
# Set color palette
color = ['#1ED760', '#FAF5F5']
sns.set_palette(sns.color_palette(color))

In [3]:
# Load dataset from GitHub repository
df_original = pd.read_csv('data/data_playlist.csv')
df = df_original.copy(deep=True)

In [4]:
# Show first 10 rows
df.head(10)

Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,label
0,0.713,0.514,100125,0.521,0.816,8,0.112,-14.835,0,0.0444,119.879,4,0.143,1
1,0.192,0.714,207019,0.614,0.0,4,0.263,-6.935,1,0.0319,123.969,4,0.582,1
2,0.333,0.63,216200,0.455,4e-06,5,0.127,-9.29,1,0.0292,139.931,4,0.199,1
3,0.601,0.81,136413,0.221,0.21,5,0.184,-11.005,1,0.0429,109.96,4,0.798,1
4,0.883,0.465,181440,0.459,0.000173,6,0.0692,-8.137,0,0.0351,90.807,4,0.288,1
5,0.524,0.633,244360,0.401,0.0,4,0.123,-12.549,1,0.0439,134.978,4,0.523,1
6,0.597,0.507,183573,0.795,0.0,9,0.296,-6.966,1,0.0607,165.54,4,0.9,0
7,0.452,0.825,259102,0.435,0.609,1,0.0953,-9.582,1,0.0568,119.038,4,0.243,1
8,0.748,0.42,366179,0.324,0.839,9,0.0723,-14.7,0,0.0556,183.02,3,0.33,1
9,0.913,0.292,197613,0.246,0.0883,0,0.209,-9.758,1,0.033,140.316,4,0.249,1


In [5]:
# Show dataset shape
df.shape

(750, 14)

## 1. Análisis de datos faltantes

In [6]:
df.isna().sum()

acousticness        0
danceability        0
duration            0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
label               0
dtype: int64

- No se evidencia la presencia de valores faltantes ni nulos, por este motivo, no se aplican técnicas de imputación.

## 2. Pre-procesamiento de las variables

In [7]:
numerical_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence']
categorical_features = ['key', 'mode', 'time_signature']
special_categorical_features = ['tempo']

In [8]:
X = df[numerical_features + categorical_features + special_categorical_features]
y = df['label']

In [9]:
def print_info(var, is_numeric=True):
    print("*" * 50)
    print(f"Variable: {var}")
    cant_unique = len(df[var].unique())
    if is_numeric:
        print(f"La variable oscila entre los siguientes valores: {df[var].min()} - {df[var].max()}")
    else:
        
        if cant_unique < 10:
            print(f"Categorías: {df[var].unique()}")
    print(f"La variable tiene {cant_unique} valores únicos")

### Definición de las técnicas a utilizar:

### 1. Variables numéricas:

### Información

In [10]:
for var in numerical_features:
    print_info(var)

**************************************************
Variable: acousticness
La variable oscila entre los siguientes valores: 1.17e-06 - 0.994
La variable tiene 596 valores únicos
**************************************************
Variable: danceability
La variable oscila entre los siguientes valores: 0.107 - 0.986
La variable tiene 458 valores únicos
**************************************************
Variable: energy
La variable oscila entre los siguientes valores: 0.00925 - 0.995
La variable tiene 502 valores únicos
**************************************************
Variable: instrumentalness
La variable oscila entre los siguientes valores: 0.0 - 0.967
La variable tiene 431 valores únicos
**************************************************
Variable: liveness
La variable oscila entre los siguientes valores: 0.024 - 0.979
La variable tiene 445 valores únicos
**************************************************
Variable: loudness
La variable oscila entre los siguientes valores: -29.601 - -0.5

### Técnicas a utilizar

- **Feature Scaling**: Estandarización, Escalado a mínimo-máximo
- **Transformación de variables**: Yeo-Johnson (debido a que admite variables positivas y negativas)

### 2. Variables categóricas

### Información

In [11]:
for var in categorical_features:
    print_info(var, is_numeric=False)

**************************************************
Variable: key
La variable tiene 12 valores únicos
**************************************************
Variable: mode
Categorías: [0 1]
La variable tiene 2 valores únicos
**************************************************
Variable: time_signature
Categorías: [4 3 5 1]
La variable tiene 4 valores únicos


In [12]:
print_info('tempo')

**************************************************
Variable: tempo
La variable oscila entre los siguientes valores: 55.747 - 204.162
La variable tiene 729 valores únicos


### Técnicas a utilizar

**Variables**:
- key
- mode
- tempo

Codificación: One-Hot-Encoding

**Variable**:
time_Signature

Codificación: Discretización (binning) + One-Hot-Encoding

## Pipelines de pre-procesamiento

#### Pipelines variables numéricas

In [13]:
numeric_std_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('power', PowerTransformer(method='yeo-johnson'))])

In [14]:
numeric_minmax_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('power', PowerTransformer(method='yeo-johnson'))])

#### Pipelines variables categóricas

In [15]:
categorical_binning = KBinsDiscretizer(n_bins=10, encode='onehot' , strategy='uniform' )

In [16]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

### Column transformer

In [17]:
preprocessor_std = ColumnTransformer(
    transformers=[
        ('num', numeric_std_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('cat_bin', categorical_binning, special_categorical_features)])

In [18]:
preprocessor_min_max = ColumnTransformer(
    transformers=[
        ('num', numeric_minmax_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('cat_bin', categorical_binning, special_categorical_features)])

### Pipeline 

In [19]:
preprocessing_std = Pipeline(steps=[('preprocessor', preprocessor_std),])

In [20]:
preprocessing_min_max = Pipeline(steps=[('preprocessor', preprocessor_min_max),])

### Experimentation

#### Define models

In [21]:
logistic_regresion_std_model = Pipeline(steps=[('preprocessor', preprocessing_std),
                       ('classifier', LogisticRegression())])

In [22]:
logistic_regresion_min_max_model = Pipeline(steps=[('preprocessor', preprocessing_min_max),
                       ('classifier', LogisticRegression())])

In [23]:
decision_tree_std_pipe = Pipeline(steps=[('preprocessor', preprocessing_std),
                       ('classifier', DecisionTreeClassifier())])

In [24]:
decision_tree_min_max_pipe = Pipeline(steps=[('preprocessor', preprocessing_min_max),
                       ('classifier', DecisionTreeClassifier())])

In [25]:
random_forest_std_pipe = Pipeline(steps=[('preprocessor', preprocessing_std),
                       ('classifier', RandomForestClassifier())])

In [26]:
random_forest_std_pipe = Pipeline(steps=[('preprocessor', preprocessing_min_max),
                       ('classifier', RandomForestClassifier())])

In [27]:
param_grid = {
    'classifier__max_depth': [15, 30, 45],
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
}

In [28]:
random_forest_params = {
    'classifier__n_estimators': [50, 100, 150],
}

In [29]:
param_grid_random_forest = dict(param_grid)
param_grid_random_forest.update(random_forest_params)

In [30]:
# Grid Search - Models
gs_decision_tree_std = GridSearchCV(decision_tree_std_pipe, param_grid, cv=5, n_jobs=-1, verbose=3)
gs_decision_tree_min_max = GridSearchCV(decision_tree_min_max_pipe, param_grid, cv=5, n_jobs=-1, verbose=3)

gs_random_forest_std = GridSearchCV(random_forest_std_pipe, param_grid, cv=5, n_jobs=-1, verbose=3)
gs_random_forest_min_max = GridSearchCV(random_forest_std_pipe, param_grid, cv=5, n_jobs=-1, verbose=3)

#### Metric report

In [31]:
def metric_report(y_test, y_pred, y_proba):  
    print(classification_report(y_test, y_pred))  
    print('Area bajo la curva ROC:',np.round(roc_auc_score(y_test, y_proba[:,1]), 4)) 
    precision, recall,threshold=precision_recall_curve(y_test, y_proba[:,1]);
    print('Area bajo la curva Precision-Recall:',np.round(auc(recall, precision), 4))

#### Train-Test Split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Experimentation

#### 1. Logistic Regression - Numeric: Standard Scaler + Yeo-Jhonson

In [33]:
logistic_regresion_std_model.fit(X_train, y_train)

In [34]:
y_pred = logistic_regresion_std_model.predict(X_test)
y_proba = logistic_regresion_std_model.predict_proba(X_test)

In [35]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.67      0.76      0.71        54
           1       0.85      0.79      0.82        96

    accuracy                           0.78       150
   macro avg       0.76      0.78      0.77       150
weighted avg       0.79      0.78      0.78       150

Area bajo la curva ROC: 0.8773
Area bajo la curva Precision-Recall: 0.9142


#### 2. Logistic Regression - Numeric: MinMax + Yeo-Jhonson 

In [36]:
logistic_regresion_min_max_model.fit(X_train, y_train)

In [37]:
y_pred = logistic_regresion_std_model.predict(X_test)
y_proba = logistic_regresion_std_model.predict_proba(X_test)

In [38]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.67      0.76      0.71        54
           1       0.85      0.79      0.82        96

    accuracy                           0.78       150
   macro avg       0.76      0.78      0.77       150
weighted avg       0.79      0.78      0.78       150

Area bajo la curva ROC: 0.8773
Area bajo la curva Precision-Recall: 0.9142


### Decision Tree

#### Decision tree: Numeric: Standard Scaler

In [39]:
gs_decision_tree_std.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [40]:
pd.DataFrame(gs_decision_tree_std.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.113406,0.009881,0.021305,0.002795,gini,15,"{'classifier__criterion': 'gini', 'classifier_...",0.816667,0.808333,0.758333,0.791667,0.716667,0.778333,0.036742,4
1,0.111413,0.013531,0.022908,0.005256,gini,30,"{'classifier__criterion': 'gini', 'classifier_...",0.825,0.8,0.741667,0.775,0.733333,0.775,0.034561,8
2,0.121117,0.008486,0.023799,0.002924,gini,45,"{'classifier__criterion': 'gini', 'classifier_...",0.825,0.8,0.766667,0.766667,0.741667,0.78,0.029155,2
3,0.120117,0.010486,0.030398,0.011005,entropy,15,"{'classifier__criterion': 'entropy', 'classifi...",0.791667,0.783333,0.725,0.816667,0.775,0.778333,0.030092,6
4,0.115662,0.012531,0.021211,0.00258,entropy,30,"{'classifier__criterion': 'entropy', 'classifi...",0.8,0.791667,0.725,0.791667,0.783333,0.778333,0.027183,4
5,0.119737,0.007912,0.025423,0.004728,entropy,45,"{'classifier__criterion': 'entropy', 'classifi...",0.8,0.766667,0.741667,0.816667,0.775,0.78,0.026141,2
6,0.132592,0.007309,0.025403,0.003384,log_loss,15,"{'classifier__criterion': 'log_loss', 'classif...",0.808333,0.791667,0.725,0.816667,0.766667,0.781667,0.033082,1
7,0.130598,0.016462,0.021001,0.002827,log_loss,30,"{'classifier__criterion': 'log_loss', 'classif...",0.783333,0.8,0.7,0.808333,0.775,0.773333,0.038514,9
8,0.103996,0.008313,0.021201,0.00574,log_loss,45,"{'classifier__criterion': 'log_loss', 'classif...",0.808333,0.8,0.733333,0.808333,0.733333,0.776667,0.035512,7


In [41]:
gs_decision_tree_std.best_estimator_

In [42]:
gs_decision_tree_std.best_score_

0.7816666666666666

In [43]:
y_pred = gs_decision_tree_std.predict(X_test)
y_proba = gs_decision_tree_std.predict_proba(X_test)

In [44]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.73      0.80      0.76        54
           1       0.88      0.83      0.86        96

    accuracy                           0.82       150
   macro avg       0.80      0.81      0.81       150
weighted avg       0.83      0.82      0.82       150

Area bajo la curva ROC: 0.8148
Area bajo la curva Precision-Recall: 0.9096


#### Decision tree: Numeric: Min - Max

In [45]:
gs_decision_tree_min_max.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [46]:
pd.DataFrame(gs_decision_tree_min_max.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.115188,0.032315,0.024601,0.01048,gini,15,"{'classifier__criterion': 'gini', 'classifier_...",0.833333,0.791667,0.75,0.775,0.691667,0.768333,0.046963,8
1,0.111507,0.007895,0.021614,0.00321,gini,30,"{'classifier__criterion': 'gini', 'classifier_...",0.8,0.783333,0.758333,0.766667,0.691667,0.76,0.037044,9
2,0.109026,0.007345,0.021121,0.002083,gini,45,"{'classifier__criterion': 'gini', 'classifier_...",0.8,0.816667,0.733333,0.783333,0.733333,0.773333,0.034319,6
3,0.119412,0.018812,0.020001,0.003288,entropy,15,"{'classifier__criterion': 'entropy', 'classifi...",0.808333,0.783333,0.75,0.783333,0.758333,0.776667,0.020683,3
4,0.111599,0.008867,0.023201,0.003656,entropy,30,"{'classifier__criterion': 'entropy', 'classifi...",0.783333,0.791667,0.716667,0.808333,0.758333,0.771667,0.031885,7
5,0.111811,0.007212,0.025391,0.008563,entropy,45,"{'classifier__criterion': 'entropy', 'classifi...",0.816667,0.791667,0.716667,0.8,0.766667,0.778333,0.034801,2
6,0.109866,0.00926,0.026432,0.007786,log_loss,15,"{'classifier__criterion': 'log_loss', 'classif...",0.8,0.808333,0.75,0.808333,0.775,0.788333,0.02273,1
7,0.11378,0.005844,0.023612,0.005978,log_loss,30,"{'classifier__criterion': 'log_loss', 'classif...",0.791667,0.791667,0.725,0.808333,0.758333,0.775,0.029814,5
8,0.119495,0.013776,0.021413,0.007884,log_loss,45,"{'classifier__criterion': 'log_loss', 'classif...",0.8,0.783333,0.733333,0.791667,0.775,0.776667,0.023214,3


In [47]:
gs_decision_tree_min_max.best_estimator_

In [48]:
gs_decision_tree_min_max.best_score_

0.7883333333333333

In [49]:
y_pred = gs_decision_tree_min_max.predict(X_test)
y_proba = gs_decision_tree_min_max.predict_proba(X_test)

In [50]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.75      0.83      0.79        54
           1       0.90      0.84      0.87        96

    accuracy                           0.84       150
   macro avg       0.82      0.84      0.83       150
weighted avg       0.85      0.84      0.84       150

Area bajo la curva ROC: 0.8385
Area bajo la curva Precision-Recall: 0.9219


In [51]:
# save the model
filename = 'models/decision_tree_min_max.sav'
pickle.dump(gs_decision_tree_min_max, open(filename, 'wb'))

### Random Forest

#### Random Forest - Numeric: Standard Scaler

In [52]:
gs_random_forest_std.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [53]:
pd.DataFrame(gs_random_forest_std.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.591306,0.01496,0.055,0.010864,gini,15,"{'classifier__criterion': 'gini', 'classifier_...",0.783333,0.858333,0.766667,0.841667,0.766667,0.803333,0.038944,9
1,0.588533,0.030098,0.0656,0.006499,gini,30,"{'classifier__criterion': 'gini', 'classifier_...",0.791667,0.908333,0.766667,0.841667,0.775,0.816667,0.052705,6
2,0.595766,0.017945,0.062398,0.007393,gini,45,"{'classifier__criterion': 'gini', 'classifier_...",0.8,0.883333,0.791667,0.875,0.808333,0.831667,0.039229,1
3,0.646245,0.028882,0.070401,0.015408,entropy,15,"{'classifier__criterion': 'entropy', 'classifi...",0.791667,0.875,0.766667,0.883333,0.783333,0.82,0.049046,4
4,0.648733,0.017816,0.059299,0.003481,entropy,30,"{'classifier__criterion': 'entropy', 'classifi...",0.783333,0.883333,0.766667,0.85,0.8,0.816667,0.043461,7
5,0.656364,0.01516,0.0526,0.001854,entropy,45,"{'classifier__criterion': 'entropy', 'classifi...",0.8,0.891667,0.766667,0.866667,0.8,0.825,0.046547,2
6,0.636599,0.01742,0.0596,0.007335,log_loss,15,"{'classifier__criterion': 'log_loss', 'classif...",0.783333,0.883333,0.783333,0.841667,0.8,0.818333,0.038873,5
7,0.651602,0.021543,0.055796,0.007112,log_loss,30,"{'classifier__criterion': 'log_loss', 'classif...",0.791667,0.866667,0.766667,0.85,0.791667,0.813333,0.038224,8
8,0.603607,0.099031,0.047805,0.013475,log_loss,45,"{'classifier__criterion': 'log_loss', 'classif...",0.808333,0.875,0.766667,0.866667,0.8,0.823333,0.041298,3


In [54]:
gs_random_forest_std.best_estimator_

In [55]:
gs_random_forest_std.best_score_

0.8316666666666667

In [56]:
y_pred = gs_random_forest_std.predict(X_test)
y_proba = gs_random_forest_std.predict_proba(X_test)

In [57]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.79      0.83      0.81        54
           1       0.90      0.88      0.89        96

    accuracy                           0.86       150
   macro avg       0.85      0.85      0.85       150
weighted avg       0.86      0.86      0.86       150

Area bajo la curva ROC: 0.9135
Area bajo la curva Precision-Recall: 0.9465


#### Random Forest - Numeric: Min Max Scaler


In [58]:
gs_random_forest_min_max.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [59]:
pd.DataFrame(gs_random_forest_min_max.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.680024,0.03273,0.070199,0.004443,gini,15,"{'classifier__criterion': 'gini', 'classifier_...",0.791667,0.883333,0.775,0.866667,0.8,0.823333,0.043269,2
1,0.659413,0.038014,0.0662,0.010439,gini,30,"{'classifier__criterion': 'gini', 'classifier_...",0.766667,0.9,0.783333,0.858333,0.791667,0.82,0.050717,4
2,0.687848,0.048696,0.0718,0.011443,gini,45,"{'classifier__criterion': 'gini', 'classifier_...",0.8,0.875,0.8,0.85,0.775,0.82,0.036742,4
3,0.740125,0.024634,0.068001,0.006693,entropy,15,"{'classifier__criterion': 'entropy', 'classifi...",0.791667,0.883333,0.783333,0.9,0.791667,0.83,0.050717,1
4,0.710068,0.024316,0.068202,0.017103,entropy,30,"{'classifier__criterion': 'entropy', 'classifi...",0.8,0.858333,0.791667,0.858333,0.775,0.816667,0.03496,8
5,0.681054,0.028074,0.06003,0.008673,entropy,45,"{'classifier__criterion': 'entropy', 'classifi...",0.816667,0.866667,0.758333,0.866667,0.791667,0.82,0.042361,6
6,0.757647,0.106592,0.061205,0.007475,log_loss,15,"{'classifier__criterion': 'log_loss', 'classif...",0.816667,0.883333,0.758333,0.875,0.775,0.821667,0.050717,3
7,0.872673,0.052166,0.066001,0.005899,log_loss,30,"{'classifier__criterion': 'log_loss', 'classif...",0.791667,0.875,0.783333,0.85,0.775,0.815,0.03993,9
8,0.717581,0.041194,0.081551,0.016815,log_loss,45,"{'classifier__criterion': 'log_loss', 'classif...",0.791667,0.883333,0.775,0.85,0.8,0.82,0.040346,6


In [60]:
gs_random_forest_min_max.best_estimator_

In [61]:
gs_random_forest_min_max.best_score_

0.8299999999999998

In [62]:
y_pred = gs_random_forest_min_max.predict(X_test)
y_proba = gs_random_forest_min_max.predict_proba(X_test)

In [63]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.79      0.83      0.81        54
           1       0.90      0.88      0.89        96

    accuracy                           0.86       150
   macro avg       0.85      0.85      0.85       150
weighted avg       0.86      0.86      0.86       150

Area bajo la curva ROC: 0.9107
Area bajo la curva Precision-Recall: 0.9423


In [64]:
# save the model
filename = 'models/random_forest_min_max.sav'
pickle.dump(gs_random_forest_min_max, open(filename, 'wb'))

### Conclusión:

Mejor modelo: Random Forest, con preprocesado numérico: MinMaxScaler + Yeo Johnson. Intuimos que esto se debe a que la mayoría de nuestras variables numéricas se encuentran en el rango 0 - 1, por lo que al hacer min-max scaler, se mantiene la distribución original de los datos, la cual es "normalizada", empleando Yeo Johnson. De esta forma los datos resultan útiles para el modelo, motivo por el cual las métricas arrojadas por el mismo son alentadoras (F1 score 0.81 para la clase minoritaria y 0.89 para la mayoritaria).