In [4]:
# Se importan las librerias
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler 
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer

In [5]:
# división del train/test
df = pd.read_csv("data/processed/train.csv", index_col=0)

In [6]:
# Usando Pipes

logistic_pipeline = Pipeline(
    [("Escalado", StandardScaler()),
    ("Modelo", LogisticRegression())
    ])

random_pipeline = Pipeline(
    [("Escalado", StandardScaler()),
    ("Modelo", RandomForestClassifier())
    ])

xgb_pipeline = Pipeline(
    [("Escalado", StandardScaler()),
    ("Modelo", XGBClassifier())
    ])

for name,pipe in zip(["logistic","randomF", "XGB"],[logistic_pipeline, random_pipeline, xgb_pipeline]):
    resultado = cross_val_score(pipe, df.drop(["target"], axis=1), df['target'], cv = 5, scoring = 'roc_auc')
    print(f"{name}: {np.mean(resultado):.4f}")
    print(resultado)
    

logistic: 0.7886
[0.80295049 0.79911665 0.78692382 0.77805457 0.77571379]
randomF: 0.9735
[0.88939813 0.99452324 0.99383079 0.99455515 0.99507277]
XGB: 0.8608
[0.87591185 0.86644586 0.84988357 0.8554143  0.85640882]


In [7]:
pipe_reg_log_param = {    
                 "Modelo__penalty": [None,"l2"], 
                 "Modelo__C": np.logspace(0, 4, 10),
                 "Modelo__class_weight": ['balanced', None]
                }

pipe_rand_forest_param = {
    'Modelo__n_estimators': [10, 100, 200, 400],
    'Modelo__max_depth': [1,2,4,8],
    'Modelo__max_features': [1, 2, 3],
    "Modelo__class_weight": ['balanced', None]
    }

xgb_param = {
    'Modelo__n_estimators': [10, 100, 200, 400],
    'Modelo__max_depth': [1,2,4,8],
    'Modelo__learning_rate': [0.1,0.2,0.5,1.0],
    "Modelo__class_weight": ['balanced', None]
}

cv = 5

gs_reg_log = GridSearchCV(logistic_pipeline,
                            pipe_reg_log_param,
                            cv=cv,
                            scoring='roc_auc',
                            verbose=1,
                            n_jobs=-1)

gs_rand_forest = GridSearchCV(random_pipeline,
                            pipe_rand_forest_param,
                            cv=cv,
                            scoring='roc_auc',
                            verbose=1,
                            n_jobs=-1)

gs_xgb = GridSearchCV(XGBClassifier(),
                        xgb_param,
                        cv=cv,
                        scoring="roc_auc",
                        verbose=1,
                        n_jobs=-1)

pipe_grids = {"gs_reg_log":gs_reg_log,
         "gs_rand_forest":gs_rand_forest,
         "gs_xgb":gs_xgb}

In [8]:
for nombre, grid_search in pipe_grids.items():
    grid_search.fit(df.drop(["target"], axis=1), df['target'])

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Fitting 5 folds for each of 128 candidates, totalling 640 fits
Parameters: { "Modelo__class_weight", "Modelo__learning_rate", "Modelo__max_depth", "Modelo__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "Modelo__class_weight", "Modelo__learning_rate", "Modelo__max_depth", "Modelo__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "Mode

In [14]:
pipe_grids["gs_rand_forest"].best_params_

{'Modelo__class_weight': None,
 'Modelo__max_depth': 8,
 'Modelo__max_features': 2,
 'Modelo__n_estimators': 200}

In [15]:
pipe_grids["gs_xgb"].best_params_

{'Modelo__class_weight': 'balanced',
 'Modelo__learning_rate': 0.1,
 'Modelo__max_depth': 1,
 'Modelo__n_estimators': 10}

In [9]:
best_grids = [(i, j.best_score_) for i, j in pipe_grids.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
2,gs_xgb,0.860352
1,gs_rand_forest,0.858005
0,gs_reg_log,0.788713


In [16]:
# Se importa el dataset de test
df_test = pd.read_csv("data/processed/test.csv", index_col=0)

predictions_submit = pipe_grids["gs_xgb"].best_estimator_.predict(df_test)
predictions_submit

array([0, 0, 0, ..., 0, 1, 0])

In [17]:
sample = pd.read_csv("data/raw/sample_submission.csv")
submission = pd.DataFrame({"Id": sample['Id'], "target": predictions_submit})

In [18]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.Id.all() == sample.Id.all():
                print("You're ready to submit!")
                submission.to_csv("deliveries/submission_8.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")


In [19]:
chequeator(submission)

You're ready to submit!


NameError: name 'urllib' is not defined

In [20]:
submission["target"].value_counts()

0    25238
1     6204
Name: target, dtype: int64