# Mejor Modelo XGB - Proyecto 1 - Bosón de Higgs
## Grupo F

In [1]:
import math
import pandas as pd
import numpy as np
import os
import csv
import pickle
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from taa_utils import *
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

### Se cargan los datos

In [2]:
# Read the training set from the CSV file in the working directory.
df_train = pd.read_csv(filepath_or_buffer='training.csv')

## Preprocesamiento

In [3]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Stateless feature-cleaning step for the pipeline.

    ``transform`` replaces every ``-999`` entry with ``NaN`` and removes a
    fixed set of columns from the incoming DataFrame. Nothing is learned
    in ``fit``.
    """

    # Columns whose names end in "phi".
    _PHI_COLUMNS = ['PRI_tau_phi',
                    'PRI_lep_phi',
                    'PRI_met_phi',
                    'PRI_jet_leading_phi',
                    'PRI_jet_subleading_phi']

    # Columns with few valid values and low correlation that the team
    # decided to discard.
    _SUBLEADING_JET_COLUMNS = ['PRI_jet_subleading_eta',
                               'PRI_jet_subleading_pt']

    # Remaining columns that are removed from the feature set.
    _OTHER_DROPPED_COLUMNS = ['PRI_jet_all_pt',
                              'PRI_met_sumet']

    def __init__(self):
        """The transformer takes no hyper-parameters."""

    def fit(self, X, y=None):
        """Nothing to fit; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with -999 set to NaN and the fixed columns removed.

        ``X`` must provide ``replace`` and ``drop(columns=...)`` (a pandas
        DataFrame); ``y`` is accepted but not used.
        """
        # Replace every -999 entry with NaN.
        cleaned = X.replace(-999, np.nan)

        # Drop each column group in turn, in the same order as before.
        for column_group in (self._PHI_COLUMNS,
                             self._SUBLEADING_JET_COLUMNS,
                             self._OTHER_DROPPED_COLUMNS):
            cleaned = cleaned.drop(columns=column_group)

        return cleaned

In [4]:
# Separate the target and the event id from the features.
# Label, Weight and EventId are all excluded from X_train.
y_train = df_train['Label']
EventId = df_train['EventId']
X_train = df_train.drop(['EventId', 'Weight', 'Label'], axis=1)

# Convert the labels to binary.
# The previous call passed IPython's `_` (the last displayed cell output) as
# the second argument, so the result depended on hidden kernel state, and it
# rebound `_`, which stops IPython from updating its output cache.
# Pass a well-defined value instead and discard the second result.
# NOTE(review): assumes labels_a_binario converts each of its two arguments
# independently - confirm against taa_utils.
y_train, _y_unused = labels_a_binario(y_train, y_train)

In [5]:
# Full pipeline: custom column cleaning, mean imputation of the remaining
# NaN values, then the XGBoost classifier with fixed hyper-parameters.
XGB_pipeline = Pipeline(steps=[
    ('Transform', CustomTransformer()),
    ('Imputer', SimpleImputer(strategy='mean')),
    (
        'Classifier',
        XGBClassifier(
            n_estimators=200,
            max_depth=15,
            learning_rate=0.01,
            subsample=0.5,
            colsample_bytree=0.7,
            random_state=42,
        ),
    ),
])

## Entrenamiento

In [6]:
# Fit the whole pipeline on the training data.
xgb_model = XGB_pipeline.fit(X_train, y_train)
# Bare name as the last expression so the notebook displays the fitted object.
xgb_model

In [7]:
# Persist the fitted pipeline to disk.
joblib.dump(value=xgb_model, filename="Proy1_GrupoF_xgb_profs.pkl")

['Proy1_GrupoF_xgb_profs.pkl']

# Correr Modelo Entrenado

### Se puede seleccionar entre el modelo ya generado o el entrenado por ustedes en la segunda celda.

In [8]:
# Load the test data from the CSV file in the working directory.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
# Split the frame with the taa_utils helper into the event ids and the features.
EventID_test, X_test = dropEventID(df_test)

In [9]:
# Model generated beforehand.
# NOTE: joblib files are pickles; only load files from a trusted source.
final_model = joblib.load(filename="Proy1_GrupoF_xgb_pre_entrenado.pkl")

# Model trained by the instructors in this notebook (adjust the file name
# here if it was changed in the dump cell):
# final_model = joblib.load("Proy1_GrupoF_xgb_profs.pkl")

In [10]:
# Predict on the test set and wrap the predictions in a pandas Series.
y_test_pred_xgb = pd.Series(final_model.predict(X_test))
print(f'y_test_pred: \n {y_test_pred_xgb}')

y_test_pred: 
 0         0
1         0
2         0
3         1
4         0
         ..
549995    0
549996    0
549997    0
549998    0
549999    0
Length: 550000, dtype: int32


In [11]:
# Build the Kaggle submission: map the binary predictions back to the
# 's' / 'b' labels and write them with the taa_utils helper.
y_test_cat_xgb = y_test_pred_xgb.map({0: 'b', 1: 's'})
crear_submission_file(
    EventID_test,
    y_test_cat_xgb,
    'Submission_xgb_Grupo_F_profs.csv',
)