In [None]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing
from sklearn.metrics import f1_score

# PREPARO CSV


In [None]:
# Load the training targets (a `damage_grade` for each `building_id`) and preview them.
labels=pd.read_csv("train_labels.csv")
labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [None]:
# Load the training features (one row per `building_id`) and preview them.
values=pd.read_csv("train_values.csv")
values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# True if any cell of the feature table is missing.
values.isnull().values.any()

False

In [None]:
# Drop rows with any missing value and rebind `values` to the result.
# The null check above reported False, so no rows are expected to be removed.
values = values.dropna()

In [None]:
# True if any cell of the label table is missing.
labels.isnull().values.any()

False

In [None]:
# Inspect column dtypes and non-null counts before any dtype conversion.
values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [None]:
# Columns that hold labels rather than quantities. `geo_level_1_id` is read
# from the CSV as an integer but is treated as a categorical code here.
to_be_categorized = [
    "land_surface_condition", "foundation_type", "roof_type",
    "position", "ground_floor_type", "other_floor_type",
    "plan_configuration", "legal_ownership_status", "geo_level_1_id",
]
# Convert all of them to pandas' `category` dtype in one call.
values = values.astype({column: "category" for column in to_be_categorized})
values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int64   
 1   geo_level_1_id                          260601 non-null  category
 2   geo_level_2_id                          260601 non-null  int64   
 3   geo_level_3_id                          260601 non-null  int64   
 4   count_floors_pre_eq                     260601 non-null  int64   
 5   age                                     260601 non-null  int64   
 6   area_percentage                         260601 non-null  int64   
 7   height_percentage                       260601 non-null  int64   
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [None]:
# Shrink every signed-integer column to the narrowest signed dtype that can
# hold all of its values (int8 at the smallest). `pd.to_numeric` checks both
# the minimum and the maximum, so a column that does not fit in int32 is left
# as int64 instead of being truncated, and negative values are handled too.
integer_columns = values.select_dtypes(
    include=["int8", "int16", "int32", "int64"]
).columns
for column in integer_columns:
    values[column] = pd.to_numeric(values[column], downcast="integer")
values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   building_id                             260601 non-null  int32   
 1   geo_level_1_id                          260601 non-null  category
 2   geo_level_2_id                          260601 non-null  int16   
 3   geo_level_3_id                          260601 non-null  int16   
 4   count_floors_pre_eq                     260601 non-null  int8    
 5   age                                     260601 non-null  int16   
 6   area_percentage                         260601 non-null  int8    
 7   height_percentage                       260601 non-null  int8    
 8   land_surface_condition                  260601 non-null  category
 9   foundation_type                         260601 non-null  category
 10  roof_type                       

In [None]:
# Inspect the label table's dtypes and memory usage before downcasting.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int64
 1   damage_grade  260601 non-null  int64
dtypes: int64(2)
memory usage: 4.0 MB


In [None]:
# Narrow both label columns in a single call: ids to 32-bit, grades to 8-bit.
labels = labels.astype({"building_id": np.int32, "damage_grade": np.int8})
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   building_id   260601 non-null  int32
 1   damage_grade  260601 non-null  int8 
dtypes: int32(1), int8(1)
memory usage: 1.2 MB


In [None]:
# Attach the target to the features by building id, then discard the id so
# that it is not used as a predictor.
important_values = (
    values
    .merge(labels, on="building_id")
    .drop(columns=["building_id"])
)

In [None]:
# Inspect the merged modelling table (features plus `damage_grade`).
important_values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   geo_level_1_id                          260601 non-null  category
 1   geo_level_2_id                          260601 non-null  int16   
 2   geo_level_3_id                          260601 non-null  int16   
 3   count_floors_pre_eq                     260601 non-null  int8    
 4   age                                     260601 non-null  int16   
 5   area_percentage                         260601 non-null  int8    
 6   height_percentage                       260601 non-null  int8    
 7   land_surface_condition                  260601 non-null  category
 8   foundation_type                         260601 non-null  category
 9   roof_type                               260601 non-null  category
 10  ground_floor_type               

In [None]:
# Separate predictors from the target, then hold out 15% of the rows for
# evaluation. The seed is fixed so the partition is the same on every run.
features = important_values.drop(columns='damage_grade')
target = important_values['damage_grade']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.15, random_state=123
)


def feature_engieniere(df):
    """Return a copy of ``df`` extended with derived building features.

    Columns added:

    * ``volume_percentage``: ``height_percentage * area_percentage``.
    * ``less_than_25_years``: True where ``age`` is below 25.
    * ``more_than_20_metres``: True where ``height_percentage`` is above 20
      (the flag is derived from ``height_percentage``; the name is kept as-is).
    * ``geo_{i}_{stat}_avg`` for ``i`` in 1..3: the mean of height, area, age,
      families, volume and floors over the rows of ``df`` that share the same
      ``geo_level_{i}_id``.

    The group means are computed only from the rows of the frame passed in,
    so calling this separately on two frames yields different group
    statistics for the same geo id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``height_percentage``, ``area_percentage``, ``age``,
        ``count_families``, ``count_floors_pre_eq`` and
        ``geo_level_{1,2,3}_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified.
    """
    # Work on a copy so the caller's frame (often a slice of a larger one)
    # is never mutated.
    df = df.copy()

    # Widen before multiplying: with narrow integer inputs (e.g. int8) the
    # product would silently wrap around.
    df["volume_percentage"] = (
        df["height_percentage"].astype(np.int32)
        * df["area_percentage"].astype(np.int32)
    )
    # The comparisons follow the column names (the original conditions were
    # the logical opposite of what the names state).
    df["less_than_25_years"] = df["age"] < 25
    df["more_than_20_metres"] = df["height_percentage"] > 20

    # (suffix used in the new column name, source column), in output order.
    averaged = [
        ("height", "height_percentage"),
        ("area", "area_percentage"),
        ("age", "age"),
        ("families", "count_families"),
        ("volume", "volume_percentage"),
        ("floors", "count_floors_pre_eq"),
    ]
    for i in [1, 2, 3]:
        geo_key = df[f"geo_level_{i}_id"]
        for suffix, source in averaged:
            # observed=True restricts categorical keys to the categories
            # actually present in the data.
            df[f"geo_{i}_{suffix}_avg"] = (
                df[source].groupby(geo_key, observed=True).transform("mean")
            )
    return df

#X_train = feature_engieniere(X_train)
#X_test = feature_engieniere(X_test)

# Split the feature columns by how they are encoded.
category_cols = X_train.select_dtypes(include=['category', "bool"]).columns.to_list()
numeric_cols = X_train.select_dtypes(include=['int16', 'int8', "float64"]).columns.to_list()
# Everything that is not one-hot encoded is passed through unchanged, in its
# original column order.
passthrough_cols = [col for col in X_train.columns if col not in category_cols]

# One-hot encode the categorical columns. Categories that were not seen during
# fit are encoded as all zeros. sparse_threshold=0 forces a dense result so it
# can be wrapped in a DataFrame below.
preprocessor = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), category_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit on the training split only; the test split reuses the learned categories.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; support both.
onehot = preprocessor.named_transformers_['onehot']
if hasattr(onehot, "get_feature_names_out"):
    encoded_cat = onehot.get_feature_names_out(category_cols)
else:
    encoded_cat = onehot.get_feature_names(category_cols)

# ColumnTransformer emits the transformer outputs first and the passthrough
# columns last, so the labels must follow that same order. A dedicated name is
# used so that the `labels` DataFrame loaded earlier is not overwritten.
feature_names = np.concatenate([encoded_cat, passthrough_cols])

X_train_prep = pd.DataFrame(X_train_prep, columns=feature_names)
X_test_prep  = pd.DataFrame(X_test_prep, columns=feature_names)
X_train_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221510 entries, 0 to 221509
Data columns (total 98 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   geo_level_2_id                          221510 non-null  float64
 1   geo_level_3_id                          221510 non-null  float64
 2   count_floors_pre_eq                     221510 non-null  float64
 3   age                                     221510 non-null  float64
 4   area_percentage                         221510 non-null  float64
 5   height_percentage                       221510 non-null  float64
 6   has_superstructure_adobe_mud            221510 non-null  float64
 7   has_superstructure_mud_mortar_stone     221510 non-null  float64
 8   has_superstructure_stone_flag           221510 non-null  float64
 9   has_superstructure_cement_mortar_stone  221510 non-null  float64
 10  has_superstructure_mud_mortar_brick     2215

In [None]:
# Hyper-parameter grid: 2 values for each of 4 parameters = 16 candidates.
param_grid = {'n_estimators'  : [350,300],
              'max_depth'     : [9,8],
              'subsample'     : [0.6, 0.8],
              'learning_rate' : [0.15, 0.45]
             }

# Exhaustive search scored by accuracy with a seeded, shuffled 3-fold split
# (n_repeats=1, so the folds are generated once). refit=True retrains the best
# candidate on the full training set.
grid = GridSearchCV(
        estimator  = GradientBoostingClassifier(random_state=123),
        param_grid = param_grid,
        scoring    = 'accuracy',
        n_jobs     = -1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123),
        refit      = True,
        verbose    = 1,
        return_train_score = True
       )

grid.fit(X = X_train_prep, y = y_train)

# Keep the refitted best candidate.
modelo_final = grid.best_estimator_

# Top 4 candidates by mean CV score. This expression is deliberately the last
# statement of the cell: a notebook only displays the final expression.
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
# Best hyper-parameters found by cross-validation
# ==============================================================================
banner = "----------------------------------------"
print(banner, "Mejores hiperparámetros encontrados (cv)", banner, sep="\n")
# Winning parameter set, its mean CV score and the scoring metric used.
print(grid.best_params_, ":", grid.best_score_, grid.scoring)

In [None]:
# Final model with hand-picked hyper-parameters; this replaces the estimator
# taken from the grid search. subsample < 1 makes the fit stochastic, so the
# seed is fixed (same value as elsewhere in the notebook) to keep the reported
# metrics and the submission reproducible.
modelo_final = GradientBoostingClassifier(n_estimators = 305,
                                  max_depth = 9,
                                  min_samples_split = 2,
                                  min_samples_leaf = 3,
                                  subsample=0.6,
                                  verbose=True,
                                  learning_rate=0.15,
                                  random_state=123)
modelo_final.fit(X = X_train_prep, y = y_train)

# Evaluate on the held-out split.
predicciones = modelo_final.predict(X = X_test_prep)
mat_confusion = confusion_matrix(
                    y_true    = y_test,
                    y_pred    = predicciones
                )

# normalize=True returns the fraction (not the count) of correct predictions.
accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = predicciones,
            normalize = True
           )

print("Matriz de confusión")
print("-------------------")
print(mat_confusion)
print("")
print(f"El accuracy de test es: {100 * accuracy} %")

In [None]:
# Micro-averaged F1 of the final model on the held-out split.
y_preds = modelo_final.predict(X_test_prep)
f1_score(y_test, y_preds, average='micro')

In [None]:
# Load the unlabeled test features, indexed by building id. Only a preview is
# shown, as for the other tables, rather than the whole frame.
test_values = pd.read_csv('test_values.csv', index_col = "building_id")
test_values.head()

In [None]:
# Mirror the dtype preparation applied to the training features.
test_values = test_values.astype({column: "category" for column in to_be_categorized})

# Downcast integer columns to the narrowest signed dtype that holds their
# values; `pd.to_numeric` checks both bounds, so nothing is truncated.
integer_columns = test_values.select_dtypes(
    include=["int8", "int16", "int32", "int64"]
).columns
for column in integer_columns:
    test_values[column] = pd.to_numeric(test_values[column], downcast="integer")

# Encode with the transformer that was fitted on the training split. Fitting a
# second encoder on the test set would derive its columns from the test
# categories, so their number and order need not match what the model was
# trained on. Reusing the fitted transformer, and the training column labels,
# guarantees identical layout.
test_values_reduce = pd.DataFrame(
    preprocessor.transform(test_values),
    columns=X_train_prep.columns,
)

test_values_reduce.info()

In [None]:
# Predict a damage grade for every row of the prepared test features.
preds = modelo_final.predict(test_values_reduce)
# The submission template supplies the output column name(s) and the building_id index.
submission_format = pd.read_csv("submission_format.csv", index_col = "building_id")
# NOTE(review): predictions are attached positionally, which assumes the template
# lists buildings in the same order as test_values.csv. Confirm.
my_submission = pd.DataFrame(data=preds,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [None]:
# Write the submission file, then show its first lines through the shell
# (`!` is an IPython shell escape, not Python syntax).
my_submission.to_csv('gradient_boosting_model_1.csv')
!head gradient_boosting_model_1.csv