In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

Definimos algunas funciones

In [10]:
def encodingByMean(values, columnData, setDatos):
    """Frequency-encode one categorical column.

    Every row receives the share of rows (computed over the selected dataset)
    that carry the same category as that row. Rows whose category is not
    listed in ``values`` receive 0.

    Parameters
    ----------
    values : list
        Category labels to encode.
    columnData : str
        Name of the categorical column to encode.
    setDatos : str or pandas.DataFrame
        ``'set_train'`` selects the global ``set_train``; any other string
        selects the global ``test_values``. A DataFrame may also be passed
        directly, in which case it is used as the source and no global is read.

    Returns
    -------
    pandas.Series
        Float series named ``'sumatory'``, indexed like the source frame.
    """
    if isinstance(setDatos, pd.DataFrame):
        source = setDatos
    elif setDatos == 'set_train':
        source = set_train
    else:
        source = test_values

    labels = list(values)
    # Align the indicator columns by label instead of by position: a label
    # that never occurs in the data becomes an all-zero column rather than
    # raising a "columns must be same length as key" error.
    indicators = (
        pd.get_dummies(source[columnData])
        .reindex(columns=labels, fill_value=0)
        .astype(float)
    )

    # The mean of a 0/1 indicator is the relative frequency of that category;
    # each row has at most one active indicator, so the row sum selects the
    # frequency of the row's own category.
    encoded = indicators.mul(indicators.mean(), axis=1).sum(axis=1)
    return encoded.rename('sumatory')

In [11]:
def merge_per_concat(dataframe1, dataframe2, mergeEn):
    """Append the non-key columns of ``dataframe2`` to ``dataframe1``.

    The rows of ``dataframe1`` are matched to ``dataframe2`` with a left join
    on the key columns, and the columns contributed by ``dataframe2`` are
    concatenated to the right of ``dataframe1``.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Frame to enrich; its rows, row order and index are preserved.
    dataframe2 : pandas.DataFrame
        Lookup frame holding the key columns plus the columns to add. Each
        key combination must appear at most once.
    mergeEn : str or list of str
        Key column name(s) present in both frames.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``dataframe1`` followed by the
        non-key columns of ``dataframe2``.

    Raises
    ------
    pandas.errors.MergeError
        If ``dataframe2`` contains duplicated key combinations.
    """
    keys = [mergeEn] if isinstance(mergeEn, str) else list(mergeEn)

    # validate='many_to_one' guarantees the join cannot add rows, so the
    # result stays row-for-row aligned with dataframe1.
    merged = dataframe1[keys].merge(
        dataframe2, on=keys, how='left', validate='many_to_one'
    )

    # merge() returns a fresh 0..n-1 index while concat(axis=1) aligns on the
    # index; restore the original index so rows line up for any index.
    merged.index = dataframe1.index

    added_columns = [col for col in merged.columns if col not in keys]
    return pd.concat([dataframe1, merged[added_columns]], axis=1)

Leemos los archivos

In [12]:
# Load the raw inputs. The two training files are read with default options;
# the test file uses its building_id column as the index.
train_values, train_labels = (
    pd.read_csv(csv_name) for csv_name in ('train_values.csv', 'train_labels.csv')
)
test_values = pd.read_csv('test_values.csv', index_col='building_id')

Realizamos un merge entre train values y train labels

In [13]:
# Join features and labels on their shared column(s), then discard the
# building identifier so it is not carried along as a feature.
set_train = train_values.merge(train_labels).drop(columns=["building_id"])

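Creamos nuevos features. Agrupamos por la combinación de los 3 niveles de Geo Level y calculamos el promedio de la edad, el área y la altura de cada grupo. Repetimos el mismo cálculo agrupando por la combinación de tipos de construcción (foundation_type, roof_type, ground_floor_type y other_floor_type).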

In [14]:
# Stack train rows first and test rows after them under a fresh 0..n-1 index.
# NOTE(review): this discards the building_id index of test_values, so the
# test rows can only be told apart from train rows by position from here on.
all_data = pd.concat([set_train, test_values], axis=0, ignore_index=True)

In [15]:
# Key sets used to group buildings: by location and by construction type.
geo_keys = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
construction_keys = ['foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type']


def _grouped_mean(frame, keys, target, new_name):
    """Return one row per `keys` combination holding the mean of `target` as `new_name`."""
    return frame.groupby(keys)[target].mean().rename(new_name).reset_index()


# Per-group means, computed over train and test rows together.
geolevel_age_mean = _grouped_mean(all_data, geo_keys, 'age', 'geolevel_grouped_age_mean')
geolevel_height_mean = _grouped_mean(all_data, geo_keys, 'height_percentage', 'geolevel_grouped_height_mean')
geolevel_area_mean = _grouped_mean(all_data, geo_keys, 'area_percentage', 'geolevel_grouped_area_mean')

construction_type_age_mean = _grouped_mean(all_data, construction_keys, 'age', 'construction_type_grouped_age_mean')
construction_type_height_mean = _grouped_mean(all_data, construction_keys, 'height_percentage', 'construction_type_grouped_height_mean')
construction_type_area_mean = _grouped_mean(all_data, construction_keys, 'area_percentage', 'construction_type_grouped_area_mean')

# Attach each statistic to every row; the order below fixes the column order.
for group_stats, group_keys in (
    (geolevel_age_mean, geo_keys),
    (geolevel_height_mean, geo_keys),
    (geolevel_area_mean, geo_keys),
    (construction_type_age_mean, construction_keys),
    (construction_type_height_mean, construction_keys),
    (construction_type_area_mean, construction_keys),
):
    all_data = merge_per_concat(all_data, group_stats, group_keys)

In [16]:
# Recover the training rows, which were stacked first in all_data. The row
# count is taken from the current set_train instead of a hard-coded number,
# so a different input file size cannot silently mis-split the data.
n_train_rows = len(set_train)
# .copy() makes set_train an independent frame, so adding columns later does
# not operate on a slice of all_data.
set_train = all_data.iloc[:n_train_rows, :].copy()

In [17]:
# Recover the test rows, which were stacked last in all_data. The row count
# is taken from the current test_values instead of hard-coded bounds.
n_test_rows = len(test_values)
# .copy() makes test_values an independent frame, so adding columns later
# does not operate on a slice of all_data.
# NOTE(review): rows keep the positional index of all_data; the original
# building_id index is not restored here.
test_values = all_data.iloc[len(all_data) - n_test_rows:, :].copy()

Creamos el feature Base Condition: la suma de las codificaciones por frecuencia de foundation_type y land_surface_condition.

In [18]:
# base_condition adds the frequency encoding of the foundation type to the
# frequency encoding of the land surface condition, per dataset.
foundation_levels = ['h', 'i', 'r', 'u', 'w']
surface_levels = ['n', 'o', 't']

train_foundation_freq = encodingByMean(foundation_levels, 'foundation_type', 'set_train')
train_surface_freq = encodingByMean(surface_levels, 'land_surface_condition', 'set_train')
set_train['base_condition'] = train_foundation_freq + train_surface_freq

test_foundation_freq = encodingByMean(foundation_levels, 'foundation_type', 'test_values')
test_surface_freq = encodingByMean(surface_levels, 'land_surface_condition', 'test_values')
test_values['base_condition'] = test_foundation_freq + test_surface_freq

Creamos el feature Volume Percentage

In [19]:
# volume_percentage is the product of the area and height percentages,
# added to both datasets.
for dataset in (set_train, test_values):
    dataset['volume_percentage'] = dataset['area_percentage'] * dataset['height_percentage']

Codificamos las variables categóricas: reemplazamos cada una por la frecuencia relativa de su categoría y generamos además columnas de One Hot Encoding.

In [20]:
# Categorical columns of the training set with the category labels each one
# is expected to contain. The list order fixes the order in which the one-hot
# columns are appended.
train_category_levels = [
    ('land_surface_condition', ['n', 'o', 't']),
    ('foundation_type', ['h', 'i', 'r', 'u', 'w']),
    ('roof_type', ['n', 'q', 'x']),
    ('ground_floor_type', ['f', 'm', 'v', 'x', 'z']),
    ('other_floor_type', ['j', 'q', 's', 'x']),
    ('position', ['j', 'o', 's', 't']),
    ('plan_configuration', ['a', 'c', 'd', 'f', 'm', 'n', 'o', 'q', 's', 'u']),
]

for column, levels in train_category_levels:
    # Build the indicators from the raw labels BEFORE the column is replaced
    # by its frequency encoding. Taking dummies of the already-encoded floats
    # orders the columns by frequency, so a name such as roof_type_n would
    # not necessarily describe category 'n'. reindex() aligns by label and
    # yields an all-zero column for a label that never occurs.
    indicators = pd.get_dummies(set_train[column]).reindex(columns=levels, fill_value=0)

    # Replace the categorical column by the frequency of each row's category.
    set_train[column] = encodingByMean(levels, column, 'set_train')

    # Append one indicator column per label, named <column>_<label>.
    for level in levels:
        set_train[column + '_' + level] = indicators[level]

In [21]:
# Categorical columns of the test set with the category labels each one is
# expected to contain. The list order fixes the order in which the one-hot
# columns are appended.
test_category_levels = [
    ('land_surface_condition', ['n', 'o', 't']),
    ('foundation_type', ['h', 'i', 'r', 'u', 'w']),
    ('roof_type', ['n', 'q', 'x']),
    ('ground_floor_type', ['f', 'm', 'v', 'x', 'z']),
    ('other_floor_type', ['j', 'q', 's', 'x']),
    ('position', ['j', 'o', 's', 't']),
    ('plan_configuration', ['a', 'c', 'd', 'f', 'm', 'n', 'o', 'q', 's', 'u']),
]

for column, levels in test_category_levels:
    # Build the indicators from the raw labels BEFORE the column is replaced
    # by its frequency encoding. Taking dummies of the already-encoded floats
    # orders the columns by frequency, so a name such as roof_type_n would
    # not necessarily describe category 'n' and could differ from the
    # training set. reindex() aligns by label and yields an all-zero column
    # for a label that never occurs in the test rows.
    indicators = pd.get_dummies(test_values[column]).reindex(columns=levels, fill_value=0)

    # Replace the categorical column by the frequency of each row's category.
    # NOTE(review): the frequency is measured on the test rows themselves, so
    # the same category can map to a different value than in the training set.
    test_values[column] = encodingByMean(levels, column, 'test_values')

    # Append one indicator column per label, named <column>_<label>.
    for level in levels:
        test_values[column + '_' + level] = indicators[level]

Seleccionamos los features a utilizar en nuestro modelo

In [22]:
# Raw and engineered numeric attributes.
base_cols = [
    'age',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'count_families',
    'base_condition',
    'volume_percentage',
    'area_percentage',
    'height_percentage',
]

# Per-group means attached to every row.
grouped_mean_cols = [
    'geolevel_grouped_age_mean',
    'geolevel_grouped_height_mean',
    'geolevel_grouped_area_mean',
    'construction_type_grouped_age_mean',
    'construction_type_grouped_height_mean',
    'construction_type_grouped_area_mean',
]

# Secondary-use flags.
secondary_use_cols = [
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

# Superstructure flags.
superstructure_cols = [
    'has_superstructure_rc_engineered',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_timber',
    'has_superstructure_adobe_mud',
    'has_superstructure_bamboo',
    'has_superstructure_other',
]

# One-hot columns, one group per categorical variable.
one_hot_cols = [
    'land_surface_condition_n', 'land_surface_condition_o', 'land_surface_condition_t',
    'foundation_type_h', 'foundation_type_i', 'foundation_type_r', 'foundation_type_u', 'foundation_type_w',
    'roof_type_n', 'roof_type_q', 'roof_type_x',
    'ground_floor_type_f', 'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x', 'ground_floor_type_z',
    'other_floor_type_j', 'other_floor_type_q', 'other_floor_type_s', 'other_floor_type_x',
    'position_j', 'position_o', 'position_s', 'position_t',
    'plan_configuration_a', 'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f',
    'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o', 'plan_configuration_q',
    'plan_configuration_s', 'plan_configuration_u',
]

# Final column selection; the order below is the column order of the export,
# and the label column damage_grade comes last.
features = (
    base_cols
    + grouped_mean_cols
    + ['count_floors_pre_eq']
    + secondary_use_cols
    + superstructure_cols
    + one_hot_cols
    + ['damage_grade']
)

Creamos los archivos de set train y test values, listos para utilizar en nuestros modelos

In [23]:
# Training export: only the selected columns, without the row index.
train_export = set_train[features]
train_export.to_csv('data_set.csv', index=False)

# Test export: every column, with the row index written as the first column.
test_values.to_csv('data_test.csv', index=True)