In [222]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_validate,KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [223]:
# Read the two training files with pandas' default integer index.
train_values, train_labels = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv')
)
# The test features are read with building_id as the index instead.
test_values = pd.read_csv('test_values.csv', index_col='building_id')

In [224]:
# Join features and labels (merge without `on` uses the columns the two frames
# have in common), then remove the building_id column from the result.
set_train = train_values.merge(train_labels).drop(columns=["building_id"])

In [225]:
# Values above a fixed threshold (age > 100, count_floors_pre_eq > 5) are
# replaced by the column mean. Each mean is taken from the unmodified column,
# so it still includes the values that are about to be replaced.
# NOTE(review): the cells shown here apply no matching replacement to the test
# frame - confirm that this is intended.
promedio = set_train['age'].mean()
set_train['age'] = [
    promedio if edad > 100 else edad
    for edad in set_train['age'].values.tolist()
]

promedio2 = set_train['count_floors_pre_eq'].mean()
set_train['count_floors_pre_eq'] = [
    promedio2 if pisos > 5 else pisos
    for pisos in set_train['count_floors_pre_eq'].values.tolist()
]

In [226]:
def encoding(values, columnData, setDatos):
    """Frequency-encode one categorical column.

    Every row receives the share of rows that carry the same category as the
    row itself: the one-hot indicators of the column are weighted by their own
    column mean and summed per row. Rows whose value is missing get 0.0,
    because pd.get_dummies emits no indicator for NaN.

    Parameters
    ----------
    values : sequence
        One label per distinct category of the column. The labels themselves do
        not influence the result; the length is validated so that a mismatch
        with the data is reported clearly.
    columnData : str
        Name of the categorical column to encode.
    setDatos : str or pandas.DataFrame
        A DataFrame is encoded directly. For backward compatibility the string
        'set_train' selects the module-level ``set_train`` frame and any other
        string selects the module-level ``test_values`` frame.

    Returns
    -------
    pandas.Series
        Float series named 'sumatory', indexed like the source frame.

    Raises
    ------
    ValueError
        If ``values`` does not contain exactly one label per distinct category.
    """
    if isinstance(setDatos, pd.DataFrame):
        source = setDatos
    elif setDatos == 'set_train':
        source = set_train
    else:
        source = test_values

    # Cast to float so the arithmetic does not depend on the indicator dtype
    # (bool or uint8, depending on the pandas version).
    dummies = pd.get_dummies(source[columnData]).astype(float)
    if len(values) != dummies.shape[1]:
        raise ValueError(
            "encoding: %d label(s) given for column %r, which has %d distinct "
            "categories" % (len(values), columnData, dummies.shape[1])
        )

    # Exactly one indicator per row is 1, so the row sum equals the relative
    # frequency of that row's category.
    sumatory = dummies.mul(dummies.mean(), axis='columns').sum(axis=1)
    return sumatory.rename('sumatory')

In [227]:
# data_test is a second name for the very same DataFrame object as test_values
# (plain assignment makes no copy), so columns added through data_test also
# appear on test_values.
data_test = test_values

In [228]:
# One-hot encode the categorical building attributes of both frames.
#
# Each source column maps to its category codes, listed in the order in which
# the indicator columns are appended to the frame. An indicator column is named
# '<source column>_<category code>'.
# NOTE(review): the codes are taken from the suffixes of the column names this
# notebook already used - confirm they are the complete set of raw values.
ONE_HOT_CATEGORIES = {
    'roof_type': ['n', 'q', 'x'],
    'ground_floor_type': ['f', 'm', 'v', 'x', 'z'],
    'other_floor_type': ['j', 'q', 's', 'x'],
    'foundation_type': ['h', 'i', 'r', 'u', 'w'],
    'land_surface_condition': ['n', 'o', 't'],
    'position': ['j', 'o', 's', 't'],
    'plan_configuration': ['d', 'u', 's', 'q', 'm', 'c', 'a', 'n', 'f', 'o'],
}


def add_one_hot_columns(frame, source_column, categories):
    """Append one indicator column per category to ``frame`` (in place).

    The source column is cast to a categorical with exactly ``categories``
    before encoding. That ties every indicator to its category by value, not by
    the sorted position pd.get_dummies would otherwise use, so a column such as
    'plan_configuration_d' really flags the value 'd'. It also yields an
    all-zero indicator for a category that is absent from ``frame``, so both
    frames always receive the same set of columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; modified in place.
    source_column : str
        Name of the categorical column to encode.
    categories : sequence of str
        Expected raw values, in the order the new columns are appended.

    Raises
    ------
    ValueError
        If the column holds a non-missing value that is not in ``categories``.
    """
    observed = set(frame[source_column].dropna().unique())
    unexpected = observed - set(categories)
    if unexpected:
        raise ValueError(
            "Unexpected value(s) in column %r: %s"
            % (source_column, sorted(map(str, unexpected)))
        )

    typed = frame[source_column].astype(pd.CategoricalDtype(categories=list(categories)))
    # For a categorical input get_dummies emits one column per declared
    # category, in the declared order.
    dummies = pd.get_dummies(typed)
    dummies.columns = [source_column + '_' + category for category in categories]
    frame[list(dummies.columns)] = dummies


for frame in (set_train, data_test):
    for source_column, categories in ONE_HOT_CATEGORIES.items():
        add_one_hot_columns(frame, source_column, categories)


In [230]:
# Columns selected from set_train for the training export, in output order.
columnas_seleccionadas = (
    # Columns used under their original names.
    [
        'age',
        'geo_level_1_id',
        'geo_level_2_id',
        'geo_level_3_id',
        'count_floors_pre_eq',
        'area_percentage',
        'height_percentage',
        'count_families',
    ]
    # has_secondary_use* columns.
    + [
        'has_secondary_use',
        'has_secondary_use_agriculture',
        'has_secondary_use_hotel',
        'has_secondary_use_rental',
        'has_secondary_use_institution',
        'has_secondary_use_school',
        'has_secondary_use_industry',
        'has_secondary_use_health_post',
        'has_secondary_use_gov_office',
        'has_secondary_use_use_police',
        'has_secondary_use_other',
    ]
    # has_superstructure_* columns.
    + [
        'has_superstructure_rc_engineered',
        'has_superstructure_rc_non_engineered',
        'has_superstructure_cement_mortar_brick',
        'has_superstructure_cement_mortar_stone',
        'has_superstructure_mud_mortar_brick',
        'has_superstructure_mud_mortar_stone',
        'has_superstructure_stone_flag',
        'has_superstructure_timber',
        'has_superstructure_adobe_mud',
        'has_superstructure_bamboo',
        'has_superstructure_other',
    ]
    # Indicator columns derived from roof_type.
    + ['roof_type_n', 'roof_type_q', 'roof_type_x']
    # Indicator columns derived from ground_floor_type.
    + [
        'ground_floor_type_f',
        'ground_floor_type_m',
        'ground_floor_type_v',
        'ground_floor_type_x',
        'ground_floor_type_z',
    ]
    # Indicator columns derived from other_floor_type.
    + [
        'other_floor_type_j',
        'other_floor_type_q',
        'other_floor_type_s',
        'other_floor_type_x',
    ]
    # Indicator columns derived from foundation_type.
    + [
        'foundation_type_h',
        'foundation_type_i',
        'foundation_type_r',
        'foundation_type_u',
        'foundation_type_w',
    ]
    # Indicator columns derived from land_surface_condition.
    + [
        'land_surface_condition_n',
        'land_surface_condition_o',
        'land_surface_condition_t',
    ]
    # Indicator columns derived from position.
    + ['position_j', 'position_o', 'position_s', 'position_t']
    # Indicator columns derived from plan_configuration.
    + [
        'plan_configuration_d',
        'plan_configuration_u',
        'plan_configuration_s',
        'plan_configuration_q',
        'plan_configuration_m',
        'plan_configuration_c',
        'plan_configuration_a',
        'plan_configuration_n',
        'plan_configuration_f',
        'plan_configuration_o',
    ]
    # Last column of the export.
    + ['damage_grade']
)

In [231]:
#data_test[['position_j', 'position_o', 'position_s', 'position_t']] = pd.get_dummies(test_values['position'])

In [232]:
#data_test[['owner_a', 'owner_r', 'owner_v', 'owner_w']]= pd.get_dummies(test_values['legal_ownership_status'])

In [233]:
# Training export: only the selected columns, written without the index.
train_export = set_train[columnas_seleccionadas]
train_export.to_csv('data_set.csv', index=False)

# Test export: every column of data_test, written together with its index.
data_test.to_csv('data_test.csv')