In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('default') # make the matplotlib plots look a bit nicer (matplotlib's 'default' style)
#plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid") # set the grid style used by seaborn
pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation
                                                     # in the displayed outputs
import warnings
warnings.filterwarnings('ignore') # no category given: every warning is ignored from here on

In [3]:
# Load the labels, declaring the dtype of the target column up front
# so that damage_grade is read as a categorical.
train_labels = pd.read_csv(
    "train_labels.csv",
    dtype={"damage_grade": 'category'},
)
train_labels.head(1)

Unnamed: 0,building_id,damage_grade
0,802906,3


In [4]:
# (rows, columns) of the labels frame
train_labels.shape

(260601, 2)

In [5]:
# Confirm the dtypes requested in read_csv were applied to the labels frame
train_labels.dtypes

building_id        int64
damage_grade    category
dtype: object

In [6]:
# Check that there are no damage grades outside the 1 - 3 range
# by listing every distinct value together with its frequency.
train_labels['damage_grade'].value_counts()

2    148259
3     87218
1     25124
Name: damage_grade, dtype: int64

In [7]:
# Load the building features with explicit dtypes.
# The boolean flag columns are read as categoricals as well, to avoid
# cluttering the DataFrame with True/False values.
train_values = pd.read_csv(
    "train_values.csv",
    # Every column listed here is mapped to the 'category' dtype.
    dtype=dict.fromkeys(
        (
            # structural / location descriptors
            "land_surface_condition",
            "foundation_type",
            "roof_type",
            "ground_floor_type",
            "other_floor_type",
            "position",
            "plan_configuration",
            # superstructure flags
            "has_superstructure_adobe_mud",
            "has_superstructure_mud_mortar_stone",
            "has_superstructure_stone_flag",
            "has_superstructure_cement_mortar_stone",
            "has_superstructure_mud_mortar_brick",
            "has_superstructure_cement_mortar_brick",
            "has_superstructure_timber",
            "has_superstructure_bamboo",
            "has_superstructure_rc_non_engineered",
            "has_superstructure_rc_engineered",
            "has_superstructure_other",
            # ownership
            "legal_ownership_status",
            # secondary-use flags
            "has_secondary_use",
            "has_secondary_use_agriculture",
            "has_secondary_use_hotel",
            "has_secondary_use_rental",
            "has_secondary_use_institution",
            "has_secondary_use_school",
            "has_secondary_use_industry",
            "has_secondary_use_health_post",
            "has_secondary_use_gov_office",
            "has_secondary_use_use_police",
            "has_secondary_use_other",
        ),
        'category',
    ),
)
train_values.head(1)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# (rows, columns) of the features frame
train_values.shape

(260601, 39)

In [9]:
# Confirm the dtypes requested in read_csv were applied to the features frame
train_values.dtypes

building_id                                  int64
geo_level_1_id                               int64
geo_level_2_id                               int64
geo_level_3_id                               int64
count_floors_pre_eq                          int64
age                                          int64
area_percentage                              int64
height_percentage                            int64
land_surface_condition                    category
foundation_type                           category
roof_type                                 category
ground_floor_type                         category
other_floor_type                          category
position                                  category
plan_configuration                        category
has_superstructure_adobe_mud              category
has_superstructure_mud_mortar_stone       category
has_superstructure_stone_flag             category
has_superstructure_cement_mortar_stone    category
has_superstructure_mud_mortar_b

In [10]:
# Check that the building_id column of the features frame equals the one in the labels frame
train_values['building_id'].equals(train_labels['building_id'])

True

In [11]:
# Check that the building_id columns of both DataFrames are equal
# NOTE(review): this is the same expression as the previous cell — one of the two looks redundant.
train_values['building_id'].equals(train_labels['building_id'])

True

In [12]:
# Combine features and labels into a single frame:
# inner join on the shared building_id key.
train_values_damage = pd.merge(
    train_values,
    train_labels,
    on='building_id',
    how='inner',
)
train_values_damage

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,1,55,6,3,n,r,...,0,0,0,0,0,0,0,0,0,2
260597,669485,17,715,2060,2,0,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
260598,602512,17,51,8163,3,55,6,7,t,r,...,0,0,0,0,0,0,0,0,0,3
260599,151409,26,39,1851,2,10,14,6,t,r,...,0,0,0,0,0,0,0,0,0,2


In [13]:
# Check that the column dtypes were preserved by the merge
train_values_damage.dtypes

building_id                                  int64
geo_level_1_id                               int64
geo_level_2_id                               int64
geo_level_3_id                               int64
count_floors_pre_eq                          int64
age                                          int64
area_percentage                              int64
height_percentage                            int64
land_surface_condition                    category
foundation_type                           category
roof_type                                 category
ground_floor_type                         category
other_floor_type                          category
position                                  category
plan_configuration                        category
has_superstructure_adobe_mud              category
has_superstructure_mud_mortar_stone       category
has_superstructure_stone_flag             category
has_superstructure_cement_mortar_stone    category
has_superstructure_mud_mortar_b

In [14]:
# Check that the percentage columns never exceed 100.
# Each count is the number of rows whose value is above 100 in that column
# (summing the boolean mask counts the True entries).
area_incorrecta = (train_values_damage['area_percentage'] > 100).sum()
# Count on height_percentage itself; the previous version filtered on height
# but then counted the area_percentage column.
altura_incorrecta = (train_values_damage['height_percentage'] > 100).sum()
# Use `and` with explicit parentheses: `&` binds tighter than `==`, so
# `altura_incorrecta == 0 & area_incorrecta == 0` was evaluated as
# `altura_incorrecta == (0 & area_incorrecta) == 0`, which ignores the area
# count entirely (0 & x is always 0).
(altura_incorrecta == 0) and (area_incorrecta == 0)

True

In [15]:
# Sanity-check the family-count data by inspecting the 10 rows
# with the largest count_families values.
train_values_damage.nlargest(10, 'count_families')

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
68031,569833,26,39,9897,2,0,42,7,t,i,...,1,0,0,0,0,0,0,0,0,1
87761,147798,7,157,1763,4,20,57,12,o,i,...,1,0,0,0,0,0,0,0,0,2
220496,1031380,20,158,9229,3,10,17,9,t,i,...,0,1,0,0,0,0,0,0,0,2
257343,331776,10,1397,3396,1,5,11,3,t,r,...,0,0,0,0,0,0,0,0,0,2
171692,930289,26,39,1324,2,25,44,6,t,u,...,1,0,0,0,0,0,0,0,0,1
235262,485597,7,157,7679,3,15,20,9,t,i,...,0,0,0,0,0,0,0,0,0,2
54909,718208,26,39,2005,2,15,23,7,t,r,...,0,1,0,0,0,0,0,0,0,1
97087,106261,3,1387,5122,4,90,9,9,t,r,...,0,0,0,0,0,0,0,0,0,2
114389,299955,17,1393,4475,4,40,28,11,t,r,...,0,0,0,0,0,0,0,0,0,3
125628,266702,4,1334,9215,2,10,14,8,t,i,...,1,0,0,0,0,0,0,0,0,1


In [16]:
# Export the merged features + labels frame to CSV.
# NOTE(review): to_csv writes the row index as an extra leading column by default;
# consider index=False if downstream readers do not expect it — confirm before changing.
train_values_damage.to_csv("train_values_damage.csv")