In [None]:
# Import the data science libraries used by the cells below.
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics; runs once here so every later plot picks it up

In [None]:
# Load the training labels and the training feature values.
labels = pd.read_csv('../data/earthquake/train_labels.csv')
data = pd.read_csv('../data/earthquake/train_values.csv')

# Merge damage sustained into the feature data, matching rows on building_id.
# pd.merge defaults to an inner join, so only ids present in both files are kept.
combined_data = pd.merge(data, labels, on='building_id')

# A notebook cell renders only its final expression, so the preview goes last
# (the earlier mid-cell .head() calls were evaluated and silently discarded).
combined_data.head()

In [None]:
# initial data analysis: print the info() summary of the raw features,
# the labels, and the merged frame, in that order
for frame in (data, labels, combined_data):
    frame.info()

In [None]:
# Keep only the two columns needed to relate floor count to damage grade.
data_frame = combined_data[['count_floors_pre_eq','damage_grade']]
# info() prints its summary itself; head() must be the last expression of the
# cell to be rendered (placed before info() its result was discarded).
data_frame.info()
data_frame.head()

In [None]:
# Number of buildings per pre-earthquake floor count, split by damage grade.
ax = sb.countplot(x='count_floors_pre_eq', hue='damage_grade', data=data_frame)
# Title the figure so it can be read on its own, like the later plot cells.
ax.set_title("Damage grade by number of floors")

In [None]:
# Find the share of buildings damaged/destroyed for each floor count from 1 to 9.
# damage_grade values 1/2/3 are counted as intact/damaged/collapsed, following
# this notebook's own naming.
# NOTE: the value printed after "Percentage:" is a fraction in [0, 1], not a
# number out of 100; the output format is kept unchanged.
floor_counts = combined_data['count_floors_pre_eq']
damage_grades = combined_data['damage_grade']
for x in range(1,10):
    # Grades of the buildings with exactly x floors. A boolean mask is enough
    # here; deep-copying the whole frame on every iteration was wasted work.
    grades_on_floor = damage_grades[floor_counts == x]
    collapsed = int((grades_on_floor == 3).sum())
    damaged = int((grades_on_floor == 2).sum())
    intact = int((grades_on_floor == 1).sum())
    total = intact+damaged+collapsed
    if total == 0:
        # No building has this floor count: report it instead of raising
        # ZeroDivisionError and aborting the remaining floors.
        print("floors: "+ str(x)+" Percentage: n/a Total: 0 Intact: 0")
        continue
    result = (damaged+collapsed)/total
    print("floors: "+ str(x)+" Percentage: "+str(result)+ " Total: "+str(total)+ " Intact: "+str(intact))

In [None]:
# There is only one building with 8 floors (it was damaged) and one with 9 floors (it was undamaged); these may be outliers,
# or it may be worth investigating why the 8-floor building was damaged while the 9-floor building was not.

In [None]:
# Distribution of pre-earthquake floor counts within each damage grade.
ax = sb.violinplot(x="damage_grade", y="count_floors_pre_eq", data=combined_data)
# Title the figure so it can be read on its own, like the later plot cells.
ax.set_title("Floor count distribution by damage grade")

In [None]:
# Building counts per roof type, with one bar per damage grade.
ax = sb.countplot(
    data=combined_data,
    x="roof_type",
    hue="damage_grade",
)
ax.set_title("Does roof type affect damage?")

In [None]:
# Building counts per foundation type, with one bar per damage grade.
ax = sb.countplot(
    data=combined_data,
    x="foundation_type",
    hue="damage_grade",
)
ax.set_title("Does foundation type affect damage?")

In [None]:
# Damage-grade counts for buildings where has_superstructure_adobe_mud == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_adobe_mud']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to mud buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_mud_mortar_stone == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_mud_mortar_stone']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
# Title names the full material (mud-mortar-stone) so it matches the column plotted.
ax.set_title("Damage to mud-mortar-stone buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_stone_flag == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_stone_flag']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage of stone buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_cement_mortar_stone == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_cement_mortar_stone']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
# Title names the full material (cement-mortar-stone) so this figure cannot be
# mistaken for the cement-mortar-brick one.
ax.set_title("Damage to cement-mortar-stone buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_mud_mortar_brick == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_mud_mortar_brick']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to mud-mortar-brick buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_cement_mortar_brick == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_cement_mortar_brick']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to cement-mortar-brick buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_timber == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_timber']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to timber buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_bamboo == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_bamboo']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
# Title typo fixed ("bambooo" -> "bamboo").
ax.set_title("Damage to bamboo buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_rc_non_engineered == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_rc_non_engineered']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to non-engineered reinforced concrete buildings")

In [None]:
# Damage-grade counts for buildings where has_superstructure_rc_engineered == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
superstructure_data_frame = combined_data[combined_data['has_superstructure_rc_engineered']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to engineered reinforced concrete buildings")

In [None]:
# Damage-grade counts for buildings where has_secondary_use == 1.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
# (The variable keeps the name used by the neighbouring cells, although this
# filter is on secondary use rather than on a superstructure material.)
superstructure_data_frame = combined_data[combined_data['has_secondary_use']==1]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to buildings with secondary use")
# Author's interpretation of the figure, emitted as cell output.
print("buildings with secondary use are generally less damaged than those with one use, better engineered?")

In [None]:
# Damage-grade counts for buildings where has_secondary_use == 0.
# The filter only reads combined_data, so it is applied directly; the previous
# deep copy of the whole frame and the discarded .head() call did nothing useful.
# (The variable keeps the name used by the neighbouring cells, although this
# filter is on secondary use rather than on a superstructure material.)
superstructure_data_frame = combined_data[combined_data['has_secondary_use']==0]
ax = sb.countplot(x="damage_grade", data=superstructure_data_frame)
ax.set_title("Damage to buildings with no secondary use")