# Import Data

First, we import the data from CSV. We use the pandas library to read the CSV data and store it in DataFrames.

In [1]:
import sys

# Make the modules under ../src importable from this notebook. The membership
# check keeps the cell idempotent: re-running it does not push another
# duplicate "../src" entry onto sys.path each time.
if "../src" not in sys.path:
    sys.path.append("../src")

# Project-local module; resolved through the sys.path entry added above.
import importer

# Load the raw training values, training labels and test values from ../Data.
raw_train_values, raw_train_labels, raw_test_values = importer.import_data(directory="../Data")

# Print the shapes of the data
print("Train values shape: ", raw_train_values.shape)
print("Train labels shape: ", raw_train_labels.shape)
print("Test values shape: ", raw_test_values.shape)


Train values shape:  (260601, 39)
Train labels shape:  (260601, 2)
Test values shape:  (86868, 39)


# Clean Data

We clean the data by removing rows based on their categorical data. This is a fast implementation of the data cleaning process.

In [2]:
import cleaner

# Run the cleaning step on the three raw objects. The training labels are
# passed in together with the training values, and only two objects come back.
# NOTE(review): presumably the labels are merged into train_data (it has one
# more column than the raw training values in the output below) — confirm in
# cleaner.clean.
train_data, test_data = cleaner.clean(raw_train_values, raw_train_labels, raw_test_values)

# Print the shapes of the new data
print("Train data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

Train data shape:  (260601, 40)
Test data shape:  (86868, 39)


# Encode Data

In [3]:
import encoder

# Encode both data sets in a single call.
# NOTE: train_data and test_data are both the inputs and the outputs here, so
# re-running this cell on its own passes the previous result back into
# encoder.encode. Re-run from the cleaning cell to start from cleaned data.
train_data, test_data = encoder.encode(train_data, test_data)

# Print the shapes of the new data
print("Train data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

# Print the columns of the new data
print("Train data columns: ", train_data.columns)
print("Test data columns: ", test_data.columns)

Train data shape:  (260601, 82)
Test data shape:  (86868, 81)
Train data columns:  Index(['building_id', 'geo_level_1_id_0', 'geo_level_1_id_1',
       'geo_level_1_id_2', 'geo_level_1_id_3', 'geo_level_1_id_4',
       'geo_level_2_id_0', 'geo_level_2_id_1', 'geo_level_2_id_2',
       'geo_level_2_id_3', 'geo_level_2_id_4', 'geo_level_2_id_5',
       'geo_level_2_id_6', 'geo_level_2_id_7', 'geo_level_2_id_8',
       'geo_level_2_id_9', 'geo_level_2_id_10', 'geo_level_3_id_0',
       'geo_level_3_id_1', 'geo_level_3_id_2', 'geo_level_3_id_3',
       'geo_level_3_id_4', 'geo_level_3_id_5', 'geo_level_3_id_6',
       'geo_level_3_id_7', 'geo_level_3_id_8', 'geo_level_3_id_9',
       'geo_level_3_id_10', 'geo_level_3_id_11', 'geo_level_3_id_12',
       'geo_level_3_id_13', 'count_floors_pre_eq', 'age', 'area_percentage',
       'height_percentage', 'land_surface_condition_0',
       'land_surface_condition_1', 'foundation_type_0', 'foundation_type_1',
       'foundation_type_2', 'roof_type

# Resample Data

In [4]:
import resampler

# Resample the training data only; test_data is not passed in and is not
# touched by this cell.
# NOTE: the result is bound back to train_data, so re-running this cell on its
# own resamples the already-resampled data.
train_data = resampler.resample(train_data)

# Create a model

We create a model using the sklearn library. We use XGBoost to create the model.

In [5]:
# Import the project module under an alias so that binding the result to
# `model` below does not replace the module object under the same name.
import model as model_lib

# Build the model from the training data. Later cells read the result through
# the name `model`.
# NOTE(review): presumably XGBoost(train_data) also fits the model — confirm in
# the model module.
model = model_lib.XGBoost(train_data)

# Evaluate the model

In [None]:
import evaluator

# Pass the model and the test data to the evaluator and keep its return value
# under the name `predictions`.
# NOTE(review): judging by its name, print_model_summary presumably also prints
# a summary as a side effect — confirm in the evaluator module.
predictions = evaluator.print_model_summary(model, test_data)

# Submit

In [None]:
import pandas as pd

# Create a data frame with the predictions
predictions = pd.DataFrame(predictions, columns=['damage_grade_0', 'damage_grade_1'])

# Add the building_id column.
# Assign the ids by position (.to_numpy()) instead of as a Series: a Series is
# aligned on index labels, so any difference between the index of
# raw_test_values and the index of predictions would silently produce NaN or
# misplaced ids. Positional assignment relies only on the rows being in the
# same order, and raises a ValueError if the lengths differ.
predictions['building_id'] = raw_test_values['building_id'].to_numpy()

# Rearrange the columns
predictions = predictions[['building_id', 'damage_grade_0', 'damage_grade_1']]

# Save the predictions in a csv file
predictions.to_csv('../Data/predictions.csv', index=False)

# Show the first rows as the cell output.
predictions.head()