In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer

In [2]:
def load_datasets():
    train_values = pd.read_csv('train_values.csv', index_col='building_id')
    train_labels = pd.read_csv( 'train_labels.csv', index_col='building_id')
    test_values = pd.read_csv('test_values.csv', index_col = 'building_id')

    nominal_categorical_values = ['land_surface_condition', 'foundation_type','roof_type','ground_floor_type',
                             'other_floor_type','position','plan_configuration','legal_ownership_status']

    drop_secondary_use_category_list = {'has_secondary_use_other','has_secondary_use_use_police','has_secondary_use_gov_office',
                                       'has_secondary_use_health_post','has_secondary_use_industry','has_secondary_use_school','has_secondary_use_institution',
                                       'has_secondary_use_rental','has_secondary_use_hotel','has_secondary_use_agriculture'}
    
    geo_1_max = 30
    geo_2_max = 1427
    geo_3_max = 12567

    train_values = pd.get_dummies(train_values, columns=nominal_categorical_values, prefix=nominal_categorical_values)
    test_values = pd.get_dummies(test_values, columns=nominal_categorical_values, prefix=nominal_categorical_values)
       
    train_values = train_values.drop(columns = drop_secondary_use_category_list)
    test_values = test_values.drop(columns = drop_secondary_use_category_list)
    
    return train_values,train_labels,test_values


In [3]:
#evaluation for each model 
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring= make_scorer(f1_score), cv=cv, n_jobs=-1)
    return scores

In [4]:
train_values,train_labels,test_values = load_datasets()

In [5]:
test_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,0,1,0,...,0,0,0,0,0,0,0,0,1,0
99355,6,141,11987,2,25,13,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
890251,22,19,10044,2,5,4,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
745817,26,39,633,1,0,19,3,0,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,17,289,7970,3,15,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
train_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
28830,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
94947,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
590882,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
201944,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
train_values.var()

geo_level_1_id                            6.453900e+01
geo_level_2_id                            1.703301e+05
geo_level_3_id                            1.329601e+07
count_floors_pre_eq                       5.294957e-01
age                                       5.411947e+03
area_percentage                           1.929169e+01
height_percentage                         3.680328e+00
has_superstructure_adobe_mud              8.078745e-02
has_superstructure_mud_mortar_stone       1.813908e-01
has_superstructure_stone_flag             3.315361e-02
has_superstructure_cement_mortar_stone    1.790233e-02
has_superstructure_mud_mortar_brick       6.350927e-02
has_superstructure_cement_mortar_brick    6.960327e-02
has_superstructure_timber                 1.899696e-01
has_superstructure_bamboo                 7.778458e-02
has_superstructure_rc_non_engineered      4.077626e-02
has_superstructure_rc_engineered          1.560803e-02
has_superstructure_other                  1.476011e-02
count_fami

In [8]:
X_train = 

from sklearn.decomposition import PCA

pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

SyntaxError: invalid syntax (<ipython-input-8-83fdb9d7129b>, line 1)

In [9]:
from sklearn.ensemble import RandomForestClassifier


In [10]:
# define models to test
def get_models():
	models, names = list(), list()
	models.append(RandomForestClassifier(n_estimators=1000))
	names.append('RF')

	return models, names

In [11]:
models, names = get_models()
results = list()

In [12]:
for i in range(len(models)):
	# evaluate the model and store results
	scores = evaluate_model(train_values, train_labels, models[i])
	results.append(scores)
	# summarize performance
	print('>%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))

KeyboardInterrupt: 