In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [3]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from lightgbm import LGBMClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

<h2>Obtengo la data</h2>

In [4]:
train_values_tar_enc = pd.read_csv('data/Target Encoding/train_Target_enc.csv', index_col='building_id')
test_values_tar_enc = pd.read_csv('data/Target Encoding/test_Target_enc.csv', index_col='building_id')
train_values_lab_enc = pd.read_csv('data/train_label_enc.csv', index_col='building_id')
test_values_lab_enc = pd.read_csv('data/test_label_enc.csv', index_col='building_id')
train_labels = pd.read_csv('data/train_labels.csv', index_col='building_id')

In [5]:
train_values_tar_enc.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_use_police,has_secondary_use_other,land_surface_condition_t,foundation_type_t,roof_type_t,ground_floor_type_t,other_floor_type_t,position_t,plan_configuration_t,legal_ownership_status_t
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,525291.207011,526036.097182,525377.373596,525775.91985,525362.679536,523354.378054,525560.934091,525833.039627
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,523144.803391,526036.097182,525377.373596,524673.896933,525362.679536,526141.281172,525560.934091,525833.039627
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,525291.207011,526036.097182,525377.373596,525775.91985,526204.981104,523354.378054,525560.934091,525833.039627
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,525291.207011,526036.097182,525377.373596,525775.91985,526204.981104,526141.281172,525560.934091,525833.039627
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,525291.207011,526036.097182,525377.373596,525775.91985,526204.981104,526141.281172,525560.934091,525833.039627


In [6]:
train_values_lab_enc.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [7]:
train_values_lab_enc.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,1,2,0,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,2,2,0,...,0,0,0,0,0,0,0,0,0,0


<h2>Me quedo con los siguientes features:</h2>

In [8]:
train_values_lab_enc.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [9]:
train_values_lab_enc.count()

geo_level_1_id                            260601
geo_level_2_id                            260601
geo_level_3_id                            260601
count_floors_pre_eq                       260601
age                                       260601
area_percentage                           260601
height_percentage                         260601
land_surface_condition                    260601
foundation_type                           260601
roof_type                                 260601
ground_floor_type                         260601
other_floor_type                          260601
position                                  260601
plan_configuration                        260601
has_superstructure_adobe_mud              260601
has_superstructure_mud_mortar_stone       260601
has_superstructure_stone_flag             260601
has_superstructure_cement_mortar_stone    260601
has_superstructure_mud_mortar_brick       260601
has_superstructure_cement_mortar_brick    260601
has_superstructure_t

In [10]:
#selected_features = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'other_floor_type',
#                     'area_percentage', 'age', 'foundation_type',
#                     'height_percentage']

selected_features = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status']

selected_features_tar_enc = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition_t', 'foundation_type_t', 'roof_type_t',
       'ground_floor_type_t', 'other_floor_type_t', 'position_t',
       'plan_configuration_t', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other']

In [11]:
train_values_lab_enc_subset = train_values_lab_enc[selected_features]
test_values_lab_enc_subset = test_values_lab_enc[selected_features]

In [12]:
train_values_tar_enc_subset = train_values_tar_enc[selected_features_tar_enc]
test_values_tar_enc_subset = test_values_tar_enc[selected_features_tar_enc]

In [13]:
train_values_lab_enc_subset.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,2,2,0,...,0,0,0,0,0,0,0,0,0,2
28830,8,900,2812,2,10,8,7,1,2,0,...,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,2,2,0,...,0,0,0,0,0,0,0,0,0,2
590882,22,418,10694,2,10,6,5,2,2,0,...,0,0,0,0,1,1,0,0,0,2
201944,11,131,1488,3,30,8,9,2,2,0,...,0,0,0,0,0,0,0,0,0,2


In [14]:
train_values_tar_enc_subset.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition_t,foundation_type_t,roof_type_t,...,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,525291.207011,526036.097182,525377.373596,...,1,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,523144.803391,526036.097182,525377.373596,...,1,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,525291.207011,526036.097182,525377.373596,...,1,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,525291.207011,526036.097182,525377.373596,...,1,0,0,0,0,1,1,0,0,0
201944,11,131,1488,3,30,8,9,525291.207011,526036.097182,525377.373596,...,0,0,0,0,0,0,0,0,0,0


<h2>KNN</h2>

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
pipe = make_pipeline(KNeighborsClassifier())
pipe

Pipeline(steps=[('kneighborsclassifier', KNeighborsClassifier())])

In [17]:
knn_model = KNeighborsClassifier()

leaf_size = list(range(1,50))
n_neighbors = list(range(1,20))
p=[1,2]

#Convert to dictionary
knn_hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors)

<h3>Target Enc Unico Caso</h3>

In [17]:
knn_model.fit(train_values_tar_enc_subset, train_labels.values.ravel())

KNeighborsClassifier()

In [None]:
knn_preds = knn_model.predict(train_values_tar_enc_subset)
f1_score(train_labels, knn_preds, average='micro')

<h3>Label Enc Unico Caso</h3>

In [None]:
knn_model.fit(train_values_lab_enc_subset, train_labels.values.ravel())

In [None]:
knn_preds = knn_model.predict(train_values_lab_enc_subset)
f1_score(train_labels, knn_preds, average='micro')

<h2>Pruebo con grid seacrh</h2>

In [18]:
gsearch = GridSearchCV(knn_model, knn_hyperparameters, cv=5)
gsearch.fit(train_values_lab_enc_subset, train_labels.values.ravel())

KeyboardInterrupt: 

In [None]:
gsearch.best_params_

In [None]:
in_sample_preds = gsearch.predict(train_values_lab_enc_subset)
f1_score(train_labels, in_sample_preds, average='micro')

<h2>Pruebo con Randomize Search</h2>

In [23]:
from sklearn.model_selection import RandomizedSearchCV

In [24]:
rand = RandomizedSearchCV(knn_model, knn_hyperparameters, cv=5, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(train_values_lab_enc_subset, train_labels.values.ravel())

AttributeError: 'RandomizedSearchCV' object has no attribute 'grid_scores_'

In [25]:
rand.best_params_

{'n_neighbors': 8, 'leaf_size': 43}

In [26]:
in_sample_preds = rand.predict(train_values_lab_enc_subset)
f1_score(train_labels, in_sample_preds, average='micro')

0.7688688838492561

<h2>Prediccion de test_values y creacion de la submission </h2>

In [78]:
predictions = knn_model.predict(test_values_lab_enc_subset)

In [27]:
rand_knn_model_with_best_hyperparams = rand
predictions = knn_model_with_best_hyperparams.predict(test_values_lab_enc_subset)

In [29]:
submission_format = pd.read_csv('uploads/submission_format.csv', index_col='building_id')

In [30]:
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [31]:
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,2


In [33]:
my_submission.to_csv('submission_knn.csv')

In [83]:
#For downloading in Colab
from google.colab import files
files.download('submission_knn.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<h2>PRUEBAS</h2>

In [29]:
knn_model_with_best_hyperparams = KNeighborsClassifier(n_neighbors= 8, leaf_size=43)

In [30]:
selected_feats = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

In [31]:
train_values = train_values_lab_enc[selected_feats]
test_values = test_values_lab_enc[selected_feats]

In [36]:
knn_model_with_best_hyperparams.fit(train_values_lab_enc, train_labels.values.ravel())

KNeighborsClassifier(leaf_size=43, n_neighbors=8)

In [37]:
knn_preds = knn_model_with_best_hyperparams.predict(train_values_lab_enc)
f1_score(train_values_lab_enc, knn_preds, average='micro')

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and multiclass targets