In [15]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Lectura de datos

In [2]:
# Load the labelled training set; rows are indexed by the ID column.
# NOTE(review): CADASTRALQUALITYID holds both digit and letter codes. If the
# parser ever infers mixed int/str values for it, digit codes read as integers
# would miss the string keys of the recoding below and silently become NaN --
# consider passing dtype={'CADASTRALQUALITYID': str} after confirming.
data = pd.read_csv('/home/jose/Escritorio/datathon/src/data/train.txt', sep='|', index_col='ID')
# test = pd.read_csv('/home/jose/Escritorio/datathon/src/data/test.txt', sep='|', index_col='ID')

# Select the target by name (it used to be taken by position while being
# dropped by name), so features and labels stay consistent whatever the
# column order of the file is.
labels = data['CLASE']
data = data.drop(columns='CLASE')

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
train, test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=123)

# Stack both partitions again (train rows first) so the preprocessing below is
# applied once; they are separated later by position.
data = pd.concat([train, test], sort=False)

# Recode the quality codes: '9' -> '0', '8' -> '1', ..., '1' -> '8',
# then 'C' -> '9', 'B' -> '10', 'A' -> '11'. Values outside this table become NaN.
quality_recoding = {code: str(rank) for rank, code in enumerate('987654321CBA')}
data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].map(quality_recoding)
data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].astype('category')

# Tratamiento de NaN

In [3]:
# Impute missing values: median for the numeric floor count, most frequent
# category for the cadastral quality.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `data[col]`: that form operates on an
# intermediate object, emits a warning on pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
# NOTE(review): at this point `data` contains the train and the test rows, so
# both statistics also use the held-out rows.
data['MAXBUILDINGFLOOR'] = data['MAXBUILDINGFLOOR'].fillna(data['MAXBUILDINGFLOOR'].median())
data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].fillna(data['CADASTRALQUALITYID'].mode()[0])

# Preprocesado

In [4]:
# Standardise every numeric column and put the category/object columns back
# next to them, untouched.
sc = StandardScaler()

numeric_part = data.select_dtypes(['number'])
scaled_part = pd.DataFrame(sc.fit_transform(numeric_part),
                           index=data.index,
                           columns=numeric_part.columns)
other_part = data.select_dtypes(['category', 'object'])

data = pd.concat([scaled_part, other_part], axis=1, sort=False)

# The train rows were stacked first, so the two partitions are recovered by position.
n_train_rows = train.shape[0]
train, test = data.iloc[:n_train_rows], data.iloc[n_train_rows:]

# Modelo

In [5]:
# Baseline: k-nearest neighbours with the library defaults (only n_jobs is set),
# fitted on the training partition and evaluated on the held-out one.
model = KNeighborsClassifier(n_jobs=-1).fit(train, y_train)
y_pred = model.predict(test)

In [6]:
# Share of held-out rows whose predicted class matches the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9043398236946624

In [7]:
# Per-class precision, recall and F1 on the held-out rows, with four decimals.
report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

 AGRICULTURE     0.7705    0.5875    0.6667        80
  INDUSTRIAL     0.6835    0.5928    0.6349       889
      OFFICE     0.3624    0.1414    0.2034       382
       OTHER     0.5862    0.3160    0.4106       269
      PUBLIC     0.4486    0.1388    0.2120       598
 RESIDENTIAL     0.9260    0.9886    0.9563     18018
      RETAIL     0.6327    0.1512    0.2441       410

    accuracy                         0.9043     20646
   macro avg     0.6300    0.4166    0.4754     20646
weighted avg     0.8804    0.9043    0.8846     20646



In [8]:
# Confusion matrix on the held-out rows (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[   47    17     1     0     1    14     0]
 [    4   527    25    11    11   298    13]
 [    0    87    54     2    11   226     2]
 [    0    25     1    85     9   143     6]
 [    1    24    14    21    83   450     5]
 [    7    69    43    19    57 17813    10]
 [    2    22    11     7    13   293    62]]


# Ajuste de hiperparámetros

In [10]:
# Hyperparameter tuning for the k-NN classifier.
# RUN_GRID_SEARCH switches between running the 5-fold grid search and reusing
# the best parameters recorded from a previous run. The switch used to be the
# name `params`, which was then rebound to the grid itself; it now has its own name.
RUN_GRID_SEARCH = True

if RUN_GRID_SEARCH:
    # 'n_jobs' has a single candidate: it is not tuned, it only travels with
    # best_params_ so the final model is built with it.
    params = {'n_neighbors': [1, 3, 5, 7, 11, 13, 15],
              'weights': ['uniform', 'distance'],
              'metric': ['minkowski', 'manhattan'],
              'n_jobs': [-1]}

    # Candidates are ranked by macro-averaged F1.
    grid = GridSearchCV(KNeighborsClassifier(), params, cv=5,
                        scoring=make_scorer(f1_score, average='macro'), n_jobs=-1)
    grid.fit(train, y_train)

    best_params = grid.best_params_
    print(best_params)
else:
    best_params = {'weights': 'distance', 'n_neighbors': 3, 'n_jobs': -1, 'metric': 'manhattan'}

# Both branches end the same way: fit a fresh classifier with the chosen
# parameters on the whole training partition.
model = KNeighborsClassifier(**best_params)
model.fit(train, y_train)



{'metric': 'manhattan', 'n_jobs': -1, 'n_neighbors': 3, 'weights': 'distance'}


# Prueba del modelo ajustado

In [11]:
# Predict the held-out rows with the classifier currently bound to `model`.
y_pred = model.predict(X=test)

In [12]:
# Accuracy of the current predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9054054054054054

In [13]:
# Per-class metrics for the current predictions, four decimals.
tuned_report = classification_report(y_true=y_test, y_pred=y_pred, digits=4)
print(tuned_report)

              precision    recall  f1-score   support

 AGRICULTURE     0.8382    0.7125    0.7703        80
  INDUSTRIAL     0.7092    0.6007    0.6504       889
      OFFICE     0.3586    0.2225    0.2746       382
       OTHER     0.6000    0.4126    0.4890       269
      PUBLIC     0.4529    0.2090    0.2860       598
 RESIDENTIAL     0.9352    0.9812    0.9577     18018
      RETAIL     0.4550    0.2463    0.3196       410

    accuracy                         0.9054     20646
   macro avg     0.6213    0.4836    0.5354     20646
weighted avg     0.8866    0.9054    0.8928     20646



In [14]:
# Confusion matrix for the current predictions (rows: true class, columns: predicted class).
tuned_confusion = confusion_matrix(y_test, y_pred)
print(tuned_confusion)

[[   57     7     0     0     0    15     1]
 [    4   534    38    13    13   259    28]
 [    1    76    85     1    18   188    13]
 [    0    13     6   111    14   119     6]
 [    1    26    19    21   125   392    14]
 [    5    79    75    29    91 17680    59]
 [    0    18    14    10    15   252   101]]


# Prueba XGB

In [16]:
# Prepare the features for XGBoost: the cadastral quality goes from a
# category of digit strings to a standardised numeric column.
data = pd.concat([train, test], sort=False)

# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].astype(int)

# NOTE(review): this re-fits the scaler `sc` that was fitted on the numeric
# columns earlier, so its previous statistics are discarded.
ncol = sc.fit_transform((data['CADASTRALQUALITYID'].values).reshape(-1, 1))
# fit_transform returns an (n, 1) array; flatten it before storing it as a column.
data['CADASTRALQUALITYID'] = ncol.ravel()

# Train rows come first in `data`, so split back by position.
train, test = data.iloc[:train.shape[0], ], data.iloc[train.shape[0]:, ]

In [17]:
# Gradient-boosted trees with the library defaults (only n_jobs is set).
# Recent XGBoost releases require the class labels passed to
# XGBClassifier.fit to be integers 0..n_classes-1 and reject string labels,
# so the targets are encoded for training and the predictions are decoded
# back. `y_pred` therefore still holds the original class names and stays
# comparable with `y_test`.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

model = XGBClassifier(n_jobs=-1)
model.fit(train, y_train_encoded)

# NOTE: `model.predict` itself now yields integer class ids; decode them with
# `label_encoder.inverse_transform` wherever class names are needed.
y_pred = label_encoder.inverse_transform(model.predict(test))

In [18]:
# Accuracy of the XGBoost predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8994962704640124

In [19]:
# Per-class precision, recall and F1 for the XGBoost predictions, four decimals.
xgb_report = classification_report(y_test, y_pred, digits=4)
print(xgb_report)

              precision    recall  f1-score   support

 AGRICULTURE     0.8649    0.4000    0.5470        80
  INDUSTRIAL     0.6762    0.4792    0.5609       889
      OFFICE     0.4667    0.0183    0.0353       382
       OTHER     0.5942    0.3048    0.4029       269
      PUBLIC     0.5260    0.1355    0.2154       598
 RESIDENTIAL     0.9126    0.9944    0.9518     18018
      RETAIL     0.6410    0.0610    0.1114       410

    accuracy                         0.8995     20646
   macro avg     0.6688    0.3419    0.4035     20646
weighted avg     0.8733    0.8995    0.8713     20646



In [20]:
# Confusion matrix for the XGBoost predictions (rows: true class, columns: predicted class).
xgb_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(xgb_confusion)

[[   32    10     0     0     2    36     0]
 [    1   426     3     8     9   433     9]
 [    0    68     7     3    13   290     1]
 [    0    20     1    82    14   152     0]
 [    1    23     1    19    81   471     2]
 [    2    49     1    25    21 17918     2]
 [    1    34     2     1    14   333    25]]
