In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import time
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler


In [69]:
# Load the pipe-separated modelling file. CADASTRALQUALITYID is forced to
# string dtype: the mapping below uses string keys, and chunked type inference
# in read_csv could otherwise turn digit-only chunks into numbers that would
# silently fail to match (and end up median-imputed).
_data = pd.read_csv('./data/Modelar_UH2020.txt', sep='|',
                    dtype={'CADASTRALQUALITYID': str})

# Keep the target column aside, then remove target and identifier from the
# feature frame.
_labels = _data.CLASE
_data = _data.drop(columns=['CLASE', 'ID'])

# Categorical variable to numeric: codes are mapped to the integers 0..11.
# Any code not listed here becomes NaN and is imputed below.
_data['CADASTRALQUALITYID'] = _data['CADASTRALQUALITYID'].map({'9': 0,
                               '8': 1,
                               '7': 2,
                               '6': 3,
                               '5': 4,
                               '4': 5,
                               '3': 6,
                               '2': 7,
                               '1': 8,
                               'C': 9,
                               'B': 10,
                               'A': 11})

# Impute NaNs with the column median. Assign the result back instead of
# calling fillna(inplace=True) on a column selection: the in-place form acts
# on an intermediate object and does not update the frame under pandas
# Copy-on-Write.
_data['MAXBUILDINGFLOOR'] = _data['MAXBUILDINGFLOOR'].fillna(
    _data['MAXBUILDINGFLOOR'].median())
_data['CADASTRALQUALITYID'] = _data['CADASTRALQUALITYID'].fillna(
    _data['CADASTRALQUALITYID'].median())

#### Normalizar:

In [23]:
# Standardise every feature column and rebuild a DataFrame that keeps the
# original column names (the result gets a fresh default index).
# NOTE(review): the following cell min-max scales the same frame, and this
# cell's execution count (In [23]) predates the reload in In [69], so it looks
# like it was not part of the recorded run -- confirm whether it is still needed.
_data = pd.DataFrame(
    data=StandardScaler().fit_transform(_data),
    columns=_data.columns,
)

In [70]:
# Rescale every feature column with MinMaxScaler (default range) and rebuild a
# DataFrame that keeps the original column names (fresh default index).
_data = pd.DataFrame(
    data=MinMaxScaler().fit_transform(_data),
    columns=_data.columns,
)

In [53]:
# Class distribution over the whole dataset. Use `_labels` (the full target
# column): `labels` is only assigned by the pairwise cells further down, so
# referencing it here fails on a fresh kernel.
Counter(_labels)

Counter({'RESIDENTIAL': 90173,
         'INDUSTRIAL': 4490,
         'PUBLIC': 2976,
         'OFFICE': 1828,
         'OTHER': 1332,
         'RETAIL': 2093,
         'AGRICULTURE': 338})

## OFFICE vs RETAIL:

In [71]:
# Binary problem OFFICE vs RETAIL: keep only the rows of these two classes
# (OFFICE rows first, then RETAIL rows) and split back into features/target.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RETAIL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both CV helpers (each clones it per fold).
# The fixed random_state makes the numbers reproducible and ensures the
# per-fold scores and the out-of-fold report describe identically built forests.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'RETAIL': 2093, 'OFFICE': 1828})

[0.76358072 0.75133894 0.75669472]
              precision    recall  f1-score   support

      OFFICE      0.729     0.753     0.741      1828
      RETAIL      0.778     0.755     0.766      2093

    accuracy                          0.754      3921
   macro avg      0.753     0.754     0.753      3921
weighted avg      0.755     0.754     0.754      3921



## OFFICE vs PUBLIC:

In [72]:
# Binary problem OFFICE vs PUBLIC: keep only the rows of these two classes
# (OFFICE rows first, then PUBLIC rows) and split back into features/target.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "PUBLIC"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both CV helpers (each clones it per fold).
# The fixed random_state makes the numbers reproducible and ensures the
# per-fold scores and the out-of-fold report describe identically built forests.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'PUBLIC': 2976, 'OFFICE': 1828})

[0.78589263 0.76202374 0.76702061]
              precision    recall  f1-score   support

      OFFICE      0.711     0.666     0.688      1828
      PUBLIC      0.803     0.833     0.818      2976

    accuracy                          0.770      4804
   macro avg      0.757     0.750     0.753      4804
weighted avg      0.768     0.770     0.768      4804



## OFFICE vs OTHER:

In [73]:
# Binary problem OFFICE vs OTHER: keep only the rows of these two classes
# (OFFICE rows first, then OTHER rows) and split back into features/target.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "OTHER"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both CV helpers (each clones it per fold).
# The fixed random_state makes the numbers reproducible and ensures the
# per-fold scores and the out-of-fold report describe identically built forests.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'OFFICE': 1828, 'OTHER': 1332})

[0.88519924 0.88698955 0.88034188]
              precision    recall  f1-score   support

      OFFICE      0.892     0.902     0.897      1828
       OTHER      0.863     0.851     0.857      1332

    accuracy                          0.880      3160
   macro avg      0.878     0.876     0.877      3160
weighted avg      0.880     0.880     0.880      3160



## OFFICE vs INDUSTRIAL:

In [74]:
# Binary problem OFFICE vs INDUSTRIAL: keep only the rows of these two classes
# (OFFICE rows first, then INDUSTRIAL rows) and split back into features/target.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "INDUSTRIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both CV helpers (each clones it per fold).
# The fixed random_state makes the numbers reproducible and ensures the
# per-fold scores and the out-of-fold report describe identically built forests.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'INDUSTRIAL': 4490, 'OFFICE': 1828})

[0.82526116 0.83238367 0.81528965]
              precision    recall  f1-score   support

  INDUSTRIAL      0.855     0.905     0.879      4490
      OFFICE      0.728     0.622     0.671      1828

    accuracy                          0.823      6318
   macro avg      0.791     0.764     0.775      6318
weighted avg      0.818     0.823     0.819      6318



## OFFICE vs AGRICULTURE:

In [75]:
# Binary problem OFFICE vs AGRICULTURE: keep only the rows of these two classes
# (OFFICE rows first, then AGRICULTURE rows) and split back into features/target.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "AGRICULTURE"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both CV helpers (each clones it per fold).
# The fixed random_state makes the numbers reproducible and ensures the
# per-fold scores and the out-of-fold report describe identically built forests.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'OFFICE': 1828, 'AGRICULTURE': 338})

[0.97783934 0.96952909 0.97506925]
              precision    recall  f1-score   support

 AGRICULTURE      0.950     0.905     0.927       338
      OFFICE      0.983     0.991     0.987      1828

    accuracy                          0.978      2166
   macro avg      0.966     0.948     0.957      2166
weighted avg      0.978     0.978     0.978      2166



## OFFICE vs RESIDENTIAL:

In [76]:
# Binary problem OFFICE vs RESIDENTIAL: keep only the rows of these two classes
# (OFFICE rows first, then RESIDENTIAL rows) and split back into features/target.
# The printed class counts show how unbalanced this pair is, so read the
# per-class recall in the report rather than the overall accuracy.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RESIDENTIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both CV helpers (each clones it per fold).
# The fixed random_state makes the numbers reproducible and ensures the
# per-fold scores and the out-of-fold report describe identically built forests.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'RESIDENTIAL': 90173, 'OFFICE': 1828})

[0.98385887 0.98418495 0.98392409]
              precision    recall  f1-score   support

      OFFICE      0.799     0.251     0.382      1828
 RESIDENTIAL      0.985     0.999     0.992     90173

    accuracy                          0.984     92001
   macro avg      0.892     0.625     0.687     92001
weighted avg      0.981     0.984     0.980     92001

