In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import time
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler


In [3]:
# Load the modelling set; fields in the file are separated by '|'.
_data = pd.read_csv('./data/Modelar_UH2020.txt', sep='|')

# Keep the target column aside, then remove it and the row identifier from
# the feature frame.
_labels = _data.CLASE
_data = _data.drop(columns=['CLASE', 'ID'])

# Categorical variable to numeric: the codes are mapped to the integers 0..11
# in the order 9, 8, ..., 1, C, B, A. Any value missing from the table
# (including NaN) becomes NaN and is imputed below.
# NOTE(review): the keys are strings, so this assumes the column is read as
# text rather than as numbers -- confirm against the file.
_data['CADASTRALQUALITYID'] = _data['CADASTRALQUALITYID'].map(
    {code: rank for rank, code in enumerate('987654321CBA')}
)

# Impute NaNs with the column median. The result is assigned back instead of
# calling fillna(inplace=True) on a column selection: that chained in-place
# form is deprecated and has no effect on the frame under pandas Copy-on-Write.
_data['MAXBUILDINGFLOOR'] = _data['MAXBUILDINGFLOOR'].fillna(
    _data['MAXBUILDINGFLOOR'].median()
)
_data['CADASTRALQUALITYID'] = _data['CADASTRALQUALITYID'].fillna(
    _data['CADASTRALQUALITYID'].median()
)

#### Normalizar:

In [23]:
# Standardise every feature column. The scaler returns a plain ndarray, so it
# is wrapped back into a DataFrame with the original column labels (the row
# index is replaced by a fresh default one).
_data = pd.DataFrame(
    data=StandardScaler().fit_transform(_data),
    columns=_data.columns,
)

In [4]:
# Rescale every feature column with a min-max scaler (default range). The
# scaler returns a plain ndarray, so it is wrapped back into a DataFrame with
# the original column labels (the row index is replaced by a fresh default one).
_data = pd.DataFrame(
    data=MinMaxScaler().fit_transform(_data),
    columns=_data.columns,
)

In [53]:
# Class distribution of the full target column. This uses `_labels` (every
# row): `labels` is only assigned further down, where it holds a two-class
# subset, so referencing it here fails on a fresh kernel.
Counter(_labels)

Counter({'RESIDENTIAL': 90173,
         'INDUSTRIAL': 4490,
         'PUBLIC': 2976,
         'OFFICE': 1828,
         'OTHER': 1332,
         'RETAIL': 2093,
         'AGRICULTURE': 338})

## INDUSTRIAL vs RETAIL:

In [5]:
# Binary sub-problem INDUSTRIAL vs RETAIL, evaluated with a 3-fold
# cross-validated random forest.

# Re-attach the target so rows can be filtered by class.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "INDUSTRIAL"]
data_2 = data[data.CLASE == "RETAIL"]
# Stack the two subsets: all INDUSTRIAL rows first, then all RETAIL rows.
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both calls (each call clones it per fold).
# With a fixed random_state and the same integer `cv`, the fold scores and the
# classification report are reproducible and describe identically trained
# forests; unseeded, every run (and each of the two calls) differed.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)  # 42: arbitrary fixed seed
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'INDUSTRIAL': 4490, 'RETAIL': 2093})

[0.82186788 0.82087511 0.81905196]
              precision    recall  f1-score   support

  INDUSTRIAL      0.857     0.883     0.870      4490
      RETAIL      0.731     0.683     0.707      2093

    accuracy                          0.820      6583
   macro avg      0.794     0.783     0.788      6583
weighted avg      0.817     0.820     0.818      6583



## INDUSTRIAL vs PUBLIC:

In [6]:
# Binary sub-problem INDUSTRIAL vs PUBLIC, evaluated with a 3-fold
# cross-validated random forest.

# Re-attach the target so rows can be filtered by class.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "INDUSTRIAL"]
data_2 = data[data.CLASE == "PUBLIC"]
# Stack the two subsets: all INDUSTRIAL rows first, then all PUBLIC rows.
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both calls (each call clones it per fold).
# With a fixed random_state and the same integer `cv`, the fold scores and the
# classification report are reproducible and describe identically trained
# forests; unseeded, every run (and each of the two calls) differed.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)  # 42: arbitrary fixed seed
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'INDUSTRIAL': 4490, 'PUBLIC': 2976})

[0.84732824 0.84652471 0.85610932]
              precision    recall  f1-score   support

  INDUSTRIAL      0.872     0.874     0.873      4490
      PUBLIC      0.809     0.806     0.808      2976

    accuracy                          0.847      7466
   macro avg      0.841     0.840     0.840      7466
weighted avg      0.847     0.847     0.847      7466



## INDUSTRIAL vs OTHER:

In [7]:
# Binary sub-problem INDUSTRIAL vs OTHER, evaluated with a 3-fold
# cross-validated random forest.

# Re-attach the target so rows can be filtered by class.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "INDUSTRIAL"]
data_2 = data[data.CLASE == "OTHER"]
# Stack the two subsets: all INDUSTRIAL rows first, then all OTHER rows.
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both calls (each call clones it per fold).
# With a fixed random_state and the same integer `cv`, the fold scores and the
# classification report are reproducible and describe identically trained
# forests; unseeded, every run (and each of the two calls) differed.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)  # 42: arbitrary fixed seed
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'INDUSTRIAL': 4490, 'OTHER': 1332})

[0.90108192 0.89026275 0.90773196]
              precision    recall  f1-score   support

  INDUSTRIAL      0.919     0.956     0.937      4490
       OTHER      0.828     0.717     0.769      1332

    accuracy                          0.901      5822
   macro avg      0.874     0.836     0.853      5822
weighted avg      0.898     0.901     0.899      5822



## INDUSTRIAL vs RESIDENTIAL:

In [9]:
# Binary sub-problem INDUSTRIAL vs RESIDENTIAL, evaluated with a 3-fold
# cross-validated random forest.

# Re-attach the target so rows can be filtered by class.
data = pd.concat([_data, _labels], axis=1)
data_1 = data[data.CLASE == "INDUSTRIAL"]
data_2 = data[data.CLASE == "RESIDENTIAL"]
# Stack the two subsets: all INDUSTRIAL rows first, then all RESIDENTIAL rows.
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop(columns='CLASE')
X = data.values
y = labels.values
print(Counter(y))
print()

# One seeded estimator is shared by both calls (each call clones it per fold).
# With a fixed random_state and the same integer `cv`, the fold scores and the
# classification report are reproducible and describe identically trained
# forests; unseeded, every run (and each of the two calls) differed.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)  # 42: arbitrary fixed seed
cv_results = cross_validate(rf, X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(rf, X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'RESIDENTIAL': 90173, 'INDUSTRIAL': 4490})

[0.98136587 0.98111174 0.98035114]
              precision    recall  f1-score   support

  INDUSTRIAL      0.919     0.665     0.772      4490
 RESIDENTIAL      0.984     0.997     0.990     90173

    accuracy                          0.981     94663
   macro avg      0.951     0.831     0.881     94663
weighted avg      0.980     0.981     0.980     94663

