In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import time
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler
import xgboost as xgb

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [2]:
_data = pd.read_csv('./data/Modelar_UH2020.txt', sep='|')
_labels = _data.CLASE
_data.drop('CLASE', axis=1, inplace=True)
_data.drop('ID', axis=1, inplace=True)

# Variable categórica a numérica:
_data['CADASTRALQUALITYID'] = _data['CADASTRALQUALITYID'].map({'9': 0,
                               '8': 1,
                               '7': 2,
                               '6': 3,
                               '5': 4,
                               '4': 5,
                               '3': 6,
                               '2': 7,
                               '1': 8,
                               'C': 9,
                               'B': 10,
                               'A': 11})

# Imputación de NANs con la mediana:
_data['MAXBUILDINGFLOOR'].fillna(_data['MAXBUILDINGFLOOR'].median(), inplace=True)
_data['CADASTRALQUALITYID'].fillna(_data['CADASTRALQUALITYID'].median(), inplace=True)

# Normalización:
#_data = pd.DataFrame( StandardScaler().fit_transform(_data), columns=_data.columns)
_data = pd.DataFrame( MinMaxScaler().fit_transform(_data), columns=_data.columns)

## OFFICE vs RETAIL:

In [10]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RETAIL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values
print(Counter(y))

Counter({'RETAIL': 2093, 'OFFICE': 1828})


In [11]:
data.shape

(3921, 54)

In [73]:
# XGBoost:

cv_pred = cross_val_predict(xgb.XGBClassifier(), X, y, cv=5)
print(classification_report(y, cv_pred, digits=4))

              precision    recall  f1-score   support

      OFFICE     0.7193    0.7500    0.7343      1828
      RETAIL     0.7732    0.7444    0.7585      2093

    accuracy                         0.7470      3921
   macro avg     0.7463    0.7472    0.7464      3921
weighted avg     0.7481    0.7470    0.7472      3921



In [15]:
# RandomForest:

cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1, n_estimators=500), X, y, cv=5)
print(classification_report(y, cv_pred, digits=4))

              precision    recall  f1-score   support

      OFFICE     0.7453    0.7620    0.7536      1828
      RETAIL     0.7880    0.7726    0.7802      2093

    accuracy                         0.7677      3921
   macro avg     0.7667    0.7673    0.7669      3921
weighted avg     0.7681    0.7677    0.7678      3921



In [66]:
# KNN:

for k in range(3,30):
    cv_pred = cross_val_predict(KNeighborsClassifier(n_jobs=-1, n_neighbors=k, metric='manhattan', weights='distance'), X, y, cv=5)
    #print(classification_report(y, cv_pred, digits=4))
    print(k, accuracy_score(y,cv_pred))
    
# cv_pred = cross_val_predict(KNeighborsClassifier(n_jobs=-1, n_neighbors=10, metric='manhattan', weights='distance'), X, y, cv=5)
# print(classification_report(y, cv_pred, digits=4))

3 0.7324662076001021
4 0.7347615404233614
5 0.7426676868145882
6 0.7454730935985718
7 0.7436878347360367
8 0.7464932415200204
9 0.7503187962254527
10 0.7521040550879877
11 0.7503187962254527
12 0.7475133894414691
13 0.7464932415200204
14 0.7459831675592961
15 0.7444529456771232
16 0.7441979086967611
17 0.7439428717163988
18 0.742412649834226
19 0.7421576128538638
20 0.7439428717163988
21 0.742412649834226
22 0.7413925019127774
23 0.7396072430502423
24 0.7380770211680694
25 0.7362917623055343
26 0.7416475388931395
27 0.7385870951287937
28 0.7398622800306044
29 0.7378219841877072


#### Comparación características **color** vs geom

In [54]:
color_col = _data.columns[2:46]
geom_col = _data.columns[0:2]
geom_col = geom_col.append(_data.columns[46:54])

In [74]:
X = data[geom_col].values

In [75]:
# RandomForest:

cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=5)
print(classification_report(y, cv_pred, digits=4))

              precision    recall  f1-score   support

      OFFICE     0.7030    0.7330    0.7177      1828
      RETAIL     0.7578    0.7296    0.7434      2093

    accuracy                         0.7312      3921
   macro avg     0.7304    0.7313    0.7306      3921
weighted avg     0.7323    0.7312    0.7314      3921



In [63]:
# KNN:

for k in range(3,30):
    cv_pred = cross_val_predict(KNeighborsClassifier(n_jobs=-1, n_neighbors=k, metric='manhattan', weights='distance'), X, y, cv=5)
    #print(classification_report(y, cv_pred, digits=4))
    print(k, accuracy_score(y,cv_pred))
    
# cv_pred = cross_val_predict(KNeighborsClassifier(n_jobs=-1, n_neighbors=10, metric='manhattan', weights='distance'), X, y, cv=5)
# print(classification_report(y, cv_pred, digits=4))

3 0.7118082121907676
4 0.7087477684264218
5 0.7130833970925784
6 0.7158888038765621
7 0.7248150981892374
8 0.7237949502677888
9 0.7237949502677888
10 0.722264728385616
11 0.7217546544248916
12 0.722264728385616
13 0.7220096914052537
14 0.7227748023463402
15 0.7245600612088753
16 0.7212445804641673
17 0.7243050242285132
18 0.724049987248151
19 0.7266003570517725
20 0.7245600612088753
21 0.7202244325427187
22 0.7202244325427187
23 0.7184391736801836
24 0.7194593216016323
25 0.7181841366998215
26 0.7186942106605457
27 0.7161438408569243
28 0.7186942106605457
29 0.7156337668962


In [76]:
X = data[color_col].values

In [64]:
for k in range(3,30):
    cv_pred = cross_val_predict(KNeighborsClassifier(n_jobs=-1, n_neighbors=k, metric='manhattan', weights='distance'), X, y, cv=5)
    #print(classification_report(y, cv_pred, digits=4))
    print(k, accuracy_score(y,cv_pred))

3 0.6508543738842132
4 0.667176740627391
5 0.6618209640397857
6 0.6824789594491201
7 0.7000765110941086
8 0.6965059933690385
9 0.6962509563886764
10 0.7008416220351951
11 0.7026268808977302
12 0.7031369548584545
13 0.7003315480744708
14 0.7041571027799031
15 0.7036470288191787
16 0.6982912522315736
17 0.7031369548584545
18 0.7087477684264218
19 0.7054322876817138
20 0.700586585054833
21 0.6995664371333843
22 0.7036470288191787
23 0.7044121397602653
24 0.7046671767406274
25 0.7054322876817138
26 0.705687324662076
27 0.7064524356031625
28 0.7074725835246111
29 0.709002805406784


In [77]:
# RandomForest:

cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=5)
print(classification_report(y, cv_pred, digits=4))

              precision    recall  f1-score   support

      OFFICE     0.6722    0.7347    0.7020      1828
      RETAIL     0.7478    0.6871    0.7161      2093

    accuracy                         0.7093      3921
   macro avg     0.7100    0.7109    0.7091      3921
weighted avg     0.7125    0.7093    0.7096      3921



#### Selección de características:

In [3]:
def feature_selection(data,labels,model):
    X = data.values
    y = labels.values
    cv_pred = cross_val_predict(model, X, y, cv=5)
    best = accuracy_score(y, cv_pred)
    print("Punto de partida(todas las características)", best)

    mejoria = True

    while(mejoria == True):
        mejoria = False
        best_dict = {}
        for c in data:
            print("Eliminanmos: ",c)

            data_temp = data.copy()
            data_temp.drop(labels=[c], axis=1, inplace = True)

            X = data_temp.values
            cv_pred = cross_val_predict(model, X, y, cv=5)
            temp = accuracy_score(y, cv_pred)
            #temp = validacion_cruzada_sc(knn,X,y,skf)
            #print("-> Resultado: ",temp)

            if(temp > best):
                best_dict[(temp)] = c 
                print(c, "  mejora: ", (temp-best) )
                mejoria = True

        if (mejoria):
            # Obtenemos mayor valor:
            best = sorted(best_dict, reverse=True)[0]

            # Columna asociada a dicho valor:
            col =best_dict[best]

            # Eliminamos la columna correspondiente al valor maximo:
            data.drop(labels=[col], axis=1, inplace = True)
            print("=======> Eliminanmos: ",col, " -> score: ", best)

def feature_square(data,labels,model):       
    X = data.values
    y = labels.values
    cv_pred = cross_val_predict(model, X, y, cv=5)
    best = accuracy_score(y, cv_pred)
    print("Punto de partida(todas las características)", best)

    mejoria = True

    while(mejoria == True):
        mejoria = False
        best_dict = {}
        for c in data:
            #print("Eliminanmos: ",c)

            data_temp = data.copy()
            # Generamos la nueva variable (manteniendo el signo original)
            name = c+'2'
            data_temp[name] = data_temp[c]**2

            X = data_temp.values
        
            cv_pred = cross_val_predict(model, X, y, cv=5)
            temp = accuracy_score(y, cv_pred)
            print("Añadimos: ",name, ", Resultado: ",temp)

            if(temp > best):
                best_dict[(temp)] = c 
                print("--------------> ",name, "  mejora: ", (temp-best) )
                mejoria = True

        if (mejoria):
            # Obtenemos mayor valor:
            best = sorted(best_dict, reverse=True)[0]

            # Columna asociada a dicho valor:
            col =best_dict[best]

            # Añadimos la columna correspondiente al valor maximo:
            name = col+'2'
            data[name] = data[col]**2
           
            print("===========> Añadimos: ",name, " --------------> score: ", best)
            print(' ')

In [103]:
data_record = data.copy()

In [123]:
data= data_record.copy()

In [19]:
data.shape

(3921, 54)

In [20]:

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_selection(data,labels,model)

Punto de partida(todas las características) 0.7697016067329763
Eliminanmos:  X
Eliminanmos:  Y
Eliminanmos:  Q_R_4_0_0
Eliminanmos:  Q_R_4_0_1
Eliminanmos:  Q_R_4_0_2
Eliminanmos:  Q_R_4_0_3
Eliminanmos:  Q_R_4_0_4
Eliminanmos:  Q_R_4_0_5
Eliminanmos:  Q_R_4_0_6
Eliminanmos:  Q_R_4_0_7
Eliminanmos:  Q_R_4_0_8
Eliminanmos:  Q_R_4_0_9
Eliminanmos:  Q_R_4_1_0
Eliminanmos:  Q_G_3_0_0
Eliminanmos:  Q_G_3_0_1
Eliminanmos:  Q_G_3_0_2
Eliminanmos:  Q_G_3_0_3
Q_G_3_0_3   mejora:  0.0007651109410864443
Eliminanmos:  Q_G_3_0_4
Eliminanmos:  Q_G_3_0_5
Eliminanmos:  Q_G_3_0_6
Q_G_3_0_6   mejora:  0.0002550369803621111
Eliminanmos:  Q_G_3_0_7
Eliminanmos:  Q_G_3_0_8
Eliminanmos:  Q_G_3_0_9
Eliminanmos:  Q_G_3_1_0
Eliminanmos:  Q_B_2_0_0
Q_B_2_0_0   mejora:  0.0012751849018107775
Eliminanmos:  Q_B_2_0_1
Eliminanmos:  Q_B_2_0_2
Eliminanmos:  Q_B_2_0_3
Eliminanmos:  Q_B_2_0_4
Eliminanmos:  Q_B_2_0_5
Eliminanmos:  Q_B_2_0_6
Eliminanmos:  Q_B_2_0_7
Eliminanmos:  Q_B_2_0_8
Eliminanmos:  Q_B_2_0_9
Eliminan

In [124]:
data_geom = data[geom_col].copy()
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, metric='manhattan', weights='distance')
model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_square(data, labels, model)

Punto de partida(todas las características) 0.7689364957918898
Añadimos:  X2 , Resultado:  0.7686814588115277
Añadimos:  Y2 , Resultado:  0.7686814588115277
Añadimos:  Q_R_4_0_02 , Resultado:  0.7658760520275439
Añadimos:  Q_R_4_0_12 , Resultado:  0.7679163478704412
Añadimos:  Q_R_4_0_22 , Resultado:  0.7694465697526142
-------------->  Q_R_4_0_22   mejora:  0.0005100739607243332
Añadimos:  Q_R_4_0_32 , Resultado:  0.7694465697526142
-------------->  Q_R_4_0_32   mejora:  0.0005100739607243332
Añadimos:  Q_R_4_0_42 , Resultado:  0.7668961999489926
Añadimos:  Q_R_4_0_52 , Resultado:  0.764345830145371
Añadimos:  Q_R_4_0_62 , Resultado:  0.7679163478704412
Añadimos:  Q_R_4_0_72 , Resultado:  0.7684264218311655
Añadimos:  Q_R_4_0_82 , Resultado:  0.767661310890079
Añadimos:  Q_R_4_0_92 , Resultado:  0.7651109410864575
Añadimos:  Q_R_4_1_02 , Resultado:  0.7704667176740627
-------------->  Q_R_4_1_02   mejora:  0.0015302218821728886
Añadimos:  Q_G_3_0_02 , Resultado:  0.7646008671257333
Añ

Añadimos:  Q_G_3_0_62 , Resultado:  0.7671512369293547
Añadimos:  Q_G_3_0_72 , Resultado:  0.7702116806937006
Añadimos:  Q_G_3_0_82 , Resultado:  0.7694465697526142
Añadimos:  Q_G_3_0_92 , Resultado:  0.7714868655955114
Añadimos:  Q_G_3_1_02 , Resultado:  0.7689364957918898
Añadimos:  Q_B_2_0_02 , Resultado:  0.7694465697526142
Añadimos:  Q_B_2_0_12 , Resultado:  0.7694465697526142
Añadimos:  Q_B_2_0_22 , Resultado:  0.7699566437133384
Añadimos:  Q_B_2_0_32 , Resultado:  0.7702116806937006
Añadimos:  Q_B_2_0_42 , Resultado:  0.7712318286151492
Añadimos:  Q_B_2_0_52 , Resultado:  0.7697016067329763
Añadimos:  Q_B_2_0_62 , Resultado:  0.7737821984187707
Añadimos:  Q_B_2_0_72 , Resultado:  0.7681713848508034
Añadimos:  Q_B_2_0_82 , Resultado:  0.7702116806937006
Añadimos:  Q_B_2_0_92 , Resultado:  0.7691915327722519
Añadimos:  Q_B_2_1_02 , Resultado:  0.7684264218311655
Añadimos:  Q_NIR_8_0_02 , Resultado:  0.7691915327722519
Añadimos:  Q_NIR_8_0_12 , Resultado:  0.7684264218311655
Añadim

#### Hiperparámetros:

In [19]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RETAIL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

In [20]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 6000, num = 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
print("================== DONE ==================")
print(rf_random.best_params_)
print(rf_random.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 30.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 63.2min finished


{'n_estimators': 5765, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
0.7656210150471819


## OFFICE vs PUBLIC:

In [72]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "PUBLIC"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values
print(Counter(y))
print()
cv_results = cross_validate(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'PUBLIC': 2976, 'OFFICE': 1828})

[0.78589263 0.76202374 0.76702061]
              precision    recall  f1-score   support

      OFFICE      0.711     0.666     0.688      1828
      PUBLIC      0.803     0.833     0.818      2976

    accuracy                          0.770      4804
   macro avg      0.757     0.750     0.753      4804
weighted avg      0.768     0.770     0.768      4804



In [21]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "PUBLIC"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

In [22]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 6000, num = 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
print("================== DONE ==================")
print(rf_random.best_params_)
print(rf_random.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 77.1min finished


{'n_estimators': 4066, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': True}
0.77601766790055


In [4]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "PUBLIC"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_square(data, labels, model)

data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "PUBLIC"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_selection(data, labels, model)

Punto de partida(todas las características) 0.7787260616153205
Añadimos:  X2 , Resultado:  0.7810158201498751
-------------->  X2   mejora:  0.002289758534554598
Añadimos:  Y2 , Resultado:  0.7853871773522065
-------------->  Y2   mejora:  0.006661115736885992
Añadimos:  Q_R_4_0_02 , Resultado:  0.7785179017485429
Añadimos:  Q_R_4_0_12 , Resultado:  0.7803913405495421
-------------->  Q_R_4_0_12   mejora:  0.0016652789342215257
Añadimos:  Q_R_4_0_22 , Resultado:  0.7824729392173189
-------------->  Q_R_4_0_22   mejora:  0.003746877601998322
Añadimos:  Q_R_4_0_32 , Resultado:  0.7783097418817652
Añadimos:  Q_R_4_0_42 , Resultado:  0.7789342214820982
-------------->  Q_R_4_0_42   mejora:  0.00020815986677769072
Añadimos:  Q_R_4_0_52 , Resultado:  0.7818484596169858
-------------->  Q_R_4_0_52   mejora:  0.0031223980016652497
Añadimos:  Q_R_4_0_62 , Resultado:  0.779766860949209
-------------->  Q_R_4_0_62   mejora:  0.0010407993338884536
Añadimos:  Q_R_4_0_72 , Resultado:  0.778726061615

Añadimos:  Q_B_2_1_02 , Resultado:  0.7880932556203164
-------------->  Q_B_2_1_02   mejora:  0.0027060782681098683
Añadimos:  Q_NIR_8_0_02 , Resultado:  0.7864279766860949
-------------->  Q_NIR_8_0_02   mejora:  0.0010407993338883426
Añadimos:  Q_NIR_8_0_12 , Resultado:  0.7862198168193172
-------------->  Q_NIR_8_0_12   mejora:  0.0008326394671106518
Añadimos:  Q_NIR_8_0_22 , Resultado:  0.7843463780183181
Añadimos:  Q_NIR_8_0_32 , Resultado:  0.7855953372189842
-------------->  Q_NIR_8_0_32   mejora:  0.00020815986677769072
Añadimos:  Q_NIR_8_0_42 , Resultado:  0.7880932556203164
-------------->  Q_NIR_8_0_42   mejora:  0.0027060782681098683
Añadimos:  Q_NIR_8_0_52 , Resultado:  0.7826810990840966
Añadimos:  Q_NIR_8_0_62 , Resultado:  0.7855953372189842
-------------->  Q_NIR_8_0_62   mejora:  0.00020815986677769072
Añadimos:  Q_NIR_8_0_72 , Resultado:  0.7835137385512073
Añadimos:  Q_NIR_8_0_82 , Resultado:  0.7851790174854288
Añadimos:  Q_NIR_8_0_92 , Resultado:  0.78913405495420

Añadimos:  Q_NIR_8_0_922 , Resultado:  0.7874687760199833
Añadimos:  Q_B_2_0_322 , Resultado:  0.7874687760199833
Punto de partida(todas las características) 0.7810158201498751
Eliminanmos:  X
Eliminanmos:  Y
Eliminanmos:  Q_R_4_0_0
Eliminanmos:  Q_R_4_0_1
Eliminanmos:  Q_R_4_0_2
Eliminanmos:  Q_R_4_0_3
Eliminanmos:  Q_R_4_0_4
Eliminanmos:  Q_R_4_0_5
Eliminanmos:  Q_R_4_0_6
Eliminanmos:  Q_R_4_0_7
Eliminanmos:  Q_R_4_0_8
Eliminanmos:  Q_R_4_0_9
Eliminanmos:  Q_R_4_1_0
Q_R_4_1_0   mejora:  0.0008326394671106518
Eliminanmos:  Q_G_3_0_0
Eliminanmos:  Q_G_3_0_1
Eliminanmos:  Q_G_3_0_2
Eliminanmos:  Q_G_3_0_3
Eliminanmos:  Q_G_3_0_4
Eliminanmos:  Q_G_3_0_5
Eliminanmos:  Q_G_3_0_6
Eliminanmos:  Q_G_3_0_7
Q_G_3_0_7   mejora:  0.0006244796003330721
Eliminanmos:  Q_G_3_0_8
Eliminanmos:  Q_G_3_0_9
Eliminanmos:  Q_G_3_1_0
Eliminanmos:  Q_B_2_0_0
Eliminanmos:  Q_B_2_0_1
Q_B_2_0_1   mejora:  0.0016652789342214147
Eliminanmos:  Q_B_2_0_2
Eliminanmos:  Q_B_2_0_3
Eliminanmos:  Q_B_2_0_4
Eliminanmos:  

## OFFICE vs OTHER:

In [73]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "OTHER"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values
print(Counter(y))
print()
cv_results = cross_validate(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'OFFICE': 1828, 'OTHER': 1332})

[0.88519924 0.88698955 0.88034188]
              precision    recall  f1-score   support

      OFFICE      0.892     0.902     0.897      1828
       OTHER      0.863     0.851     0.857      1332

    accuracy                          0.880      3160
   macro avg      0.878     0.876     0.877      3160
weighted avg      0.880     0.880     0.880      3160



In [23]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "OTHER"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

In [24]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 6000, num = 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
print("================== DONE ==================")
print(rf_random.best_params_)
print(rf_random.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 50.3min finished


{'n_estimators': 4828, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
0.8882915173237754


In [5]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "OTHER"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_square(data, labels, model)

data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "OTHER"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_selection(data, labels, model)

Punto de partida(todas las características) 0.8882911392405063
Añadimos:  X2 , Resultado:  0.884493670886076
Añadimos:  Y2 , Resultado:  0.885759493670886
Añadimos:  Q_R_4_0_02 , Resultado:  0.8863924050632911
Añadimos:  Q_R_4_0_12 , Resultado:  0.8863924050632911
Añadimos:  Q_R_4_0_22 , Resultado:  0.8860759493670886
Añadimos:  Q_R_4_0_32 , Resultado:  0.8860759493670886
Añadimos:  Q_R_4_0_42 , Resultado:  0.8873417721518987
Añadimos:  Q_R_4_0_52 , Resultado:  0.8860759493670886
Añadimos:  Q_R_4_0_62 , Resultado:  0.8876582278481012
Añadimos:  Q_R_4_0_72 , Resultado:  0.8854430379746835
Añadimos:  Q_R_4_0_82 , Resultado:  0.8867088607594936
Añadimos:  Q_R_4_0_92 , Resultado:  0.8863924050632911
Añadimos:  Q_R_4_1_02 , Resultado:  0.8863924050632911
Añadimos:  Q_G_3_0_02 , Resultado:  0.8863924050632911
Añadimos:  Q_G_3_0_12 , Resultado:  0.884493670886076
Añadimos:  Q_G_3_0_22 , Resultado:  0.8870253164556962
Añadimos:  Q_G_3_0_32 , Resultado:  0.884493670886076
Añadimos:  Q_G_3_0_42 

Añadimos:  Q_B_2_0_52 , Resultado:  0.8863924050632911
Añadimos:  Q_B_2_0_62 , Resultado:  0.8863924050632911
Añadimos:  Q_B_2_0_72 , Resultado:  0.8873417721518987
Añadimos:  Q_B_2_0_82 , Resultado:  0.8848101265822785
Añadimos:  Q_B_2_0_92 , Resultado:  0.8854430379746835
Añadimos:  Q_B_2_1_02 , Resultado:  0.8851265822784811
Añadimos:  Q_NIR_8_0_02 , Resultado:  0.8860759493670886
Añadimos:  Q_NIR_8_0_12 , Resultado:  0.8873417721518987
Añadimos:  Q_NIR_8_0_22 , Resultado:  0.8867088607594936
Añadimos:  Q_NIR_8_0_32 , Resultado:  0.8876582278481012
Añadimos:  Q_NIR_8_0_42 , Resultado:  0.8848101265822785
Añadimos:  Q_NIR_8_0_52 , Resultado:  0.8870253164556962
Añadimos:  Q_NIR_8_0_62 , Resultado:  0.8851265822784811
Añadimos:  Q_NIR_8_0_72 , Resultado:  0.8854430379746835
Añadimos:  Q_NIR_8_0_82 , Resultado:  0.8886075949367088
Añadimos:  Q_NIR_8_0_92 , Resultado:  0.8886075949367088
Añadimos:  Q_NIR_8_1_02 , Resultado:  0.8867088607594936
Añadimos:  AREA2 , Resultado:  0.8848101265

Eliminanmos:  Q_G_3_0_5
Eliminanmos:  Q_G_3_0_6
Eliminanmos:  Q_G_3_0_7
Eliminanmos:  Q_G_3_0_8
Eliminanmos:  Q_G_3_0_9
Eliminanmos:  Q_G_3_1_0
Eliminanmos:  Q_B_2_0_0
Eliminanmos:  Q_B_2_0_1
Eliminanmos:  Q_B_2_0_3
Eliminanmos:  Q_B_2_0_4
Eliminanmos:  Q_B_2_0_5
Eliminanmos:  Q_B_2_0_6
Eliminanmos:  Q_B_2_0_7
Eliminanmos:  Q_B_2_0_8
Eliminanmos:  Q_B_2_0_9
Eliminanmos:  Q_B_2_1_0
Q_B_2_1_0   mejora:  0.00031645569620253333
Eliminanmos:  Q_NIR_8_0_0
Eliminanmos:  Q_NIR_8_0_1
Eliminanmos:  Q_NIR_8_0_2
Eliminanmos:  Q_NIR_8_0_3
Eliminanmos:  Q_NIR_8_0_4
Eliminanmos:  Q_NIR_8_0_5
Eliminanmos:  Q_NIR_8_0_6
Eliminanmos:  Q_NIR_8_0_7
Eliminanmos:  Q_NIR_8_0_8
Eliminanmos:  Q_NIR_8_0_9
Eliminanmos:  Q_NIR_8_1_0
Eliminanmos:  AREA
Eliminanmos:  GEOM_R1
Eliminanmos:  GEOM_R2
Eliminanmos:  GEOM_R3
Eliminanmos:  GEOM_R4
Eliminanmos:  CONTRUCTIONYEAR
Eliminanmos:  MAXBUILDINGFLOOR
Eliminanmos:  CADASTRALQUALITYID
Eliminanmos:  X
Eliminanmos:  Y
Eliminanmos:  Q_R_4_0_0
Eliminanmos:  Q_R_4_0_1
Elimi

## OFFICE vs INDUSTRIAL:

In [74]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "INDUSTRIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values
print(Counter(y))
print()
cv_results = cross_validate(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'INDUSTRIAL': 4490, 'OFFICE': 1828})

[0.82526116 0.83238367 0.81528965]
              precision    recall  f1-score   support

  INDUSTRIAL      0.855     0.905     0.879      4490
      OFFICE      0.728     0.622     0.671      1828

    accuracy                          0.823      6318
   macro avg      0.791     0.764     0.775      6318
weighted avg      0.818     0.823     0.819      6318



In [25]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "INDUSTRIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

In [26]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 6000, num = 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
print("================== DONE ==================")
print(rf_random.best_params_)
print(rf_random.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.0min


KeyboardInterrupt: 

In [6]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "INDUSTRIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_square(data, labels, model)

data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "INDUSTRIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_selection(data, labels, model)

Punto de partida(todas las características) 0.8262108262108262
Añadimos:  X2 , Resultado:  0.8255777144666033
Añadimos:  Y2 , Resultado:  0.8258942703387148
Añadimos:  Q_R_4_0_02 , Resultado:  0.8239949351060463
Añadimos:  Q_R_4_0_12 , Resultado:  0.8236783792339348
Añadimos:  Q_R_4_0_22 , Resultado:  0.8263691041468819
-------------->  Q_R_4_0_22   mejora:  0.00015827793605571738
Añadimos:  Q_R_4_0_32 , Resultado:  0.8247863247863247
Añadimos:  Q_R_4_0_42 , Resultado:  0.8247863247863247
Añadimos:  Q_R_4_0_52 , Resultado:  0.8252611585944919
Añadimos:  Q_R_4_0_62 , Resultado:  0.8243114909781576
Añadimos:  Q_R_4_0_72 , Resultado:  0.825735992402659
Añadimos:  Q_R_4_0_82 , Resultado:  0.8251028806584362
Añadimos:  Q_R_4_0_92 , Resultado:  0.8260525482747705
Añadimos:  Q_R_4_1_02 , Resultado:  0.8260525482747705
Añadimos:  Q_G_3_0_02 , Resultado:  0.8263691041468819
-------------->  Q_G_3_0_02   mejora:  0.00015827793605571738
Añadimos:  Q_G_3_0_12 , Resultado:  0.824628046850269
Añadim

Q_NIR_8_0_7   mejora:  0.00015827793605571738
Eliminanmos:  Q_NIR_8_0_8
Q_NIR_8_0_8   mejora:  0.002849002849002913
Eliminanmos:  Q_NIR_8_0_9
Eliminanmos:  Q_NIR_8_1_0
Q_NIR_8_1_0   mejora:  0.0015827793605571738
Eliminanmos:  AREA
Eliminanmos:  GEOM_R1
Eliminanmos:  GEOM_R2
GEOM_R2   mejora:  0.00031655587211143477
Eliminanmos:  GEOM_R3
GEOM_R3   mejora:  0.0006331117442228695
Eliminanmos:  GEOM_R4
Eliminanmos:  CONTRUCTIONYEAR
Eliminanmos:  MAXBUILDINGFLOOR
Eliminanmos:  CADASTRALQUALITYID
Eliminanmos:  X
Eliminanmos:  Y
Eliminanmos:  Q_R_4_0_0
Eliminanmos:  Q_R_4_0_1
Eliminanmos:  Q_R_4_0_2
Eliminanmos:  Q_R_4_0_3
Eliminanmos:  Q_R_4_0_4
Eliminanmos:  Q_R_4_0_5
Eliminanmos:  Q_R_4_0_6
Eliminanmos:  Q_R_4_0_7
Eliminanmos:  Q_R_4_0_8
Eliminanmos:  Q_R_4_0_9
Eliminanmos:  Q_R_4_1_0
Eliminanmos:  Q_G_3_0_0
Eliminanmos:  Q_G_3_0_1
Eliminanmos:  Q_G_3_0_2
Eliminanmos:  Q_G_3_0_3
Eliminanmos:  Q_G_3_0_4
Eliminanmos:  Q_G_3_0_5
Eliminanmos:  Q_G_3_0_6
Eliminanmos:  Q_G_3_0_7
Eliminanmos:  Q

## OFFICE vs AGRICULTURE:

In [75]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "AGRICULTURE"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values
print(Counter(y))
print()
cv_results = cross_validate(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'OFFICE': 1828, 'AGRICULTURE': 338})

[0.97783934 0.96952909 0.97506925]
              precision    recall  f1-score   support

 AGRICULTURE      0.950     0.905     0.927       338
      OFFICE      0.983     0.991     0.987      1828

    accuracy                          0.978      2166
   macro avg      0.966     0.948     0.957      2166
weighted avg      0.978     0.978     0.978      2166



In [None]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "AGRICULTURE"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 6000, num = 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
print("================== DONE ==================")
print(rf_random.best_params_)
print(rf_random.best_score_)

In [7]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "AGRICULTURE"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_square(data, labels, model)

data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "AGRICULTURE"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_selection(data, labels, model)

Punto de partida(todas las características) 0.9773776546629732
Añadimos:  X2 , Resultado:  0.9778393351800554
-------------->  X2   mejora:  0.0004616805170821747
Añadimos:  Y2 , Resultado:  0.9792243767313019
-------------->  Y2   mejora:  0.001846722068328699
Añadimos:  Q_R_4_0_02 , Resultado:  0.976915974145891
Añadimos:  Q_R_4_0_12 , Resultado:  0.9764542936288089
Añadimos:  Q_R_4_0_22 , Resultado:  0.9778393351800554
-------------->  Q_R_4_0_22   mejora:  0.0004616805170821747
Añadimos:  Q_R_4_0_32 , Resultado:  0.9773776546629732
Añadimos:  Q_R_4_0_42 , Resultado:  0.976915974145891
Añadimos:  Q_R_4_0_52 , Resultado:  0.976915974145891
Añadimos:  Q_R_4_0_62 , Resultado:  0.9773776546629732
Añadimos:  Q_R_4_0_72 , Resultado:  0.9759926131117267
Añadimos:  Q_R_4_0_82 , Resultado:  0.9783010156971376
-------------->  Q_R_4_0_82   mejora:  0.0009233610341643494
Añadimos:  Q_R_4_0_92 , Resultado:  0.9773776546629732
Añadimos:  Q_R_4_1_02 , Resultado:  0.9778393351800554
--------------

Añadimos:  Q_R_4_0_12 , Resultado:  0.9801477377654663
Añadimos:  Q_R_4_0_22 , Resultado:  0.9810710987996306
Añadimos:  Q_R_4_0_32 , Resultado:  0.9801477377654663
Añadimos:  Q_R_4_0_42 , Resultado:  0.9796860572483841
Añadimos:  Q_R_4_0_52 , Resultado:  0.9810710987996306
Añadimos:  Q_R_4_0_62 , Resultado:  0.9806094182825484
Añadimos:  Q_R_4_0_72 , Resultado:  0.9806094182825484
Añadimos:  Q_R_4_0_82 , Resultado:  0.9792243767313019
Añadimos:  Q_R_4_0_92 , Resultado:  0.9801477377654663
Añadimos:  Q_R_4_1_02 , Resultado:  0.9806094182825484
Añadimos:  Q_G_3_0_02 , Resultado:  0.9810710987996306
Añadimos:  Q_G_3_0_12 , Resultado:  0.9801477377654663
Añadimos:  Q_G_3_0_22 , Resultado:  0.9801477377654663
Añadimos:  Q_G_3_0_32 , Resultado:  0.9806094182825484
Añadimos:  Q_G_3_0_42 , Resultado:  0.9801477377654663
Añadimos:  Q_G_3_0_52 , Resultado:  0.9806094182825484
Añadimos:  Q_G_3_0_62 , Resultado:  0.9806094182825484
Añadimos:  Q_G_3_0_72 , Resultado:  0.9810710987996306
Añadimos: 

Eliminanmos:  Q_R_4_0_3
Eliminanmos:  Q_R_4_0_4
Eliminanmos:  Q_R_4_0_5
Eliminanmos:  Q_R_4_0_6
Eliminanmos:  Q_R_4_0_7
Eliminanmos:  Q_R_4_0_8
Eliminanmos:  Q_R_4_0_9
Eliminanmos:  Q_R_4_1_0
Eliminanmos:  Q_G_3_0_0
Eliminanmos:  Q_G_3_0_1
Eliminanmos:  Q_G_3_0_3
Eliminanmos:  Q_G_3_0_4
Eliminanmos:  Q_G_3_0_5
Eliminanmos:  Q_G_3_0_6
Eliminanmos:  Q_G_3_0_7
Eliminanmos:  Q_G_3_0_8
Eliminanmos:  Q_G_3_0_9
Eliminanmos:  Q_G_3_1_0
Eliminanmos:  Q_B_2_0_0
Eliminanmos:  Q_B_2_0_1
Eliminanmos:  Q_B_2_0_2
Eliminanmos:  Q_B_2_0_3
Eliminanmos:  Q_B_2_0_4
Eliminanmos:  Q_B_2_0_5
Eliminanmos:  Q_B_2_0_6
Eliminanmos:  Q_B_2_0_7
Eliminanmos:  Q_B_2_0_8
Eliminanmos:  Q_B_2_0_9
Eliminanmos:  Q_B_2_1_0
Eliminanmos:  Q_NIR_8_0_0
Eliminanmos:  Q_NIR_8_0_1
Eliminanmos:  Q_NIR_8_0_2
Eliminanmos:  Q_NIR_8_0_3
Eliminanmos:  Q_NIR_8_0_4
Eliminanmos:  Q_NIR_8_0_5
Eliminanmos:  Q_NIR_8_0_6
Eliminanmos:  Q_NIR_8_0_7
Eliminanmos:  Q_NIR_8_0_8
Eliminanmos:  Q_NIR_8_0_9
Eliminanmos:  Q_NIR_8_1_0
Eliminanmos:  AREA

## OFFICE vs RESIDENTIAL:

In [76]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RESIDENTIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values
print(Counter(y))
print()
cv_results = cross_validate(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(cv_results['test_score'])
cv_pred = cross_val_predict(RandomForestClassifier(n_jobs=-1), X, y, cv=3)
print(classification_report(y, cv_pred, digits=3))

Counter({'RESIDENTIAL': 90173, 'OFFICE': 1828})

[0.98385887 0.98418495 0.98392409]
              precision    recall  f1-score   support

      OFFICE      0.799     0.251     0.382      1828
 RESIDENTIAL      0.985     0.999     0.992     90173

    accuracy                          0.984     92001
   macro avg      0.892     0.625     0.687     92001
weighted avg      0.981     0.984     0.980     92001



In [8]:
data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RESIDENTIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_square(data, labels, model)

data = pd.concat([_data,_labels], axis=1)
data_1 = data[data.CLASE == "OFFICE"]
data_2 = data[data.CLASE == "RESIDENTIAL"]
data = pd.concat([data_1, data_2])
labels = data.CLASE
data = data.drop('CLASE', axis=1)
X = data.values
y = labels.values

model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
feature_selection(data, labels, model)

Punto de partida(todas las características) 0.9842719100879338
Añadimos:  X2 , Resultado:  0.9843045184291475
-------------->  X2   mejora:  3.2608341213657965e-05
Añadimos:  Y2 , Resultado:  0.9842393017467201
Añadimos:  Q_R_4_0_02 , Resultado:  0.9841088683818654
Añadimos:  Q_R_4_0_12 , Resultado:  0.9841740850642928
Añadimos:  Q_R_4_0_22 , Resultado:  0.9843045184291475
-------------->  Q_R_4_0_22   mejora:  3.2608341213657965e-05
Añadimos:  Q_R_4_0_32 , Resultado:  0.9842066934055065


KeyboardInterrupt: 