# Librerías

In [None]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours
from scipy.optimize import differential_evolution
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Datos

Se cargan los datos de train y de test y se separa el train de las etiquetas.

In [None]:
# Read the pipe-separated train and test files, using the ID column as index.
train = pd.read_csv('/home/jose/Escritorio/datathon/src/data/train.txt', sep='|', index_col='ID')
test = pd.read_csv('/home/jose/Escritorio/datathon/src/data/test.txt', sep='|', index_col='ID')

# The labels are taken from the last column of the train frame; the CLASE
# column is then removed so that `train` only holds features.
# NOTE(review): this presumes CLASE is the last column of train.txt — confirm.
labels = train.iloc[:, -1]
train = train.drop(columns='CLASE')

Creamos distintas funciones para el preprocesado

In [None]:
def prepare_data(pdata):
    """Recode CADASTRALQUALITYID and store it as a categorical column.

    The codes '9', '8', ..., '1' are mapped to '0', '1', ..., '8' and the
    letter codes 'C', 'B', 'A' to '9', '10', '11' (all as strings). Any value
    outside that table becomes NaN, as `Series.map` does for unmapped keys.

    Parameters
    ----------
    pdata : pandas.DataFrame
        Frame containing a 'CADASTRALQUALITYID' column.

    Returns
    -------
    pandas.DataFrame
        A copy of `pdata` with the recoded categorical column; the input frame
        is left untouched.
    """
    column = 'CADASTRALQUALITYID'

    # Source codes in the order of their new rank: position k maps to str(k).
    recode = dict(zip('987654321CBA', (str(rank) for rank in range(12))))

    result = pdata.copy()
    recoded = result[column].map(recode)
    result[column] = recoded.astype('category')
    return result


def fillna(pdata):
    """Impute missing values in MAXBUILDINGFLOOR and CADASTRALQUALITYID.

    MAXBUILDINGFLOOR is filled with the column median and CADASTRALQUALITYID
    with its most frequent value (the first entry returned by `mode()`).

    Parameters
    ----------
    pdata : pandas.DataFrame
        Frame containing both columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `pdata` with the missing values filled; the input frame is
        left untouched.
    """
    data = pdata.copy()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained in-place form is deprecated and does not
    # modify `data` when pandas Copy-on-Write is active.
    data['MAXBUILDINGFLOOR'] = data['MAXBUILDINGFLOOR'].fillna(data['MAXBUILDINGFLOOR'].median())
    data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].fillna(data['CADASTRALQUALITYID'].mode()[0])

    return data


def to_numeric(pdata):
    """Cast the CADASTRALQUALITYID column to integers.

    Parameters
    ----------
    pdata : pandas.DataFrame
        Frame whose 'CADASTRALQUALITYID' column holds values convertible to
        int (no missing values).

    Returns
    -------
    pandas.DataFrame
        A copy of `pdata` with the column converted; the input frame is left
        untouched.
    """
    data = pdata.copy()

    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin yields the same dtype on every NumPy version.
    data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].astype(int)

    return data

Ejecutamos las funciones sobre todo el conjunto y volvemos a dividirlo en train y test

In [None]:
# Remember how many rows belong to train so the combined frame can be split
# back after preprocessing.
n_train = train.shape[0]

# Stack train and test (train rows first) so that the recoding and the
# imputation statistics are computed over the whole data set.
data = pd.concat([train, test], sort=False)

data = prepare_data(data)
data = fillna(data)
data = to_numeric(data)

# The first n_train rows are the train set, the remaining ones the test set.
train, test = data.iloc[:n_train], data.iloc[n_train:]

# Creación de los mejores modelos obtenidos

In [None]:
def _build_pipeline(clf):
    """Return a scale -> EditedNearestNeighbours -> classifier pipeline.

    Every model shares the same two preprocessing steps; only the final
    estimator `clf` (stored under the step name 'clf') differs.
    """
    return Pipeline([('scl', StandardScaler()),
                     ('enn', EditedNearestNeighbours(sampling_strategy='majority')),
                     ('clf', clf)])


# One pipeline per class label, keyed by the label name. This must be a plain
# dict (a single pair of braces) so that it can be indexed by label.
models = {
    'RESIDENTIAL': _build_pipeline(XGBClassifier(n_estimators=500, max_depth=15, learning_rate=0.1)),
    'INDUSTRIAL': _build_pipeline(XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.1)),
    'PUBLIC': _build_pipeline(XGBClassifier(n_estimators=1000, max_depth=15, learning_rate=0.15)),
    'OFFICE': _build_pipeline(XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.1)),
    'RETAIL': _build_pipeline(XGBClassifier(n_estimators=1000, max_depth=15, learning_rate=0.1)),
    'AGRICULTURE': _build_pipeline(KNeighborsClassifier(weights='distance', n_neighbors=3, metric='manhattan')),
    'OTHER': _build_pipeline(XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.1)),
}

# Creación del fichero de salida

Una vez creados los modelos, entrenamos cada uno de ellos con su clase frente al resto y asignamos a cada muestra la clase cuyo modelo devuelve la mayor probabilidad.

In [None]:
# Sorted unique class names; the position of each name is the row index of its
# probabilities in y_pred_label.
labels_names = np.unique(labels)

y_pred_label = []
for label in labels_names:
    print('Load %s model:' % label)

    model = models[label]

    # Binary target for this label against all the others. RESIDENTIAL is
    # encoded with the opposite polarity of the other labels (it gets the
    # lower code). Codes are 0/1 because recent XGBoost releases reject
    # targets that are not 0..n_classes-1; predict_proba columns follow the
    # sorted class codes, so the code of the current label is also the column
    # holding its probability.
    label_code = 0 if label == 'RESIDENTIAL' else 1
    y_train = np.where(labels == label, label_code, 1 - label_code)

    print('Training...')
    model.fit(train, y_train)

    # Score the test set: these probabilities decide the submitted classes.
    pred_proba = model.predict_proba(test)
    y_pred_label.append(pred_proba[:, label_code])

# For every sample keep the label whose model gives the highest probability.
y_pred = labels_names[np.argmax(y_pred_label, axis=0)]

# Calibración de la salida

In [None]:
def predict2(i, X):
    """Return, for each row of X, the column index with the highest shifted score.

    `i` is reshaped into a single row of per-column offsets, added to every
    row of `X`, and the argmax is taken along axis 1.
    """
    offsets = i.reshape(1, -1)
    shifted = X + offsets
    return np.argmax(shifted, axis=1)

def eval_sol2(i, X, y, y_weights):
    """Return the negated weighted accuracy of `predict2(i, X)` against `y`.

    The sign is flipped so that a lower value means a better solution.
    `y_weights` is forwarded to `accuracy_score` as `sample_weight`.
    """
    predicted = predict2(i, X)
    score = accuracy_score(y, predicted, sample_weight=y_weights)
    return -score

# NOTE(review): X, y, y_weights, le and progressbar are not defined or imported
# in the visible cells of this notebook — confirm where they come from before
# running this cell.

# Materialise the stratified, shuffled folds as a list of (train_idx, test_idx)
# pairs. No random_state is passed, so the folds differ between runs.
cv = list(StratifiedKFold(shuffle=True).split(X,y))

# Predictions, filled fold by fold at the held-out indices.
ypred2 = np.zeros(y.shape)
for tr_idx,ts_idx in progressbar.progressbar(cv):
    # Search, on this fold's training indices, for one value in [-1, 1] per
    # entry of le.classes_ that minimises eval_sol2.
    de = differential_evolution(eval_sol2,[(-1,1)]*len(le.classes_),popsize=30, tol=1e-4, workers=-1,args=[X[tr_idx],y[tr_idx],y_weights[tr_idx]])
    # Apply the solution found to the held-out rows of this fold.
    ypred2[ts_idx] = predict2(de.x,X[ts_idx])

# Creación del fichero de salida

In [None]:
# NOTE(review): y_pred2 is not assigned in the visible cells (they define
# y_pred and ypred2) — confirm which predictions are meant to be submitted.
submit = dict(ID=test.index, CLASE=y_pred2)
df_submit = pd.DataFrame(submit)

# Pipe-separated file with the ID and CLASE columns, without the frame index.
df_submit.to_csv('predictions/UGR_Code Digger.txt', sep='|', index=False)