In [1]:
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import EditedNearestNeighbours
import numpy as np
import seaborn as sns
import pandas as pd



In [2]:
data = pd.read_csv('/home/jose/Escritorio/datathon/src/data/train.txt', sep='|', index_col='ID')
# test = pd.read_csv('/home/jose/Escritorio/datathon/src/data/test.txt', sep='|', index_col='ID')

labels = data.iloc[:, -1].map({
    'RESIDENTIAL': -1,
    'INDUSTRIAL': -1,
    'PUBLIC': -1,
    'OFFICE': -1,
    'RETAIL': -1,
    'AGRICULTURE': 1,
    'OTHER': -1,
})
data.drop('CLASE', axis=1, inplace=True)

train, test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=123)

data = pd.concat([train, test], sort=False)

data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].map({'9': '0',
                               '8': '1',
                               '7': '2',
                               '6': '3',
                               '5': '4',
                               '4': '5',
                               '3': '6',
                               '2': '7',
                               '1': '8',
                               'C': '9',
                               'B': '10',
                               'A': '11',})
data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].astype('category')

In [3]:
data['MAXBUILDINGFLOOR'].fillna(data['MAXBUILDINGFLOOR'].median(), inplace=True)
data['CADASTRALQUALITYID'].fillna(data['CADASTRALQUALITYID'].mode()[0], inplace=True)

data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].astype(np.int)

In [4]:
sc = StandardScaler()
data = pd.concat([pd.DataFrame(sc.fit_transform(data.select_dtypes(['number'])),
                              index=data.index,
                              columns=data.select_dtypes(['number']).columns),
                data.select_dtypes(['category', 'object'])], axis=1, sort=False)

train, test = data.iloc[:train.shape[0], ], data.iloc[train.shape[0]:, ]

In [9]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=600)

model.fit(train, y_train)
y_pred = model.predict(test)

In [10]:
accuracy_score(y_test, y_pred)

0.997965707643127

In [11]:
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

          -1     0.9983    0.9996    0.9990     20566
           1     0.8519    0.5750    0.6866        80

    accuracy                         0.9980     20646
   macro avg     0.9251    0.7873    0.8428     20646
weighted avg     0.9978    0.9980    0.9978     20646



In [8]:
print(confusion_matrix(y_test, y_pred))

[[20558     8]
 [   34    46]]


In [None]:
params = False

if params:
    params = {'n_neighbors': [1,3,5,7,11,13,15,21],
             'weights': ['uniform', 'distance'],
             'metric': ['minkowski', 'manhattan'],
             'n_jobs': [-1]}

    model = KNeighborsClassifier()

    grid = RandomizedSearchCV(model, params, cv=5, scoring='f1', n_jobs=-1)

    grid.fit(train, y_train)
    
    best_params = grid.best_params_
    
    model = grid
    
#     model = KNeighborsClassifier(**best_params)
#     model.fit(train, y_train)
    
    print(best_params)
else:
    # datos con ruido
    best_params = {'metric': 'manhattan', 'n_jobs': -1, 'n_neighbors': 1, 'weights': 'distance'}
    # datos sin ruido
#     best_params = {'metric': 'manhattan', 'n_jobs': -1, 'n_neighbors': 1, 'weights': 'uniform'}
    model = KNeighborsClassifier(**best_params)
    model.fit(train, y_train)

In [None]:
y_pred = model.predict(test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, digits=4))

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
# data = pd.concat([train,test], sort=False)
# data['CADASTRALQUALITYID'] = data['CADASTRALQUALITYID'].astype(np.int)
# ncol = sc.fit_transform((data['CADASTRALQUALITYID'].values).reshape(-1,1))
# data['CADASTRALQUALITYID'] = ncol

# train, test = data.iloc[:train.shape[0], ], data.iloc[train.shape[0]:, ]

In [None]:
# params = False

# if params:
#     params = {'max_depth': [5,10,15],
#              'learning_rate': np.linspace(0.001, 0.15, 6),
#              'n_jobs': [-1],
#               'gamma': np.linspace(0,1,4),
#               'min_child_weight': [1,2,3],
#               'n_estimators': [100,400,800],
#              'random_state': [123]}

#     model = XGBClassifier()

#     grid = RandomizedSearchCV(model, params, cv=5, scoring=make_scorer(f1_score), n_jobs=-1)

#     grid.fit(train, y_train)
    
#     best_params = grid.best_params_
    
#     model = XGBClassifier(**best_params)
#     model.fit(train, y_train)
    
#     print(best_params)
# else:
#     best_params = {'random_state': 123, 'n_jobs': -1, 'n_estimators': 800, 'min_child_weight': 2, 'max_depth': 5, 'learning_rate': 0.1202, 'gamma': 1.0}
#     best_params = {'random_state': 123, 'n_jobs': -1, 'n_estimators': 1000, 'max_depth': 20}
#     model = XGBClassifier(**best_params)
#     model.fit(train, y_train)

In [None]:
# accuracy_score(y_test, y_pred)

In [None]:
# print(classification_report(y_test, y_pred, digits=4))

In [None]:
# print(confusion_matrix(y_test, y_pred))