# Classification

In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import is_object_dtype

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import sklearn.metrics
import xgboost as xgb

from sklearn.inspection import permutation_importance

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN

from helper_functions import preprocessing
from helper_functions.metrics import *

In [2]:
csv_data = pd.read_csv('immo_dev_data.csv')
csv_data

Unnamed: 0,Id,AreaLiving,AreaProperty,BuiltYear,FloorNumber,ForestDensityL,ForestDensityM,ForestDensityS,GroupNameDe,HouseObject,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,location_has_street,location_is_complete,PurchasePrice
0,7135329,140.0,501.0,2016,,0.418964,0.555985,0.730714,Haus,True,...,1358.0,3.660512,8.73,17.0,162.0,358.0,537.0,0,0,745000.0
1,7170979,143.0,277.0,2004,,0.033259,0.074061,0.076468,Haus,True,...,3476.0,3.634717,6.13,0.0,2250.0,2787.0,5041.0,1,1,780000.0
2,7172246,160.0,712.0,1945,,0.000000,0.000000,0.000000,Haus,True,...,2806.0,2.512344,9.79,167.0,1694.0,1138.0,2999.0,0,0,570000.0
3,7172252,351.0,496.0,2016,,0.037575,0.000000,0.000000,Haus,True,...,131.0,1.734104,9.15,12.0,10.0,17.0,39.0,0,0,920000.0
4,7172733,400.0,1800.0,1975,,0.095162,0.097193,0.153314,Haus,True,...,1181.0,1.056052,2.97,0.0,27.0,701.0,732.0,0,0,3950000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153622,42175297,116.0,0.0,2005,,0.002560,0.000000,0.000000,Attikawohnung,False,...,9207.0,9.211883,3.09,0.0,3838.0,20507.0,24347.0,0,0,1700000.0
153623,42175302,70.0,0.0,1978,,0.019577,0.041224,0.001321,Wohnung,False,...,5511.0,6.205661,3.18,8.0,1328.0,4433.0,5769.0,0,0,720000.0
153624,42175305,136.0,0.0,1997,2.0,0.068206,0.058172,0.000000,Wohnung,False,...,178.0,0.000000,8.68,37.0,27.0,44.0,108.0,0,0,725000.0
153625,42175727,60.0,0.0,2009,,0.098870,0.020076,0.000000,Wohnung,False,...,2700.0,0.872739,4.90,282.0,396.0,1589.0,2267.0,0,0,289000.0


In [3]:
#train_data = csv_data.sample(50000, random_state = 5)[['GroupNameDe', 'AreaLiving', 'PurchasePrice']].reset_index()
train_data = csv_data[['AreaLiving', 'AreaProperty', 'BuiltYear', 'FloorNumber', 'GroupNameDe', 'Latitude', 'Longitude', 'PurchasePrice', 'Rooms']].reset_index()
train_data['AreaLiving'] = train_data['AreaLiving'].round(decimals = 0).astype('int32')
train_data['AreaProperty'] = train_data['AreaProperty'].round(decimals = 0).astype('int32')
train_data['Latitude'] = train_data['Latitude'].round(decimals = 2)
train_data['Longitude'] = train_data['Longitude'].round(decimals = 2)
train_data['PurchasePrice'] = train_data['PurchasePrice'].round(decimals = 0).astype('int32')
train_data.head()

Unnamed: 0,index,AreaLiving,AreaProperty,BuiltYear,FloorNumber,GroupNameDe,Latitude,Longitude,PurchasePrice,Rooms
0,0,140,501,2016,,Haus,47.32,7.85,745000,4.5
1,1,143,277,2004,,Haus,47.39,8.07,780000,5.5
2,2,160,712,1945,,Haus,47.34,7.24,570000,7.0
3,3,351,496,2016,,Haus,46.5,6.31,920000,5.5
4,4,400,1800,1975,,Haus,46.24,6.15,3950000,8.0


In [4]:
train_data['GroupNameDe'].value_counts()

Wohnung              77499
Haus                 50265
Doppelhaus            8397
Attikawohnung         6215
Maisonettewohnung     5517
Reihenhaus            4168
Terassenhaus           533
Loftwohnung            474
Mehrfamilienhaus       444
Zimmer                  89
Anderes Haus            26
Name: GroupNameDe, dtype: int64

Just use a few features for the start

In [5]:
prepro = preprocessing.preprocessor(train_data, y_var='GroupNameDe')#, cols_to_drop=['StreetAndNr', 'LastUpdate', 'Locality', 'Id', 'Zip'])#, 'Id'])
prepro.preprocess()

X_train = prepro.X_train
X_test = prepro.X_test

y_train = prepro.y_train
y_test = prepro.y_test

Columns dropped to create X:  []


In [6]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Metrics

## Accuracy 

Anteil korrekt klassifiziert

## Precision, recall and F-1 score

- precision is $P = \frac{T_p}{T_p + F_p}$
    - how many of the as positive classified are actually positive
- recall $R = \frac{T_p}{T_p + F_n}$
    - how many of all positive are classified as positive
- F1-Score $F1 = 2 \frac{P \times R}{P + R}$ aka harmonic mean of $P$ and $R$

We're going to focus our analysis on improving the mean F1-Score. It's not clear yet if we take the `micro` or the `macro` average

# K Nearest Neighbor

In [7]:
knn = KNeighborsClassifier()
knn.fit(X = X_train, y = y_train)

KNeighborsClassifier()

In [8]:
knn_y_pred = knn.predict(X = X_test)

KeyboardInterrupt: 

In [None]:
print('f1 micro: ', sklearn.metrics.f1_score(y_test, knn_y_pred, average = 'micro'))
print('f1 macro: ', sklearn.metrics.f1_score(y_test, knn_y_pred, average = 'macro'))
print('f1 weighted: ', sklearn.metrics.f1_score(y_test, knn_y_pred, average = 'weighted'))

In [None]:
print(sklearn.metrics.classification_report(y_test, knn_y_pred))

In [None]:
#sklearn.inspection.permutation_importance(knn, X_train, y_train, scoring=None, n_repeats=1, n_jobs=None, random_state=1)

# Random Forest

In [None]:
rfc = RandomForestClassifier(n_estimators = 200, bootstrap = True)
rfc.fit(X_train, y_train)

In [None]:
rfc_y_pred = rfc.predict(X = X_test)

In [None]:
# new
print('f1 micro: ', sklearn.metrics.f1_score(y_test, rfc_y_pred, average = 'micro'))
print('f1 macro: ', sklearn.metrics.f1_score(y_test, rfc_y_pred, average = 'macro'))
print('f1 weighted: ', sklearn.metrics.f1_score(y_test, rfc_y_pred, average = 'weighted'))

In [None]:
y_resampled

In [None]:
sklearn.metrics.classification

### Resampled data

In [None]:
rfc_sample = RandomForestClassifier()
rfc_sample.fit(X_resampled, y_resampled)

In [None]:
rfc_sample_y_pred = rfc_sample.predict(X = X_test)

In [None]:
# new
print('f1 micro: ', sklearn.metrics.f1_score(y_test, rfc_sample_y_pred, average = 'micro'))
print('f1 macro: ', sklearn.metrics.f1_score(y_test, rfc_sample_y_pred, average = 'macro'))
print('f1 weighted: ', sklearn.metrics.f1_score(y_test, rfc_sample_y_pred, average = 'weighted'))

### SMOTE

In [None]:
X_smote, y_smote = SMOTE().fit_resample(X_train, y_train)

In [None]:
rfc_smote = RandomForestClassifier(n_estimators = 200)
rfc_smote.fit(X_smote, y_smote)

In [None]:
rfc_smote_y_pred = rfc_smote.predict(X = X_test)

In [None]:
# new
print('f1 micro: ', sklearn.metrics.f1_score(y_test, rfc_smote_y_pred, average = 'micro'))
print('f1 macro: ', sklearn.metrics.f1_score(y_test, rfc_smote_y_pred, average = 'macro'))
print('f1 weighted: ', sklearn.metrics.f1_score(y_test, rfc_smote_y_pred, average = 'weighted'))

In [None]:
print('f1 micro: ', sklearn.metrics.f1_score(y_test, rfc_smote_y_pred, average = 'micro'))
print('f1 macro: ', sklearn.metrics.f1_score(y_test, rfc_smote_y_pred, average = 'macro'))
print('f1 weighted: ', sklearn.metrics.f1_score(y_test, rfc_smote_y_pred, average = 'weighted'))

# XGBoost / REPLACE WITH NEURAL NET

In [None]:
xgbc = xgb.XGBClassifier()
xgbc.fit(X_train, y_train)

In [None]:
xgbc_y_pred = xgbc.predict(X_test)

In [None]:
print('f1 micro: ', sklearn.metrics.f1_score(y_test, xgbc_y_pred, average = 'micro'))
print('f1 macro: ', sklearn.metrics.f1_score(y_test, xgbc_y_pred, average = 'macro'))
print('f1 weighted: ', sklearn.metrics.f1_score(y_test, xgbc_y_pred, average = 'weighted'))

# Hyperparameter tuning

## RandomForest

In [None]:
# number of trees
n_estimators = [100, 200, 500, 1000]

# max number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']

# max number of levels in tree
max_depth = [10, 20, 30, 40, 50]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20]

# Minumum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10, 20]

grid_param = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

In [None]:
rfc_rnd_search = RandomForestClassifier(n_estimators = 100, bootstrap = True)

n_iter_search = 10
random_search = RandomizedSearchCV(rfc_rnd_search, param_distributions=grid_param,
                                   n_iter=n_iter_search)


#random_search.fit(prepro.X_train, prepro.y_train)

In [None]:
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)