## Importing Libraries

In [29]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

## Specifying the path of the dataset

In [30]:
# Directory that holds the competition CSV files. The path is relative, so it
# is resolved against the directory the notebook kernel is running in.
DATA_DIR = Path('Dataset Competition')

## Loading the data

In [31]:
# Read the feature table and the label table. Both files are indexed by the
# 'building_id' column so that rows can be matched between them later on.
train_values, train_labels = (
    pd.read_csv(DATA_DIR / csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv')
)

## Transforming the data
Transforming the categorical variables into dummy variables



In [32]:
# One-hot encode the categorical columns of the feature table; columns that
# are already numeric are passed through by pd.get_dummies.
train_values = train_values.pipe(pd.get_dummies)

## Creating training and validation sets
We are using 80% of the data for training and 20% for validation.

In [33]:
from sklearn.model_selection import train_test_split

# NOTE: this cell rebinds train_values / train_labels to the 80% training
# portion. Running it a second time without reloading the data would split the
# already-reduced training set again.
train_values, test_values, train_labels, test_labels = train_test_split(
    train_values,
    train_labels,
    test_size=0.2,    # hold out 20% of the rows for validation
    random_state=42,  # fixed seed so the split is reproducible
)


## Removing outliers

In [34]:
# Remove outliers in the 'age' column.
variable = 'age'
threshold = 250

# Keep only the rows whose value in `variable` does not exceed `threshold`.
within_threshold = train_values[variable].le(threshold)
filtered_train_values = train_values.loc[within_threshold]

# Keep the label rows of the same building ids so that features and labels
# still describe the same set of buildings.
labels_kept = train_labels.index.isin(filtered_train_values.index)
filtered_train_labels = train_labels.loc[labels_kept]

## Applying SMOTE

In [35]:
# Apply SMOTE to oversample the under-represented classes of the training data.
#
# The resampling runs on the outlier-filtered frames produced by the previous
# step. Resampling the unfiltered train_values / train_labels instead would
# silently discard the outlier removal.
#
# NOTE(review): no later cell visible in this notebook consumes
# train_values_smote / train_labels_smote; the model is still fitted on
# train_values. Confirm whether the resampled data is meant to be used, and if
# so do the resampling inside the cross-validation folds rather than before it.
sm = SMOTE(random_state=42)
train_values_smote, train_labels_smote = sm.fit_resample(
    filtered_train_values,
    filtered_train_labels.values.ravel(),  # flatten the single-column label frame to 1-D
)

KeyboardInterrupt: 

## Creating a pipeline

In [36]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [63]:
# Chain feature standardisation and the random-forest classifier into a single
# estimator. make_pipeline names each step after its lowercased class name,
# hence the 'randomforestclassifier__' prefix on the tuning parameters.
pipe = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=2018),  # fixed seed for reproducible forests
)
pipe

## Hyperparameter tuning via Grid Search

In [64]:
# Candidate values for the random-forest step of `pipe`. Only n_estimators and
# max_features list more than one value, so the grid holds 2 x 2 = 4 settings.
param_grid = {
    'randomforestclassifier__n_estimators': [750, 1000],
    'randomforestclassifier__min_samples_leaf': [14],
    'randomforestclassifier__max_features': ['sqrt', 30],
    'randomforestclassifier__max_depth': [21],
    'randomforestclassifier__min_samples_split': [6],
    'randomforestclassifier__criterion': ['entropy'],
}

# Exhaustive search over every setting in param_grid, scored with 5-fold
# cross-validation.
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

## Hyperparameter tuning via Randomized Search

In [None]:
# from scipy.stats import randint

# param_dist = {'randomforestclassifier__n_estimators': randint(50,500),
#               'randomforestclassifier__min_samples_leaf': randint(1,10)}


# # Use random search to find the best hyperparameters
# rs = RandomizedSearchCV(pipe, 
#                                  param_distributions = param_dist, 
#                                  n_iter=10, 
#                                  cv=5)

In [None]:
# Printing the best parameters found by RandomizedSearchCV
# rs.best_params_

## Training the model

In [65]:
# Run the grid search on the training split. fit() returns the fitted search
# object itself, so rf_model is simply another name for gs.
gs.fit(train_values, train_labels.values.ravel())
rf_model = gs


In [66]:
# Show the hyperparameter setting that GridSearchCV selected as best.
gs.best_params_

{'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_depth': 21,
 'randomforestclassifier__max_features': 30,
 'randomforestclassifier__min_samples_leaf': 14,
 'randomforestclassifier__min_samples_split': 6,
 'randomforestclassifier__n_estimators': 1000}

In [None]:
# rs.fit(train_values, train_labels.values.ravel())

### Sequential Model-Based Optimization (SMBO)

In [69]:
import numpy as np
np.random.seed(123)
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Bayesian hyperparameter search over a random forest.
# In the search space a (low, high) tuple is an integer range and a list is a
# set of categorical choices.
#
# Each key may appear only once in the dict literal: Python keeps the LAST
# value of a repeated key without any warning. This dict used to contain
# 'max_features' twice, so the categorical entry ['sqrt', 'log2', None] was
# silently overridden by the integer range (20, 30). Only the effective entry
# is kept here, so the search space is unchanged.
opt = BayesSearchCV(
    RandomForestClassifier(),
    {
        'n_estimators': (100, 101),    # very narrow range: the forest size is effectively fixed
        'max_depth': (1, 50),
        'min_samples_split': (2, 100),
        'min_samples_leaf': (1, 50),
        'max_features': (20, 30),      # number of features considered per split
        'criterion': ['gini', 'entropy'],
    },
    n_iter=32,  # number of parameter settings sampled by the optimiser
    cv=3        # 3-fold cross-validation per setting
)

opt.fit(train_values, train_labels.values.ravel())

# Best mean cross-validation score, the score on the hold-out split, and the
# parameter setting that produced the best cross-validation score.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(test_values, test_labels))
print("best params: %s" % str(opt.best_params_))

## Making predictions

In [67]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Predict on the 20% hold-out split and report the micro-averaged F1 score.
test_preds = rf_model.predict(test_values)
micro_f1 = f1_score(y_true=test_labels, y_pred=test_preds, average='micro')
print(micro_f1)



0.7281709867423879
