## Importing Libraries

In [9]:
%matplotlib inline

import os
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## Specifying the path of the dataset

In [10]:
# Directory that holds the CSV files read in the loading cell.
# Set the STUDIENARBEIT_DATA_DIR environment variable to point the notebook at
# another location; when it is unset, the original local path is used.
DATA_DIR = Path(os.environ.get(
    'STUDIENARBEIT_DATA_DIR',
    '/Users/joscha/Library/Mobile Documents/com~apple~CloudDocs/Studienarbeit/Daten',
))

## Loading the data

In [11]:
# Read the feature table and the label table from DATA_DIR.
# Both files are indexed by their 'building_id' column.
train_values, train_labels = (
    pd.read_csv(DATA_DIR / csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv')
)

## Transforming the data
Transforming the categorical variables into dummy variables



In [12]:
# Replace the categorical feature columns with dummy (indicator) columns.
# NOTE: this rebinds train_values, so the raw frame is no longer available afterwards.
train_values = pd.get_dummies(data=train_values)

## Creating training and validation set
We are using 80% of the data for training and 20% for validation. The held-out 20% is stored in `test_values` / `test_labels`.

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; random_state fixes which rows land in each part.
# NOTE(review): the inputs are overwritten by the 80% training part, so running
# this cell a second time would split the already-reduced data again.
train_values, test_values, train_labels, test_labels = train_test_split(
    train_values,
    train_labels,
    test_size=0.2,
    random_state=42,
)


## Creating a pipeline

In [14]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [15]:
# Two-step pipeline: standardise the features, then fit a random forest.
# make_pipeline names each step after its lowercased class name, which is why
# the search grids use the 'randomforestclassifier__' prefix.
scaler = StandardScaler()
forest = RandomForestClassifier(random_state=2018)
pipe = make_pipeline(scaler, forest)
pipe

## Hyperparameter tuning via Grid Search

In [16]:
# Candidate values for the forest step; every combination (2 x 2) is evaluated
# with 5-fold cross-validation.
param_grid = dict(
    randomforestclassifier__n_estimators=[50, 100],
    randomforestclassifier__min_samples_leaf=[1, 5],
)
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [26]:
# Printing the best parameters found by GridSearchCV.
# best_params_ only exists once the search has been fitted. This cell sits above
# the "Training the model" section, so on a fresh top-to-bottom run the search
# is fitted here first instead of failing with an AttributeError.
if not hasattr(gs, 'best_params_'):
    gs.fit(train_values, train_labels.values.ravel())
gs.best_params_

{'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__n_estimators': 100}

## Hyperparameter tuning via Randomized Search

In [17]:
from scipy.stats import randint

# Integer distributions the random search samples from.
# scipy's randint(low, high) excludes the upper bound: 50..499 trees, 1..9 samples per leaf.
param_dist = {'randomforestclassifier__n_estimators': randint(50, 500),
              'randomforestclassifier__min_samples_leaf': randint(1, 10)}


# Use random search to find the best hyperparameters.
# random_state fixes which 10 candidates are drawn; without it every run samples
# different candidates and the reported best parameters are not reproducible.
rs = RandomizedSearchCV(pipe,
                        param_distributions=param_dist,
                        n_iter=10,
                        cv=5,
                        random_state=42)

In [27]:
# Printing the best parameters found by RandomizedSearchCV.
# best_params_ only exists once the search has been fitted. This cell sits above
# the "Training the model" section, so on a fresh top-to-bottom run the search
# is fitted here first instead of failing with an AttributeError.
if not hasattr(rs, 'best_params_'):
    rs.fit(train_values, train_labels.values.ravel())
rs.best_params_

{'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__n_estimators': 318}

## Training the model

In [18]:
# Run the grid search on the training split; the labels are flattened to a 1-D array first.
y_train = train_labels.values.ravel()
gs.fit(train_values, y_train)


In [25]:
# Run the randomized search on the training split; the labels are flattened to a 1-D array first.
y_train = train_labels.values.ravel()
rs.fit(train_values, y_train)

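## Making predictions
Micro-averaged F1 scores are printed in this order: grid search on the training set, grid search on the held-out set, randomized search on the training set, randomized search on the held-out set.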

In [23]:
from sklearn.metrics import f1_score

# Score both fitted searches: grid search first, then randomized search.
# For each one the training-split score is printed before the held-out score,
# so four micro-averaged F1 values are printed in total.
for search in (gs, rs):
    training_preds = search.predict(train_values)
    print(f1_score(train_labels, training_preds, average='micro'))

    test_preds = search.predict(test_values)
    print(f1_score(test_labels, test_preds, average='micro'))

0.9867900997697621
0.7131098789355539
0.7971987720644665
0.7176378043399014
