In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

import seaborn as sns

from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier

In [None]:
# Define the competition scorer
def competition_scorer(y_true, y_pred, labels=None):
    """Weighted multi-class log-loss used to rank submissions.

    Every sample is weighted by ``10 ** label``, so a mistake on a higher
    label costs ten times more than the same mistake one label below.

    Parameters
    ----------
    y_true : array-like of int, shape (n_samples,)
        True labels. Lists, NumPy arrays and pandas Series are accepted.
    y_pred : array-like of float, shape (n_samples, n_classes)
        Predicted class probabilities, as returned by ``predict_proba``.
    labels : array-like, optional
        Forwarded to ``sklearn.metrics.log_loss``. Pass the full list of
        classes when ``y_true`` may not contain every class (e.g. a small
        validation fold); when omitted, the classes are inferred from
        ``y_true`` as before.

    Returns
    -------
    float
        The weighted log-loss (lower is better).
    """
    # Coerce first so that plain Python lists also support `10 ** y_true`.
    y_true = np.asarray(y_true)
    return log_loss(y_true, y_pred, sample_weight=10 ** y_true, labels=labels)

# Information about the challenge

In this challenge, the `requests` dataset contains information about the requests made by groups of individuals (or families) to the French public emergency housing service. Each row (sample) of the `requests` dataset corresponds to a unique request.

The goal is to predict the categorical variable `granted_number_of_nights`, which represents the number of nights of emergency housing granted to a group. You can train your model on the `train_requests` dataset; the predictions should be made for the requests listed in the `test_requests` dataset.

The evaluation metric is given by the `competition_scorer` defined above. It is a weighted log-loss in which a sample is weighted 1, 10, 100, or 1000 when `granted_number_of_nights` takes the value 0, 1, 2, or 3, respectively. Beware: classification mistakes made on the higher labels are therefore penalized much more heavily.

Good luck!

# Load the datasets

In [None]:
# Read the train sample, then the test sample, with identical parser options
# (comma-separated, whole-file type inference, malformed lines not raising).
requests, requests_test = (
    pd.read_csv(csv_path, sep=',', low_memory=False, error_bad_lines=False)
    for csv_path in ('data/train_requests.csv', 'data/test_requests.csv')
)

# Data description

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too),
# transposed so that each dataset column is shown as one row.
requests.describe(include='all').T

In [None]:
# Inspect basic metadata about the training dataset (printed by DataFrame.info)
requests.info()

# Data cleaning

In [None]:
# Impute missing values with a fixed default per column.
# A single DataFrame.fillna call with a {column: value} mapping replaces the
# `requests[col].fillna(..., inplace=True)` pattern: that chained form works on
# an intermediate Series, so pandas does not guarantee the change reaches
# `requests` (it warns about it, and it is a no-op under copy-on-write).
# Rebinding the name keeps the cell safe to re-run.
requests = requests.fillna({
    'child_to_come': 'f',
    'group_composition_label': 'man alone',
    'group_type': 'individual',
    'housing_situation_label': 'street',
    'victim_of_violence': 'f',
})

In [None]:
# Quick look at the distributions: one 40-bin histogram per column plotted by DataFrame.hist
requests.hist(bins=40, figsize=(18, 15))
plt.show()

# Select a subset

In [None]:
# Build a reduced working set holding one tenth of the rows of each target
# class (0, 1, 2 and 3 nights), stacked in that order.
#
# Rows are drawn with DataFrame.sample and a fixed seed. The previous indexing
# with np.random.permutation(len // 10) only permuted positions
# 0 .. len // 10 - 1, i.e. it always kept the *first* tenth of each class, and
# being unseeded it produced a different row order on every run.
# pd.concat replaces the chained DataFrame.append calls, which pandas has
# deprecated and later removed.
sub_requests = pd.concat([
    class_rows.sample(n=len(class_rows) // 10, random_state=42)
    for class_rows in (
        requests[requests['granted_number_of_nights'] == nights]
        for nights in (0, 1, 2, 3)
    )
])

In [None]:
# Explanatory variables fed to every model below.
columns = [
    'child_situation',
    'district',
    'housing_situation_id',
    'group_composition_id',
    'number_of_underage',
]

# Features and target are taken from the reduced sample; point both lookups at
# `requests` instead to work on the full training set.
X, y = sub_requests[columns], sub_requests['granted_number_of_nights']

In [None]:
# split between the train and the validation samples
# (33% of the rows are held out for validation; random_state fixes the shuffle)
# NOTE(review): the split is not stratified on y - confirm that every class is
# present in both parts, since the scorer weights the higher labels heavily.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

# 4- Train and evaluate a first model

Logistic regression

In [None]:
# use logistic regression as first model (baseline, default regularisation, fixed seed)
lr_clf = LogisticRegression(solver='liblinear', multi_class='auto', random_state=42)

In [None]:
# fit the model on the training part of the split
lr_clf.fit(X_train, y_train)

In [None]:
# evaluate the model with the competition scorer (validation set);
# the score is a weighted log-loss, so lower is better
competition_scorer(y_val, lr_clf.predict_proba(X_val))

SVM

In [None]:
# Search space for the RBF-kernel SVC: only the regularisation strength C
# varies. probability=True makes the fitted model expose predict_proba, which
# the competition scorer needs.
svr_params = dict(
    kernel=['rbf'],
    gamma=['auto'],
    C=[1, 10, 50, 100],
    probability=[True],
)

# 10-fold stratified grid search on all available cores.
# NOTE(review): candidates are ranked by accuracy, not by the weighted
# log-loss that the competition actually uses.
svc_clf = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=svr_params,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
)
svc_clf.fit(X_train, y_train)

In [None]:
# Competition score (validation set) of the best SVC found by the grid search
competition_scorer(y_val, svc_clf.best_estimator_.predict_proba(X_val))

Decision Trees

In [None]:
# Search space for the decision tree: number of features considered per split
# and the minimum sample counts required to split a node / to form a leaf.
decisionTree_params = dict(
    max_features=[1, 3, 4],
    min_samples_split=[2, 3, 7, 10],
    min_samples_leaf=[1, 3, 7, 10],
)

# 10-fold stratified grid search on all available cores, ranked by accuracy.
dt_clf = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=decisionTree_params,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
)
dt_clf.fit(X_train, y_train)

In [None]:
# Show the decision tree configuration selected by the grid search
dt_clf.best_estimator_

In [None]:
# Competition score (validation set) of the best decision tree found by the grid search
competition_scorer(y_val, dt_clf.best_estimator_.predict_proba(X_val))

Random Forest

In [None]:
# Search space for the random forest. Bootstrapping is switched off and the
# split criterion is fixed to gini; the forest size and the per-split /
# per-leaf constraints are what gets tuned.
rdmFrst_params = dict(
    max_features=[2, 4],
    min_samples_split=[2, 7, 10],
    min_samples_leaf=[2, 8, 10],
    bootstrap=[False],
    n_estimators=[220, 230],
    criterion=['gini'],
)

# 10-fold stratified grid search on all available cores, ranked by accuracy.
rdmFrst_clf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rdmFrst_params,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
)
rdmFrst_clf.fit(X_train, y_train)

In [None]:
# Show the random forest configuration selected by the grid search
rdmFrst_clf.best_estimator_

In [None]:
# Competition score (validation set) of the best random forest found by the grid search
competition_scorer(y_val, rdmFrst_clf.best_estimator_.predict_proba(X_val))

Adaboost

In [None]:
# Inspired by https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling
# Search space for AdaBoost over decision trees. The `base_estimator__*` keys
# tune the wrapped tree; the remaining keys tune the booster itself.
ada_params = {
    'base_estimator__criterion': ['gini', 'entropy'],
    'base_estimator__splitter': ['best', 'random'],
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators': [50, 55],
    'learning_rate': [0.05, 0.1],
}

# random_state is set on the booster as well as on the tree: AdaBoost draws the
# seed of each boosting round's base estimator from its own random_state, so
# seeding only the tree left this search non-reproducible, unlike every other
# model in the notebook.
# 10-fold stratified grid search on all available cores, ranked by accuracy.
ada_clf = GridSearchCV(
    AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42),
    param_grid=ada_params,
    cv=StratifiedKFold(n_splits=10),
    scoring='accuracy',
    n_jobs=-1,
)

ada_clf.fit(X_train, y_train)

In [None]:
# Show the AdaBoost configuration selected by the grid search
ada_clf.best_estimator_

In [None]:
# Competition score (validation set) of the best AdaBoost model found by the grid search
competition_scorer(y_val, ada_clf.best_estimator_.predict_proba(X_val))

Extra Trees

In [None]:
# Search space for the extra-trees ensemble. Depth is unbounded, bootstrapping
# is off and the criterion is fixed to gini; the ensemble size and the
# per-split / per-leaf constraints are what gets tuned.
extraTree_params = dict(
    max_depth=[None],
    max_features=[2, 4],
    min_samples_split=[2, 10],
    min_samples_leaf=[2, 10],
    bootstrap=[False],
    n_estimators=[100, 200, 300],
    criterion=['gini'],
)

# 10-fold stratified grid search on all available cores, ranked by accuracy.
extraTree_clf = GridSearchCV(
    estimator=ExtraTreesClassifier(random_state=42),
    param_grid=extraTree_params,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
)
extraTree_clf.fit(X_train, y_train)

In [None]:
# Show the extra-trees configuration selected by the grid search
extraTree_clf.best_estimator_

In [None]:
# Competition score (validation set) of the best extra-trees model found by the grid search
competition_scorer(y_val, extraTree_clf.best_estimator_.predict_proba(X_val))

Gradient Boosting

In [None]:
# Search space for gradient boosting.
# The loss is deliberately left at the estimator's default (the multinomial
# log-loss). The grid used to pin it as 'deviance', an alias of that default
# which scikit-learn deprecated and then removed; leaving the key out fits the
# same models on every scikit-learn version.
gbrt_params = {'n_estimators': [300, 400],
               'learning_rate': [0.04, 0.05, 0.1],
               'max_depth': [4, 5],
               'min_samples_leaf': [2, 3, 5],
               'max_features': [4]}

# 10-fold stratified grid search on all available cores, ranked by accuracy.
gbrt_clf = GridSearchCV(GradientBoostingClassifier(random_state=42),
                        gbrt_params,
                        cv=StratifiedKFold(n_splits=10),
                        scoring='accuracy',
                        n_jobs=-1)
gbrt_clf.fit(X_train, y_train)

In [None]:
# Show the gradient boosting configuration selected by the grid search
gbrt_clf.best_estimator_

In [None]:
# Competition score (validation set) of the best gradient boosting model found by the grid search
competition_scorer(y_val, gbrt_clf.best_estimator_.predict_proba(X_val))

Voting classifier

In [None]:
# Soft-voting ensemble: the predicted class probabilities of the members are
# averaged.
# Each tuned model enters through its grid search's `best_estimator_`, not
# through the GridSearchCV wrapper. VotingClassifier clones and refits every
# member, so handing it the wrappers made this one cell repeat all the grid
# searches above before fitting the ensemble.
# The SVC grid search (svc_clf) is not part of the ensemble.
voting_soft_clf = VotingClassifier(
    estimators=[
        ('lr', lr_clf),
        ('dt', dt_clf.best_estimator_),
        ('rf', rdmFrst_clf.best_estimator_),
        ('ada', ada_clf.best_estimator_),
        ('extraTree', extraTree_clf.best_estimator_),
        ('gbrt', gbrt_clf.best_estimator_),
    ],
    voting='soft'
)

voting_soft_clf.fit(X_train, y_train)

In [None]:
# Competition score (validation set) of the soft-voting ensemble
competition_scorer(y_val, voting_soft_clf.predict_proba(X_val))

# 5- Compute predictions on the test set 

In [None]:
# use the model to predict on the test set
# (same feature columns as in training; one probability column per class)
# NOTE(review): the predictions come from the AdaBoost grid search specifically,
# although several other models and the voting ensemble were scored above -
# confirm this is the intended final model.
X_test = requests_test[columns]
y_pred = ada_clf.predict_proba(X_test)

In [None]:
# overview of prediction probabilities for first four rows (one column per class)
y_pred[:4]

In [None]:
# Submission frame: the request identifier followed by one probability column
# per class (columns are labelled by position: 0, 1, 2, ...).
# The probability frame is given the test frame's own index so that the
# column-wise concat pairs rows explicitly, instead of relying on both sides
# happening to carry the same default index.
predictions = pd.concat(
    [requests_test['request_id'], pd.DataFrame(y_pred, index=requests_test.index)],
    axis=1,
)

# 6- Submit your predictions to the QScore platform

In [None]:
import io, math, os, tempfile

# The HTTP client is imported under an alias: a plain `import requests` rebinds
# the notebook-level name `requests`, which holds the training DataFrame, and
# breaks any earlier cell that is re-run afterwards.
import requests as http_requests

# Get your token from qscore:
# 1. Go to https://qscore.datascience-olympics.com/
# 2. Choose the competition Data Science Olympics 2019
# 3. In the left menu click 'Submissions'
# 4. Your token is in the 'Submit from your Python Notebook' tab
# Export it as the QSCORE_TOKEN environment variable (or pass `token=...`);
# do not paste it into the notebook, where it would be shared with the file.

def submit_prediction(df, sep=',', comment='', compression='gzip', token=None, **kwargs):
    """Upload a predictions frame to the QScore submission API.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions to submit; serialised with ``DataFrame.to_csv``.
    sep : str, default ','
        Field separator used for the CSV.
    comment : str, default ''
        Free-text comment sent along with the submission.
    compression : str, default 'gzip'
        Compression applied to the CSV and reported to the API.
    token : str, optional
        QScore API token. Falls back to the ``QSCORE_TOKEN`` environment
        variable when omitted.
    **kwargs
        Extra keyword arguments forwarded to ``DataFrame.to_csv``
        (for instance ``index=False``).

    Raises
    ------
    Exception
        If no token is available, if the API answers 429 (submissions too
        close together), or if it answers any status other than 200.
    """
    URL = 'https://qscore.datascience-olympics.com/api/submissions'

    token = token or os.environ.get('QSCORE_TOKEN')
    if not token:
        raise Exception('No QScore token available: set the QSCORE_TOKEN environment variable or pass token=...')

    # Serialise into a throw-away directory so nothing is left behind in the
    # working directory. The file keeps its original name because the HTTP
    # client sends the base name as the upload's filename.
    with tempfile.TemporaryDirectory() as workdir:
        datafile_path = os.path.join(workdir, 'temporary.dat')
        df.to_csv(datafile_path, sep=sep, compression=compression, **kwargs)
        # The context manager closes the handle even if the request fails.
        with open(datafile_path, 'rb') as datafile:
            r = http_requests.post(
                URL,
                headers={'Authorization': 'Bearer {}'.format(token)},
                files={'datafile': datafile},
                data={'comment': comment, 'compression': compression},
            )

    if r.status_code == 429:
        raise Exception('Submissions are too close. Next submission is only allowed in {} seconds.'.format(int(math.ceil(int(r.headers['x-rate-limit-remaining']) / 1000.0))))
    if r.status_code != 200:
        raise Exception(r.text)

In [None]:
# Upload the predictions; index=False is forwarded to DataFrame.to_csv so the
# row index is not written to the submitted file.
submit_prediction(predictions, sep=',', index=False, comment='my submission')