# Features and Model
This notebook prepares features, then trains and tests models to classify housing assistance requests using the given datasets.

This is a multi-class classification task

The metric to optimize and the datasets description are explained in DS_technical_test_tutorial.ipynb Jupyter Notebook.

## import

In [149]:
%matplotlib inline
import datetime
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

plt.style.use('seaborn-whitegrid')
sns.set(color_codes=True, font_scale=1.33)

## Definitions

In [2]:
# data cleaned
PATH_REQ_TRAIN_CLEAN = 'data/requests_train_clean.csv'
PATH_REQ_TEST_CLEAN = 'data/requests_test_clean.csv'

# Features
PATH_FS1_TRAIN = 'data/fs1_train.csv'
PATH_FS1_TEST = 'data/fs1_test.csv'

## Useful functions

In [3]:
# Define the test scorer
def competition_scorer(y_true, y_pred):
    '''
    Competition metric : log-loss weighted by the true class.

    Each sample is weighted by 10**y_true, so a sample whose true label
    is 3 weighs 1000 times more than a sample whose true label is 0.

    - input :
        y_true : array-like of int : true class labels
        y_pred : array-like : predicted probabilities, passed unchanged
                 to sklearn's log_loss
    - output :
        float : weighted log-loss returned by log_loss
    '''
    # np.asarray makes the power element-wise for plain lists/tuples too,
    # not only for numpy arrays and pandas Series.
    weights = 10 ** np.asarray(y_true)
    return log_loss(y_true, y_pred, sample_weight=weights)


def display_missing(df):
    '''
    Function to display missing values into dataframe.
    Creates a barplots figure and outputs values.

    - input : 
        df : dataframe : data to check
    - output : 
        df_missing : dataframe : table of missing values, one row per
                     column of df (index cast to str), sorted by
                     increasing number of missing values, with columns
                     "nb_missing" (count) and "ratio" (percentage of rows)
    '''
    nb_lignes = df.shape[0]
    # count the missing values per column, smallest first
    df_missing = (df.isnull().sum()
                    .sort_values()
                    .to_frame(name="nb_missing")
                    .rename(index=str))
    df_missing["ratio"] = 100*df_missing["nb_missing"] / nb_lignes

    # bar labels : "<column name> [<number of missing values>]"
    liste_name_bars_num = ["{} [{}]".format(name, nb)
                           for name, nb in zip(df_missing.index,
                                               df_missing["nb_missing"])]

    # explicit figure/axes instead of relying on the current pyplot figure
    fig, ax = plt.subplots(figsize=(7, 7))
    sns.barplot(y=liste_name_bars_num, x='ratio', data=df_missing, ax=ax)
    ax.set_title('Missing lines Ratio over {} lines'.format(nb_lignes))
    ax.set(xlabel='repartition [%]')
    ax.set_xlim([0, 100])
    return df_missing


def plot_hist_cat(x, data, title):
    '''
    Draw a count plot of the categorical column x of data, bars ordered
    by decreasing frequency (missing values included in the counts).
    Each bar is annotated with its proportion of the rows of data.

    - input :
        x : str : name of the categorical column
        data : dataframe : data holding the column
        title : str : title of the figure
    '''
    counts = data[x].value_counts(dropna=False)
    shares = counts.values / data[x].shape[0]

    sns.catplot(x=x, kind="count", palette="ch:.25", data=data,
                height=7, aspect=1.7, order=counts.index)
    # catplot creates its own figure : fetch the axes it has just drawn on
    ax = plt.gcf().gca()

    # write the proportion at the top of each bar
    for position, (bar_height, share) in enumerate(zip(counts.values, shares)):
        ax.text(position, bar_height, '{:.2f}'.format(share))

    ax.set_title(title)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    
def plot_cont(X,Y,data):
    '''
    Plot a heatmap of the Chi-2 contributions of the contingency table
    between the categorical columns X and Y.

    Each cell is coloured by its share of the total Chi-2 statistic,
    (observed - expected)**2 / expected, and annotated with the observed
    count.

    - input :
        X : str : name of the column used for the rows
        Y : str : name of the column used for the columns
        data : dataframe : data holding both columns
    - output :
        None : the figure is displayed with plt.show()
    '''
    # observed counts ; crosstab puts 0 for (X, Y) pairs never observed
    c = pd.crosstab(data[X], data[Y]).astype(float)

    # marginal counts, then expected counts under independence
    tx = data[X].value_counts()
    ty = data[Y].value_counts()
    n = len(data)
    indep = pd.DataFrame(np.outer(tx.values, ty.values) / n,
                         index=tx.index, columns=ty.index)
    # same row/column order as the observed table, so that the heatmap
    # values and their annotations are guaranteed to line up
    indep = indep.reindex(index=c.index, columns=c.columns)

    # contribution of each cell to the Chi-2 statistic
    mesure = (c-indep)**2/indep
    xi_n = mesure.sum().sum()
    # plot
    fig, ax = plt.subplots(figsize = (5,11.75))
    sns.heatmap(mesure/xi_n, annot=c, fmt=".0f", ax=ax)
    plt.show()
    
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues, size=16, fontsize=12, 
                          threshold=None):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    source : 
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    - input :
        y_true : array-like : true labels
        y_pred : array-like : predicted labels
        classes : array-like : class names, indexed by the label values
                  (labels are used as positions into this array)
        normalize : bool : divide each row by its sum before display
        title : str : figure title (a default one is built when empty)
        cmap : matplotlib colormap used for the cells
        size : number : width and height of the figure, in inches
        fontsize : number : font size of tick labels and annotations
        threshold : number or None : when given, keep only the classes
                    involved in at least one off-diagonal cell whose
                    count is strictly greater than threshold
    - output :
        ax : matplotlib axes holding the plot
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    # (np.asarray lets callers pass a plain list of names)
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]

    # extract most important confusion if threshold
    if threshold is not None:
        confused = cm > threshold
        # the diagonal holds correct predictions, not confusions
        np.fill_diagonal(confused, False)
        rows, cols = np.nonzero(confused)
        list_idx = np.unique(np.concatenate([rows, cols]))
        if list_idx.size > 0:
            cm = cm[np.ix_(list_idx, list_idx)]
            classes = classes[list_idx]
        else:
            # nothing exceeds the threshold : an empty matrix cannot be
            # drawn, so the full matrix is kept
            print('No confusion above threshold {} : '
                  'full matrix displayed'.format(threshold))

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    #ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor", fontsize=fontsize)
    # resize y tick labels
    plt.setp(ax.get_yticklabels(), fontsize=fontsize)
    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # cells darker than half of the maximum get white text
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=fontsize,
                    fontweight="black" if i==j else "normal")
    fig.tight_layout()

    fig.set_size_inches([size,size])
    # the y axis spans the rows ; explicit inverted limits keep the
    # first and last rows fully visible
    ax.set_ylim([cm.shape[0]-0.5,-0.5])
    ax.grid(linewidth=0)
    return ax

## Load data

In [4]:
# Reading options shared by the two cleaned request files.
# NOTE(review): error_bad_lines is deprecated since pandas 1.3
# (on_bad_lines='skip' replaces it) - adapt when upgrading pandas.
clean_read_opts = dict(sep=',', low_memory=False, error_bad_lines=False)

requests_train = pd.read_csv(PATH_REQ_TRAIN_CLEAN, **clean_read_opts)
requests_test = pd.read_csv(PATH_REQ_TEST_CLEAN, **clean_read_opts)

requests_train.head()

Unnamed: 0,request_id,animal_presence,answer_creation_date,child_situation,child_to_come,district,granted_number_of_nights,group_composition_id,group_composition_label,group_creation_date,...,age_mean,disabled_worker_certification,nb_female,nb_male,married,request_creation_date_fmt,request_creation_day,request_creation_day_num,request_creation_month,child_situation_red
0,130667d2964de43c511d0ead7ac66b33,f,2019-02-11 22:30:00,-1,unknown,85,1,10,man alone,2018-05-03 12:10:40.416,...,42.0,f,0.0,1.0,f,2019-02-11 22:30:00,Mon,1,2,0
1,b1cd62fcf24eebb0f685d5eaf55317a8,f,2019-02-12 09:00:00,-1,unknown,85,1,10,man alone,2019-03-07 12:11:08.545,...,39.0,f,0.0,1.0,f,2019-02-12 09:00:00,Tue,2,2,0
2,12cc0a18890bd4959fe42df3ae58838e,f,2019-02-01 21:00:00,-1,unknown,59,1,80,isolated child/underage,2019-03-13 12:54:15.887,...,16.0,f,0.0,1.0,f,2019-02-01 21:00:00,Fri,5,2,0
3,ae2d5b4dc181d29e430132f145da1556,f,2019-02-25 15:12:05.037,-1,unknown,50,2,20,woman alone,2018-10-09 14:37:29.773,...,33.0,f,1.0,0.0,f,2019-02-25 15:12:05,Mon,1,2,0
4,d13a17ce36c832514fda2464e11c4a9f,f,2019-02-20 09:59:20.603,-1,unknown,93,2,60,group with child(ren),2018-10-02 10:47:36.931,...,32.0,f,2.0,1.0,f,2019-02-20 09:59:20,Wed,3,2,0


In [5]:
requests_train.shape

(238191, 41)

In [6]:
requests_test.shape

(59548, 41)

In [7]:
# Raw (not cleaned) request files, read with the same options as the
# cleaned ones.
raw_read_opts = dict(sep=',', low_memory=False, error_bad_lines=False)

requests_train_raw = pd.read_csv('data/requests_train.csv', **raw_read_opts)
requests_test_raw = pd.read_csv('data/requests_test.csv', **raw_read_opts)

In [8]:
requests_train_raw.shape

(238191, 24)

In [9]:
requests_test_raw.shape

(59548, 24)

## Features selection

### FS1

Take all relevant features:

In [10]:
requests_train.columns.tolist()

['request_id',
 'animal_presence',
 'answer_creation_date',
 'child_situation',
 'child_to_come',
 'district',
 'granted_number_of_nights',
 'group_composition_id',
 'group_composition_label',
 'group_creation_date',
 'group_id',
 'group_main_requester_id',
 'group_type',
 'housing_situation_id',
 'housing_situation_label',
 'long_term_housing_request',
 'number_of_underage',
 'request_backoffice_creator_id',
 'request_creation_date',
 'requester_type',
 'social_situation_id',
 'town',
 'victim_of_violence',
 'victim_of_violence_type',
 'housing_situation_2_label',
 'pregnancy',
 'region',
 'childcare_center_supervision',
 'nb_underage_red',
 'age_max',
 'age_min',
 'age_mean',
 'disabled_worker_certification',
 'nb_female',
 'nb_male',
 'married',
 'request_creation_date_fmt',
 'request_creation_day',
 'request_creation_day_num',
 'request_creation_month',
 'child_situation_red']

In [11]:
# Columns kept from the cleaned requests : the target
# (granted_number_of_nights) followed by the candidate features.
# Trailing comments give the kind of each column
# (int, bool, cat = categorical to be one-hot encoded).
list_filter_raw = ['granted_number_of_nights', # int (target)
                'animal_presence', # bool stored as 't'/'f'
                'group_composition_label', # cat 
                'housing_situation_label', # cat (21 values)
                'long_term_housing_request', # cat (3)
                'requester_type', # cat (3)
                'victim_of_violence_type', # cat
                'pregnancy', # bool stored as 't'/'f'
                'region', # cat
                'childcare_center_supervision', # bool stored as 't'/'f'
                'nb_underage_red', # int
                'age_min', # int
                'age_max', # int
                'disabled_worker_certification', # bool stored as 't'/'f'
                'nb_female', # int
                'nb_male', # int
                'married', # bool stored as 't'/'f'
                'request_creation_day_num', # ordinal cat int
                'request_creation_month',  # ordinal cat int
                'child_situation_red'] # bool 

# Features usable as they are ; the encoded columns built in the
# following cells are appended to this list.
list_feat = ['nb_underage_red', 
             'age_min', 
             'age_max', 
             'nb_female',
             'nb_male',
             'request_creation_day_num',
             'request_creation_month',
             'child_situation_red']

In [12]:
df_train = requests_train.filter(items=list_filter_raw)
df_test = requests_test.filter(items=list_filter_raw)
df_train.head()

Unnamed: 0,granted_number_of_nights,animal_presence,group_composition_label,housing_situation_label,long_term_housing_request,requester_type,victim_of_violence_type,pregnancy,region,childcare_center_supervision,nb_underage_red,age_min,age_max,disabled_worker_certification,nb_female,nb_male,married,request_creation_day_num,request_creation_month,child_situation_red
0,1,f,man alone,street,unknown,third party,f,f,Provence-Alpes-Côte d'Azur,f,0,42.0,42.0,f,0.0,1.0,f,1,2,0
1,1,f,man alone,street,unknown,user,f,f,Provence-Alpes-Côte d'Azur,f,0,39.0,39.0,f,0.0,1.0,f,2,2,0
2,1,f,isolated child/underage,street,unknown,user,f,f,Grand Est,f,0,16.0,16.0,f,0.0,1.0,f,5,2,0
3,2,f,woman alone,street,unknown,user,woman,f,Bretagne,f,0,33.0,33.0,f,1.0,0.0,f,1,2,0
4,2,f,group with child(ren),hotel paid by the emergency centre,unknown,user,f,f,Île-de-France,f,0,17.0,40.0,f,2.0,1.0,f,3,2,0


#### animal_presence : str -> bol

In [13]:
def fun_str_to_bol(str_in):
    '''
    Convert a 't'/'f' flag into an integer flag.

    - input :
        str_in : value to convert
    - output :
        int : 1 when str_in equals 't', 0 for any other value
    '''
    return 1 if str_in == 't' else 0


df_train['animal_presence_bol'] = \
    df_train['animal_presence'].apply(fun_str_to_bol)
df_test['animal_presence_bol'] = \
    df_test['animal_presence'].apply(fun_str_to_bol)

list_feat.append('animal_presence_bol')

#### group_composition_label : one hot encoding

In [14]:
df_add = pd.get_dummies(df_train["group_composition_label"], 
               prefix='gcl', drop_first=True)
list_feat = list_feat +  df_add.columns.tolist()

In [15]:
df_tmp = pd.concat([df_train, df_add], axis=1)

In [16]:
df_train.shape

(238191, 21)

In [17]:
df_tmp.shape

(238191, 32)

In [18]:
df_train = df_tmp.copy()

In [19]:
df_train.head()

Unnamed: 0,granted_number_of_nights,animal_presence,group_composition_label,housing_situation_label,long_term_housing_request,requester_type,victim_of_violence_type,pregnancy,region,childcare_center_supervision,...,gcl_couple with child(ren),gcl_couple without whildren,gcl_group of adults,gcl_group with child(ren),gcl_isolated child/underage,gcl_man alone,gcl_single father with child(ren),gcl_single mother with child(ren),gcl_woman alone,gcl_women victim of violence
0,1,f,man alone,street,unknown,third party,f,f,Provence-Alpes-Côte d'Azur,f,...,0,0,0,0,0,1,0,0,0,0
1,1,f,man alone,street,unknown,user,f,f,Provence-Alpes-Côte d'Azur,f,...,0,0,0,0,0,1,0,0,0,0
2,1,f,isolated child/underage,street,unknown,user,f,f,Grand Est,f,...,0,0,0,0,1,0,0,0,0,0
3,2,f,woman alone,street,unknown,user,woman,f,Bretagne,f,...,0,0,0,0,0,0,0,0,1,0
4,2,f,group with child(ren),hotel paid by the emergency centre,unknown,user,f,f,Île-de-France,f,...,0,0,0,1,0,0,0,0,0,0


In [20]:
# Encode the test set on exactly the 'gcl_' dummy columns already present in
# df_train (where drop_first was applied) : a category absent from the test
# data becomes an all-zero column and a category unseen in train is dropped,
# so both frames always expose the same encoded columns.
gcl_cols = [col for col in df_train.columns if col.startswith('gcl_')]
df_test = pd.concat([df_test, 
          pd.get_dummies(df_test["group_composition_label"], 
               prefix='gcl').reindex(columns=gcl_cols, fill_value=0)],
          axis=1)

In [21]:
df_test.shape

(59548, 32)

#### housing_situation_label : one hot encoding

In [22]:
df_add = pd.get_dummies(df_train["housing_situation_label"], 
               prefix='hs', drop_first=True)
list_feat = list_feat +  df_add.columns.tolist()

In [23]:
df_add

Unnamed: 0,hs_detoxification center,hs_emergency structure,hs_hotel paid by an association,hs_hotel paid by the emergency centre,hs_hotel paid by the household,hs_hotel paid by the regional administration,hs_inclusion structure,hs_mobile or makeshift shelter,hs_other,hs_parental home,hs_personal or marital home,hs_police station,hs_prison,hs_psychiatric hospital,hs_public hospital,hs_refused to answer,"hs_religious place (church, mosque, synogogue)",hs_shelters,hs_stabilisation structure,hs_street
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238186,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
238187,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
238188,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
238189,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
df_tmp = pd.concat([df_train, df_add], axis=1)

In [25]:
df_train.shape

(238191, 32)

In [26]:
df_tmp.shape

(238191, 52)

In [27]:
df_train = df_tmp.copy()

In [28]:
df_train["housing_situation_label"].nunique()

21

In [29]:
df_test["housing_situation_label"].nunique()

21

In [30]:
# Encode the test set on exactly the 'hs_' dummy columns already present in
# df_train (where drop_first was applied) : a category absent from the test
# data becomes an all-zero column and a category unseen in train is dropped,
# so both frames always expose the same encoded columns.
hs_cols = [col for col in df_train.columns if col.startswith('hs_')]
df_test = pd.concat([df_test, 
          pd.get_dummies(df_test["housing_situation_label"], 
               prefix='hs').reindex(columns=hs_cols, fill_value=0)],
          axis=1)

In [31]:
df_test.shape

(59548, 52)

#### long_term_housing_request : one hot encoding

We replace unknown by NaN

In [32]:
df_train["long_term_housing_request"].value_counts()

unknown    165556
f           60386
t           12249
Name: long_term_housing_request, dtype: int64

In [33]:
df_train.loc[df_train["long_term_housing_request"] == "unknown", 
   "long_term_housing_request"] = np.nan

In [34]:
df_train["long_term_housing_request"].value_counts(dropna=False)

NaN    165556
f       60386
t       12249
Name: long_term_housing_request, dtype: int64

In [35]:
df_add = pd.get_dummies(df_train["long_term_housing_request"], 
               prefix='lts_req')
df_add

Unnamed: 0,lts_req_f,lts_req_t
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
238186,1,0
238187,0,0
238188,0,0
238189,1,0


In [36]:
list_feat = list_feat +  df_add.columns.tolist()

In [37]:
df_tmp = pd.concat([df_train, df_add], axis=1)

In [38]:
df_train.shape

(238191, 52)

In [39]:
df_tmp.shape

(238191, 54)

In [40]:
df_train = df_tmp.copy()

In [41]:
df_train.sample(5)

Unnamed: 0,granted_number_of_nights,animal_presence,group_composition_label,housing_situation_label,long_term_housing_request,requester_type,victim_of_violence_type,pregnancy,region,childcare_center_supervision,...,hs_prison,hs_psychiatric hospital,hs_public hospital,hs_refused to answer,"hs_religious place (church, mosque, synogogue)",hs_shelters,hs_stabilisation structure,hs_street,lts_req_f,lts_req_t
45046,0,f,man alone,street,f,user,f,f,Hauts-de-France,f,...,0,0,0,0,0,0,0,1,1,0
79973,2,f,woman alone,accomodation by a third party,,user,f,f,Hauts-de-France,f,...,0,0,0,0,0,0,0,0,0,0
200132,2,f,group of adults,street,t,user,f,f,Île-de-France,f,...,0,0,0,0,0,0,0,1,0,1
39414,1,f,man alone,emergency structure,f,user,f,f,Auvergne-Rhône-Alpes,f,...,0,0,0,0,0,0,0,0,1,0
15860,1,f,couple without whildren,accomodation by a third party,,user,unknown,f,Centre-Val de Loire,f,...,0,0,0,0,0,0,0,0,0,0


In [42]:
df_test.loc[df_test["long_term_housing_request"] == "unknown", 
   "long_term_housing_request"] = np.nan

In [43]:
# Encode the test set on exactly the 'lts_req_' dummy columns already present
# in df_train : a category absent from the test data becomes an all-zero
# column and a category unseen in train is dropped, so both frames always
# expose the same encoded columns.
lts_req_cols = [col for col in df_train.columns
                if col.startswith('lts_req_')]
df_test = pd.concat([df_test, 
          pd.get_dummies(df_test["long_term_housing_request"], 
               prefix='lts_req').reindex(columns=lts_req_cols,
                                         fill_value=0)],
          axis=1)

In [44]:
df_test

Unnamed: 0,granted_number_of_nights,animal_presence,group_composition_label,housing_situation_label,long_term_housing_request,requester_type,victim_of_violence_type,pregnancy,region,childcare_center_supervision,...,hs_prison,hs_psychiatric hospital,hs_public hospital,hs_refused to answer,"hs_religious place (church, mosque, synogogue)",hs_shelters,hs_stabilisation structure,hs_street,lts_req_f,lts_req_t
0,0,f,single mother with child(ren),street,,user,f,f,Hauts-de-France,f,...,0,0,0,0,0,0,0,1,0,0
1,0,f,man alone,street,,user,f,f,Auvergne-Rhône-Alpes,f,...,0,0,0,0,0,0,0,1,0,0
2,0,f,woman alone,public hospital,f,user,f,f,Occitanie,f,...,0,0,1,0,0,0,0,0,1,0
3,0,f,man alone,street,,user,f,f,Île-de-France,f,...,0,0,0,0,0,0,0,1,0,0
4,0,f,man alone,street,,user,f,f,Provence-Alpes-Côte d'Azur,f,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59543,1,f,couple with child(ren),emergency structure,,user,f,f,Nouvelle-Aquitaine,f,...,0,0,0,0,0,0,0,0,0,0
59544,1,f,single mother with child(ren),emergency structure,,user,f,f,Pays de la Loire,f,...,0,0,0,0,0,0,0,0,0,0
59545,2,f,man alone,emergency structure,,user,f,f,Île-de-France,f,...,0,0,0,0,0,0,0,0,0,0
59546,1,f,single mother with child(ren),accomodation by a third party,,user,f,f,Île-de-France,f,...,0,0,0,0,0,0,0,0,0,0


In [45]:
df_test.shape

(59548, 54)

#### requester_type : one hot encoding

In [46]:
df_add = pd.get_dummies(df_train["requester_type"], 
               prefix='rt', drop_first=True)
list_feat = list_feat +  df_add.columns.tolist()

In [47]:
df_add

Unnamed: 0,rt_third party,rt_user
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
238186,0,1
238187,0,1
238188,0,1
238189,0,1


In [48]:
df_tmp = pd.concat([df_train, df_add], axis=1)

In [49]:
df_train.shape

(238191, 54)

In [50]:
df_tmp.shape

(238191, 56)

In [51]:
df_train = df_tmp.copy()

In [52]:
df_train.head()

Unnamed: 0,granted_number_of_nights,animal_presence,group_composition_label,housing_situation_label,long_term_housing_request,requester_type,victim_of_violence_type,pregnancy,region,childcare_center_supervision,...,hs_public hospital,hs_refused to answer,"hs_religious place (church, mosque, synogogue)",hs_shelters,hs_stabilisation structure,hs_street,lts_req_f,lts_req_t,rt_third party,rt_user
0,1,f,man alone,street,,third party,f,f,Provence-Alpes-Côte d'Azur,f,...,0,0,0,0,0,1,0,0,1,0
1,1,f,man alone,street,,user,f,f,Provence-Alpes-Côte d'Azur,f,...,0,0,0,0,0,1,0,0,0,1
2,1,f,isolated child/underage,street,,user,f,f,Grand Est,f,...,0,0,0,0,0,1,0,0,0,1
3,2,f,woman alone,street,,user,woman,f,Bretagne,f,...,0,0,0,0,0,1,0,0,0,1
4,2,f,group with child(ren),hotel paid by the emergency centre,,user,f,f,Île-de-France,f,...,0,0,0,0,0,0,0,0,0,1


In [53]:
# Encode the test set on exactly the 'rt_' dummy columns already present in
# df_train (where drop_first was applied) : a category absent from the test
# data becomes an all-zero column and a category unseen in train is dropped,
# so both frames always expose the same encoded columns.
rt_cols = [col for col in df_train.columns if col.startswith('rt_')]
df_test = pd.concat([df_test, 
          pd.get_dummies(df_test["requester_type"], 
               prefix='rt').reindex(columns=rt_cols, fill_value=0)],
          axis=1)

In [54]:
df_test.shape

(59548, 56)

#### victim_of_violence_type

In [55]:
df_train.loc[df_train["victim_of_violence_type"] == "f", 
   "victim_of_violence_type"] = np.nan

In [56]:
df_train["victim_of_violence_type"].nunique()

7

In [57]:
df_test.loc[df_test["victim_of_violence_type"] == "f", 
   "victim_of_violence_type"] = np.nan

In [58]:
df_test["victim_of_violence_type"].nunique()

7

In [59]:
df_add = pd.get_dummies(df_train["victim_of_violence_type"], 
               prefix='vvt')
df_add

Unnamed: 0,vvt_child,vvt_family,vvt_man,vvt_man victim of domestic violence,vvt_unknown,vvt_woman,vvt_woman victim of domestic violence
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
238186,0,0,0,0,0,0,0
238187,0,0,0,0,0,0,0
238188,0,0,0,0,0,0,0
238189,0,0,0,0,0,0,0


In [60]:
list_feat = list_feat +  df_add.columns.tolist()

In [61]:
df_tmp = pd.concat([df_train, df_add], axis=1)

In [62]:
df_train.shape

(238191, 56)

In [63]:
df_tmp.shape

(238191, 63)

In [64]:
df_train = df_tmp.copy()

In [65]:
# Encode the test set on exactly the 'vvt_' dummy columns already present in
# df_train : a category absent from the test data becomes an all-zero column
# and a category unseen in train is dropped, so both frames always expose
# the same encoded columns.
vvt_cols = [col for col in df_train.columns if col.startswith('vvt_')]
df_test = pd.concat([df_test, 
          pd.get_dummies(df_test["victim_of_violence_type"], 
               prefix='vvt').reindex(columns=vvt_cols, fill_value=0)],
          axis=1)

In [66]:
df_test.shape

(59548, 63)

#### pregnancy

In [67]:
df_train['pregnancy_bol'] = \
    df_train['pregnancy'].apply(fun_str_to_bol)
df_test['pregnancy_bol'] = \
    df_test['pregnancy'].apply(fun_str_to_bol)

list_feat.append('pregnancy_bol')

In [68]:
df_train.shape

(238191, 64)

In [69]:
df_test.shape

(59548, 64)

#### region : one hot encoding

In [70]:
df_train.loc[df_train["region"] == "unknown", "region"] = np.nan

In [71]:
df_test.loc[df_test["region"] == "unknown", "region"] = np.nan

In [72]:
df_train["region"].nunique()

16

In [73]:
df_test["region"].nunique()

14

In [74]:
df_add = pd.get_dummies(df_train["region"], 
               prefix='reg')
df_add

Unnamed: 0,reg_Auvergne-Rhône-Alpes,reg_Bourgogne-Franche-Comté,reg_Bretagne,reg_Centre-Val de Loire,reg_Corse,reg_Grand Est,reg_Guadeloupe,reg_Guyane,reg_Hauts-de-France,reg_Martinique,reg_Normandie,reg_Nouvelle-Aquitaine,reg_Occitanie,reg_Pays de la Loire,reg_Provence-Alpes-Côte d'Azur,reg_Île-de-France
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238186,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
238187,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
238188,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
238189,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [75]:
list_region = df_add.columns.tolist()
list_feat = list_feat +  list_region

In [76]:
df_tmp = pd.concat([df_train, df_add], axis=1)

In [77]:
df_train.shape

(238191, 64)

In [78]:
df_tmp.shape

(238191, 80)

In [79]:
df_train = df_tmp.copy()

For the test set, we must encode the same 16 regions as in the training set, because only 14 of them appear in the test data.

In [80]:
# Regions seen in the training set, in order of first appearance.
# dropna() removes the missing value explicitly : an identity test against
# np.nan only works when the stored NaN is the np.nan object itself.
list_reg = df_train["region"].dropna().unique().tolist()
list_reg

["Provence-Alpes-Côte d'Azur",
 'Grand Est',
 'Bretagne',
 'Île-de-France',
 'Pays de la Loire',
 'Auvergne-Rhône-Alpes',
 'Hauts-de-France',
 'Normandie',
 'Nouvelle-Aquitaine',
 'Bourgogne-Franche-Comté',
 'Occitanie',
 'Centre-Val de Loire',
 'Guyane',
 'Martinique',
 'Guadeloupe',
 'Corse']

In [81]:
len(list_reg)

16

In [82]:
# Work on a copy so that the 'region' column of df_test itself is not
# converted to a categorical.
df_tmp = df_test.copy()

# Declaring the training regions (list_reg) as the categories makes
# get_dummies emit one column per training region, including the regions
# that never occur in the test data.
df_tmp["region"] = pd.Categorical(df_tmp["region"],
                                    categories=list_reg)
df_add = pd.get_dummies(df_tmp["region"], prefix='reg')

In [83]:
df_add

Unnamed: 0,reg_Provence-Alpes-Côte d'Azur,reg_Grand Est,reg_Bretagne,reg_Île-de-France,reg_Pays de la Loire,reg_Auvergne-Rhône-Alpes,reg_Hauts-de-France,reg_Normandie,reg_Nouvelle-Aquitaine,reg_Bourgogne-Franche-Comté,reg_Occitanie,reg_Centre-Val de Loire,reg_Guyane,reg_Martinique,reg_Guadeloupe,reg_Corse
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59543,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
59544,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
59545,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
59546,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [84]:
df_add.shape

(59548, 16)

In [85]:
df_tmp = pd.concat([df_test, df_add], axis=1)

In [86]:
df_test.shape

(59548, 64)

In [87]:
df_tmp.shape

(59548, 80)

In [88]:
df_test = df_tmp.copy()

#### childcare_center_supervision

In [89]:
df_train['childcare_center_supervision_bol'] = \
    df_train['childcare_center_supervision'].apply(fun_str_to_bol)
df_test['childcare_center_supervision_bol'] = \
    df_test['childcare_center_supervision'].apply(fun_str_to_bol)

list_feat.append('childcare_center_supervision_bol')

In [90]:
df_train.shape

(238191, 81)

In [91]:
df_test.shape

(59548, 81)

#### disabled_worker_certification

In [92]:
df_train['disabled_worker_certification_bol'] = \
    df_train['disabled_worker_certification'].apply(fun_str_to_bol)
df_test['disabled_worker_certification_bol'] = \
    df_test['disabled_worker_certification'].apply(fun_str_to_bol)

list_feat.append('disabled_worker_certification_bol')

In [93]:
df_train.shape

(238191, 82)

In [94]:
df_test.shape

(59548, 82)

#### married

In [95]:
df_train['married_bol'] = \
    df_train['married'].apply(fun_str_to_bol)
df_test['married_bol'] = \
    df_test['married'].apply(fun_str_to_bol)

list_feat.append('married_bol')

In [96]:
df_train.shape

(238191, 83)

In [97]:
df_test.shape

(59548, 83)

In [98]:
list_feat

['nb_underage_red',
 'age_min',
 'age_max',
 'nb_female',
 'nb_male',
 'request_creation_day_num',
 'request_creation_month',
 'child_situation_red',
 'animal_presence_bol',
 'gcl_child/underage with family',
 'gcl_couple with child(ren)',
 'gcl_couple without whildren',
 'gcl_group of adults',
 'gcl_group with child(ren)',
 'gcl_isolated child/underage',
 'gcl_man alone',
 'gcl_single father with child(ren)',
 'gcl_single mother with child(ren)',
 'gcl_woman alone',
 'gcl_women victim of violence',
 'hs_detoxification center',
 'hs_emergency structure',
 'hs_hotel paid by an association',
 'hs_hotel paid by the emergency centre',
 'hs_hotel paid by the household',
 'hs_hotel paid by the regional administration',
 'hs_inclusion structure',
 'hs_mobile or makeshift shelter',
 'hs_other',
 'hs_parental home',
 'hs_personal or marital home',
 'hs_police station',
 'hs_prison',
 'hs_psychiatric hospital',
 'hs_public hospital',
 'hs_refused to answer',
 'hs_religious place (church, mosque,

In [99]:
len(list_feat)

71

In [100]:
df_test.columns.tolist()

['granted_number_of_nights',
 'animal_presence',
 'group_composition_label',
 'housing_situation_label',
 'long_term_housing_request',
 'requester_type',
 'victim_of_violence_type',
 'pregnancy',
 'region',
 'childcare_center_supervision',
 'nb_underage_red',
 'age_min',
 'age_max',
 'disabled_worker_certification',
 'nb_female',
 'nb_male',
 'married',
 'request_creation_day_num',
 'request_creation_month',
 'child_situation_red',
 'animal_presence_bol',
 'gcl_child/underage with family',
 'gcl_couple with child(ren)',
 'gcl_couple without whildren',
 'gcl_group of adults',
 'gcl_group with child(ren)',
 'gcl_isolated child/underage',
 'gcl_man alone',
 'gcl_single father with child(ren)',
 'gcl_single mother with child(ren)',
 'gcl_woman alone',
 'gcl_women victim of violence',
 'hs_detoxification center',
 'hs_emergency structure',
 'hs_hotel paid by an association',
 'hs_hotel paid by the emergency centre',
 'hs_hotel paid by the household',
 'hs_hotel paid by the regional administ

In [101]:
df_train.columns.tolist()

['granted_number_of_nights',
 'animal_presence',
 'group_composition_label',
 'housing_situation_label',
 'long_term_housing_request',
 'requester_type',
 'victim_of_violence_type',
 'pregnancy',
 'region',
 'childcare_center_supervision',
 'nb_underage_red',
 'age_min',
 'age_max',
 'disabled_worker_certification',
 'nb_female',
 'nb_male',
 'married',
 'request_creation_day_num',
 'request_creation_month',
 'child_situation_red',
 'animal_presence_bol',
 'gcl_child/underage with family',
 'gcl_couple with child(ren)',
 'gcl_couple without whildren',
 'gcl_group of adults',
 'gcl_group with child(ren)',
 'gcl_isolated child/underage',
 'gcl_man alone',
 'gcl_single father with child(ren)',
 'gcl_single mother with child(ren)',
 'gcl_woman alone',
 'gcl_women victim of violence',
 'hs_detoxification center',
 'hs_emergency structure',
 'hs_hotel paid by an association',
 'hs_hotel paid by the emergency centre',
 'hs_hotel paid by the household',
 'hs_hotel paid by the regional administ

In [102]:
df_train.shape

(238191, 83)

In [103]:
df_test.shape

(59548, 83)

#### Save

In [104]:
df_train.to_csv(path_or_buf=PATH_FS1_TRAIN, index=False)

In [105]:
df_test.to_csv(path_or_buf=PATH_FS1_TEST, index=False)

## Models

### FS1

#### Features preparation

In [111]:
# Strict selection by label : a feature of list_feat missing from df_train
# raises a KeyError instead of being silently dropped (as filter would do),
# so the columns of the matrix always follow list_feat.
X_train = df_train[list_feat].values

In [112]:
X_train.shape

(238191, 71)

In [113]:
# Strict selection by label : a feature of list_feat missing from df_test
# raises a KeyError instead of being silently dropped (as filter would do),
# which would shift the columns relative to the training matrix.
X_test = df_test[list_feat].values

In [114]:
X_test.shape

(59548, 71)

In [115]:
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_n = std_scale.transform(X_train)
X_test_n = std_scale.transform(X_test)

In [116]:
y_train = df_train["granted_number_of_nights"]
y_test = df_test["granted_number_of_nights"]

In [123]:
# Class k gets the weight 10**k, the same weighting as the one applied to
# the samples by competition_scorer.
class_weight = {label: 10**label for label in range(4)}

#### Random Forest

In [145]:
rfc_fs1 = RandomForestClassifier(n_estimators=300, random_state=0, 
                                 class_weight=class_weight, n_jobs=-1)
# fitting
t_0 = time.time()
rfc_fs1.fit(X_train_n, y_train)
t_1 = time.time()
print(rfc_fs1)
print("timing total: {} s".format(t_1-t_0))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 10, 2: 100, 3: 1000},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
timing total: 120.3341166973114 s


In [146]:
print("Train Score : ", 
      rfc_fs1.score(X_train_n, y_train))
print("Test Score : ",
      rfc_fs1.score(X_test_n, y_test))

Train Score :  0.7795802528223149
Test Score :  0.6083159803855713


In [147]:
prob_Y_test = rfc_fs1.predict_proba(X_test_n)

In [148]:
rfc_fs1_score = competition_scorer(y_test, prob_Y_test)
print(f'test score : {rfc_fs1_score}')

test score : 5.0783347614577705


In [137]:
# Random baseline : one row of 4 class probabilities per test sample.
# The generator is seeded so that the score is reproducible, and each row is
# normalised to sum to 1 so that it is a valid probability distribution.
rng = np.random.RandomState(0)
random_preds = rng.uniform(size=(len(y_test), 4))
random_preds = random_preds / random_preds.sum(axis=1, keepdims=True)
random_score = competition_scorer(y_test, random_preds)
print(f'test score: {random_score}')

test score: 1.662267758589146


In [118]:
'''from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)

print(clf.predict([[0, 0, 0, 0]]))'''

[1]


#### LogisticRegression

In [150]:
logreg_fs1 = LogisticRegression(random_state=0, 
                                class_weight=class_weight, n_jobs=-1)
# fitting
t_0 = time.time()
logreg_fs1.fit(X_train_n, y_train)
t_1 = time.time()
print(logreg_fs1)
print("timing total: {} s".format(t_1-t_0))

LogisticRegression(C=1.0, class_weight={0: 1, 1: 10, 2: 100, 3: 1000},
                   dual=False, fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=-1,
                   penalty='l2', random_state=0, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)
timing total: 34.27102518081665 s


In [151]:
print("Train Score : ", 
      logreg_fs1.score(X_train_n, y_train))
print("Test Score : ",
      logreg_fs1.score(X_test_n, y_test))

Train Score :  0.15231893732340854
Test Score :  0.15046685027204945


In [152]:
prob_Y_test = logreg_fs1.predict_proba(X_test_n)

In [153]:
logreg_fs1_score = competition_scorer(y_test, prob_Y_test)
print(f'test score : {logreg_fs1_score}')

test score : 0.8958611094624978


In [119]:
from sklearn.metrics import log_loss
log_loss(["spam", "ham", "ham", "spam"], 
         [[.1, .9], [.9, .1], [.8, .2], [.35, .65]], labels=["ham","spam"])

log_loss(["spam", "ham", "ham", "spam"], 
         [[0, 1], [1, 0], [1, 0], [1, 0]], labels=["ham","spam"])

8.634694098727671