# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import scipy.stats
import os

In [None]:
# pandas options: None lifts the limit on how many columns are displayed
pd.options.display.max_columns = None

In [None]:
# ignore warnings: install a blanket "ignore" filter for every category
import warnings
warnings.simplefilter('ignore')

In [None]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

In [None]:
# helper functions
import functions
from functions import *

# 2. IMPORT

In [None]:
# Train and test samples
# Malformed CSV lines are skipped rather than aborting the import.
# pandas 1.3 introduced `on_bad_lines='skip'` as the replacement for
# `error_bad_lines=False`, and pandas 2.0 removed the old keyword, so use
# whichever keyword the installed read_csv actually accepts.
import inspect
if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
    bad_line_opts = {'on_bad_lines': 'skip'}
else:
    bad_line_opts = {'error_bad_lines': False}

train = pd.read_csv('../data/train_requests.csv', sep=',', low_memory=False, **bad_line_opts)
test  = pd.read_csv('../data/test_requests.csv', sep=',', low_memory=False, **bad_line_opts)

# target column taken from the training sample
y = train['granted_number_of_nights']

# quick sanity check of the loaded dimensions
print(train.shape)
print(y.shape)
print(test.shape)

In [None]:
# check data: show the first rows of the training sample
train.head()

In [None]:
# check data: show the first rows of the test sample
test.head()

# 3. MERGER

In [None]:
# target variable: add the column to the test sample, filled with NaN,
# so that train and test expose the same set of columns
test['granted_number_of_nights'] = float('nan')

In [None]:
# align columns: put both frames into the same alphabetical column order
# `DataFrame.reindex_axis` was deprecated in pandas 0.21 and removed in 1.0;
# `reindex(columns = ...)` is the supported way to reorder columns.
train = train.reindex(columns = sorted(train.columns))
test  = test.reindex(columns = sorted(test.columns))

# check equality
# `Index.equals` returns a single bool and, unlike an element-wise `==`,
# does not raise when the two frames have a different number of columns.
train.columns.equals(test.columns)

In [None]:
# concatenate: stack train on top of test and renumber the rows from 0
df = pd.concat([train, test], axis = 0, ignore_index = True)
# the separate frames are no longer needed once merged
del train
del test
print(df.shape)

In [None]:
# check distributions: summary statistics of the merged train + test frame
df.describe()

# 4. PROCESSING

### DROP IRRELEVANT FEATURES

In [None]:
# remove columns with a single value
# (dropna = False makes NaN count as a value of its own)
print(df.shape)
distinct_counts = df.nunique(dropna = False)
df = df.loc[:, distinct_counts != 1]
print(df.shape)

In [None]:
# remove irrelevant columns (shape is printed before and after the removal)
print(df.shape)
drops = ['group_main_requester_id', 'request_backoffice_creator_id']
df.drop(labels = drops, axis = 1, inplace = True)
print(df.shape)

### MISSING VALUES

In [None]:
# check missings
# NOTE(review): count_missings is not defined in this notebook; presumably it
# comes from the star-imported `functions` module and reports missing values
# per column — confirm against that helper
count_missings(df)

In [None]:
# impute NAs with 0
# `nas` lists the columns to fill; while it is empty the loop does nothing
nas = []
for var in nas:
    # assign the filled column back: calling fillna(..., inplace = True) on
    # `df[var]` acts on an intermediate Series (chained assignment) and is
    # not guaranteed to update `df` itself
    df[var] = df[var].fillna(0)

In [None]:
# impute missings with means
#target = df['duration'].copy()
#df = df.replace([np.inf, -np.inf], np.nan)
#means = df.mean(axis = 0)
#df.fillna(means, inplace = True)
#df['duration'] = target

### VARIABLE TYPES

In [None]:
# check data types: list the dtype of every column in the merged frame
df.dtypes

In [None]:
# convert to integers: each flag column is replaced by its pd.factorize codes
to_int = ['animal_presence', 'child_to_come', 'long_term_housing_request', 'victim_of_violence']
for flag_col in to_int:
    df[flag_col] = pd.factorize(df[flag_col])[0]


# convert to strings: the ID columns are cast to the generic object dtype
to_str = ['request_id', 'group_composition_id', 'housing_situation_id', 'social_situation_id']
for id_col in to_str:
    df[id_col] = df[id_col].astype(object)


# convert to dates: stringify each column first, then parse it as datetimes
dates = ['answer_creation_date', 'group_creation_date', 'request_creation_date']
for date_col in dates:
    as_text = df[date_col].astype(str)
    df[date_col] = pd.to_datetime(as_text, infer_datetime_format = True)

### AGGREGATIONS

In [None]:
# aggregated data: summary statistics computed by the aggregate_data helper
# NOTE(review): `train_ind` is not defined in any cell visible in this
# notebook (and `train` is deleted after the concatenation step) — confirm
# which frame is meant to be aggregated here
# NOTE(review): aggregate_data is presumably provided by the star-imported
# `functions` module; the meaning of its arguments should be checked there
stats = ['mean', 'std', 'min', 'max']
# copy of the request IDs; not used again in the visible cells
train_ind_id = train_ind['request_id'].copy()
df_agg = aggregate_data(train_ind, group_var = 'individual_id', num_stats = stats, var_label = 'group')
df_agg.head()

In [None]:
# remove target aggregations
# one aggregated column per (variable, statistic) pair is deleted, using the
# statistic names listed in `stats`
drops = ['housing_situation_id', 'district']
agg_cols = ['group_id_' + var + '_' + s for var in drops for s in stats]
for agg_col in agg_cols:
    del df_agg[agg_col]
df_agg.shape

In [None]:
# merge to the data: left join of the aggregated features onto df
# NOTE(review): the join key 'category' is not referenced anywhere else in
# the visible cells — confirm it exists in both frames
print(df.shape)
df = pd.merge(df, df_agg, how = 'left', on = 'category')
print(df.shape)

# 5. WORKING WITH FEATURES

### EXTRACT FEATURES FROM TEXT

In [None]:
# compute features from the two free-text label columns
# NOTE(review): add_text_features is not defined in this notebook (presumably
# provided by the star-imported `functions` module); the meaning of `k` and
# `keep` should be confirmed against that helper
text_vars = ['group_composition_label', 'housing_situation_label']
# shape is printed before and after to show how the frame changed
print(df.shape)
df = add_text_features(df, strings = text_vars, k = 5, keep = False)
print(df.shape)

### CREATE NEW FEATURES

In [None]:
# time gaps between the three creation timestamps
# Every pair of dates gets its own column (daydif_1, daydif_2, daydif_3) so
# that a later gap does not overwrite an earlier one.
# NOTE: despite the 'daydif' name the values are expressed in milliseconds.
dates = ['answer_creation_date', 'group_creation_date', 'request_creation_date']
date_pairs = [(0, 1), (0, 2), (1, 2)]
for k, (a, b) in enumerate(date_pairs, start = 1):
    gap = df[dates[a]] - df[dates[b]]
    # total_seconds() gives NaN for missing dates, whereas casting the
    # timedelta to an integer has no valid representation for NaT
    df['daydif_' + str(k)] = gap.dt.total_seconds() * 1000

### ENCODE FACTORS

In [None]:
# convert categorical features
# request_id is copied aside before the encoding and written back afterwards,
# so the identifier column keeps its original values
# NOTE(review): encode_factors is presumably provided by the star-imported
# `functions` module; what method = 'label' does should be confirmed there
ids = df['request_id'].copy()
df = encode_factors(df, method = 'label')
df['request_id'] = ids 
df.shape

# 6. EXPORT

In [None]:
# export CSV
# The processed frame is `df`; no `df2` is created in the visible cells, so
# referring to it would fail with a NameError.
df.to_csv('../data/data_v3.csv', index = False)
df.shape