# 1. SETTINGS

In [None]:
# import packages
import pandas as pd
import numpy as np
import scipy.stats
import os

In [None]:
# pandas options: lift the cap on how many columns are shown when a frame is displayed
pd.options.display.max_columns = None

In [None]:
# ignore warnings: install a blanket "ignore" filter for every warning category
import warnings
warnings.simplefilter('ignore')

In [None]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

In [None]:
# helper functions
import functions
from functions import *

# 2. IMPORT

In [None]:
# import data
# Malformed CSV rows are skipped rather than aborting the load. pandas 1.3 introduced
# on_bad_lines = 'skip' for this and pandas 2.0 removed the older error_bad_lines flag,
# so pick whichever spelling the installed pandas understands.
read_opts = dict(sep = ',', low_memory = False)
if tuple(int(part) for part in pd.__version__.split('.')[:2]) >= (1, 3):
    read_opts['on_bad_lines'] = 'skip'
else:
    read_opts['error_bad_lines'] = False

train = pd.read_csv('../data/train_individuals.csv', **read_opts)
test  = pd.read_csv('../data/test_individuals.csv',  **read_opts)

# check dimensions
print(train.shape)
print(test.shape)

In [None]:
# check data: preview the first rows of the training set
train.head()

In [None]:
# check data: preview the first rows of the test set
test.head()

# 3. MERGER

In [None]:
# align columns: put both frames into the same alphabetical column order
# (DataFrame.reindex_axis was removed in pandas 1.0; reindex(columns = ...) does the
# same job and is also available in older releases)
train = train.reindex(columns = sorted(train.columns))
test  = test.reindex(columns = sorted(test.columns))

# check equality (element-wise comparison of the two column indexes)
train.columns == test.columns

In [None]:
# concatenate: stack train on top of test and renumber the rows from 0
df = pd.concat((train, test), axis = 0).reset_index(drop = True)

# the separate frames are no longer needed once they live in df
del train
del test

print(df.shape)

In [None]:
# check distributions: summary statistics of the combined train + test frame
df.describe()

# 4. PROCESSING

### DROP IRRELEVANT FEATURES

In [None]:
# remove columns with a single value
# (NaN is counted as a value of its own, so an all-missing column is dropped as well)
print(df.shape)
df = df.loc[:, df.nunique(dropna = False).ne(1)]
print(df.shape)

In [None]:
# remove irrelevant columns (listed by hand in drops)
print(df.shape)
drops = ['birth_month']
df = df.drop(drops, axis = 1)
print(df.shape)

### MISSING VALUES

In [None]:
# check missings
# count_missings is a project helper brought in by the star import from functions.
# NOTE(review): presumably it reports the number/share of missing values per column — confirm in functions.
count_missings(df)

In [None]:
# impute NAs with 0 in the columns listed in nas (currently none)
# The filled column is assigned back explicitly: calling fillna(inplace = True) on
# df[var] acts on a temporary Series, which pandas does not guarantee to write
# through to df (and never does under Copy-on-Write).
nas = []
for var in nas:
    df[var] = df[var].fillna(0)

In [None]:
# impute missings with means
#target = df['duration'].copy()
#df = df.replace([np.inf, -np.inf], np.nan)
#means = df.mean(axis = 0)
#df.fillna(means, inplace = True)
#df['duration'] = target

### VARIABLE TYPES

In [None]:
# check data types: show the dtype of every column in df
df.dtypes

In [None]:
# check value counts: for every object-typed column, print how many distinct
# values it has and its three most frequent values
facs = [name for name, dtype in df.dtypes.items() if dtype == "object"]
rule = '--------------------------------'
for fac in facs:
    header     = fac + ': ' + str(df[fac].nunique()) + ' unique values'
    top_values = df[fac].value_counts().head(3)
    # one print call, one item per line, ending with a blank separator line
    print(rule, header, rule, top_values, rule, '', sep = '\n')

In [None]:
# convert to integers: replace each listed column by its factorize() codes (none listed)
to_int = []
for col in to_int:
    df[col] = pd.factorize(df[col])[0]


# convert to strings: cast the ID-like columns to object dtype
# (the values themselves are not stringified, only the column dtype changes)
to_str = ['housing_situation_2_id', 'individual_role_2_id', 'marital_status_id', 'individual_id', 'request_id']
df = df.astype(dict.fromkeys(to_str, 'object'))


# convert to dates: go through str first, then let pandas parse the timestamps
dates = ['individual_creation_date']
for var in dates:
    as_text = df[var].astype('str')
    df[var] = pd.to_datetime(as_text, infer_datetime_format = True)

### AGGREGATIONS

In [None]:
%%time

# aggregated data: summarise the individual-level rows by request_id through the
# project helper aggregate_data (the exact output layout is defined by that helper)
group_key    = 'request_id'
numeric_aggs = ['mean', 'std', 'sum']
factor_cols  = ['gender', 'pregnancy', 'childcare_center_supervision',
                'disabled_worker_certification', 'individual_role', 'individual_role_2_label',
                'marital_status_label', 'housing_situation_2_label']

df_agg = aggregate_data(df,
                        group_var = group_key,
                        num_stats = numeric_aggs,
                        factors   = factor_cols)

In [None]:
# check data: preview the first rows of the aggregated frame
df_agg.head()

In [None]:
# remove columns with a single value
# (NaN is counted as a value of its own, so an all-missing column is dropped as well)
print(df_agg.shape)
is_informative = df_agg.nunique(dropna = False).ne(1)
df_agg = df_agg.loc[:, is_informative]
print(df_agg.shape)

# 5. WORKING WITH FEATURES

### EXTRACT FEATURES FROM TEXT

In [None]:
# compute features
# text_vars is empty here, so no column names are handed to add_text_features
# (a project helper brought in by the star import from functions).
# NOTE(review): the meaning of k and keep is defined by that helper — confirm there.
text_vars = []
print(df_agg.shape)
df_agg = add_text_features(df_agg, strings = text_vars, k = 5, keep = False)
print(df_agg.shape)

### CREATE NEW FEATURES

### ENCODE FACTORS

In [None]:
# convert categorical features
# encode_factors is a project helper (star-imported from functions); it is called with
# method = 'label' and told to skip 'request_id'.
# NOTE(review): presumably it label-encodes the remaining object columns and leaves
# the grouping key untouched — confirm in functions.
df_agg = encode_factors(df_agg, method = 'label', skip = ['request_id'])
df_agg.shape

# 6. EXPORT

In [None]:
# export CSV: write the aggregated frame to disk without the row index
out_path = '../data/data_group.csv'
df_agg.to_csv(out_path, index = False)

# final dimensions of the exported data
df_agg.shape