In [20]:
import os
import sys
import numpy as np
import pandas as pd

In [21]:
import logging
import logging.handlers
# Configure the notebook logger once. Handlers stay attached to the logger
# object for the lifetime of the kernel, so re-running this cell must not add
# a second StreamHandler (every message would then be printed twice).
logger = logging.getLogger('feature_creator')
# Inspect the handlers of *this logger*. `logging.handlers` is a module and is
# always truthy, so testing it would skip the setup unconditionally.
if not logger.handlers:
    logger.setLevel(logging.DEBUG)
    sh = logging.StreamHandler(sys.stdout)  # for jupyter notebook
    sh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('[%(levelname)-8s] %(asctime)s [%(filename)s] [%(funcName)s:%(lineno)d] %(message)s', '%Y-%m-%d %H:%M:%S')
    sh.setFormatter(formatter)
    logger.addHandler(sh)

## Constants

In [22]:
# Labels of the three summary types a feature set can be built for.
TRAIN, VALIDATION, TEST = 'train', 'validation', 'test'

## Configurations

In [23]:
# Which summary is built: change the index to pick training (0),
# validation (1) or testing (2).
summary_type = (TRAIN, VALIDATION, TEST)[0]

# Set to True to process only part of the data.
process_fraction = False

# Months before the test period for which train features are built (0 is the
# default). Every listed month is considered.
train_back_periods = range(15, -1, -1)  # 15 down to 0; with 0, train === validation
save_by_month = True  # always True going forward
max_records_save_batch = 2e6


# Optional cap that limits the summary period to a fixed window of months.
max_summary_months = np.inf  # always inf going forward
# 16 down to 1; if not save_by_month, the first entry is considered as the lag.
test_summary_months = range(16, 0, -1)
validation_back_months = 10  # 0 is default (validate May 16). 11 is Jun 15
# Default: 15..1 for May 16; 14..1 for Apr 16, ...
validation_summary_months = range(15 - validation_back_months, 0, -1)

### Configurations - file path

In [24]:
# Output features are written below feature_dir_root, in a folder named after
# the feature version.
version = '2017-04-29'
feature_dir_root = '../../feature/'
feature_dir = os.path.join(feature_dir_root, version)

# Input locations, relative to the notebook.
data_dir = '../../rsc/data/'
feature_eng_dir = '../../rsc/feature_engineering'

### Configurations - less used settings

In [25]:
# Tag that identifies which data fraction is processed; it is also part of the
# input file names and of the output folder name.
fraction_type = ['random', 'order'][1]
if process_fraction:
    fraction_flag = '%s_%s' % ('small', fraction_type)
else:
    fraction_flag = 'nofrac'

# Validation and test runs replace the configured back periods with one
# constant entry per summary month (at least one entry); train runs keep the
# configured periods in ascending order.
if summary_type == VALIDATION:
    train_back_periods = [validation_back_months] * max(1, len(validation_summary_months))
elif summary_type == TEST:
    train_back_periods = [0] * max(1, len(test_summary_months))
else:
    train_back_periods = sorted(train_back_periods)

if summary_type == TEST:
    train_back_string = 'nobackstr'
else:
    train_back_string = '%s_%s' % ('back', train_back_periods[-1])

In [26]:
# Output folder for this run: <feature_dir>/<summary_type>__<fraction_flag>,
# with an additional back__<n> level for validation runs.
folder_path = os.path.join(feature_dir, '__'.join([summary_type, fraction_flag]))
if summary_type == VALIDATION:
    # str.join accepts only strings, so the integer month count is converted first.
    folder_path = os.path.join(folder_path, '__'.join(['back', str(validation_back_months)]))
logger.info('Working directory: %s' % folder_path)
# Create the folder (including missing parents) on first use.
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

In [27]:
# --- Input file names ---
feature_map_fname = 'feature_mapping.csv'            # feature mapping
family_income_fname = 'family_incomes.csv'           # family incomes
mean_province_income_fname = 'province_incomes.csv'  # mean log province incomes

# Ids of the clients available in May 2015 and in June 2015
clients_may_15_fname = 'may_15_clients.csv'
clients_june_15_fname = 'june_15_clients.csv'

# Ids of the clients with at least one positive flank between Jan 15 and May 16
clients_pos_flank_fname = 'positive_flank_clients.csv'

# --- Target variable constants ---
# Open question: why (1, 2, 3, 4, 5, 6, 12)?
raw_data_lags = [1, 2, 3, 4, 5, 6, 12]

# Period over which the number of products and the number of positive and
# negative flanks are counted
flanks_period = np.inf

# Months-back periods for the product summaries
months_back_trans_counts = list(raw_data_lags)  # these must appear in raw_data_lags
months_back_prod_counts = [1, 2, 3, 4, 5, 6, 12]

## Create features

In [28]:
# Read in the raw cleaned data
if summary_type == TEST:
    train_part = pd.read_csv(os.path.join(data_dir, ('train_%s.csv' % fraction_flag)))
    test_part = pd.read_csv(os.path.join(data_dir, ('test_%s.csv' % fraction_flag)))
    # Combine the train and test records. pd.concat is used because
    # DataFrame.append was deprecated and later removed from pandas.
    raw_data = pd.concat([train_part, test_part], ignore_index=True)
else:
    # Non-test summaries read the train records only
    raw_data = pd.read_csv(os.path.join(data_dir, ('train_%s.csv' % fraction_flag)))

# Load the estimated relative map contributions
#mapContributions <- readRDS(file.path(getwd(), targetDate,
#                                      "monthlyMAPContributions.rds"))

# Store the original raw data before applying modifications
raw_data_orig = raw_data.copy()

# Keep track of the number of feature records
feature_records_counter = []

## For loop

In [29]:
# TODO: should be in for-loop
# Position within train_back_periods that is being processed; fixed to the
# first entry until the loop mentioned in the TODO is in place.
train_back_id = 0
# Back period (taken from train_back_periods) for the current position.
train_back_period = train_back_periods[train_back_id]

In [30]:
# Progress message; the arguments are handed to the logger so the message is
# only formatted when the record is actually emitted.
logger.info('Processing period %d of %d for %s',
            train_back_id, len(train_back_periods), summary_type)

In [31]:
# raw_data is modified in each train back iteration, so every iteration starts
# again from the untouched original.
if summary_type != TEST:
    # Keep the records up to and including the month found train_back_period
    # positions from the most recent one
    last_month = sorted(raw_data_orig['fecha_dato'].unique(), reverse=True)[train_back_period]
    raw_data = raw_data_orig.loc[raw_data_orig['fecha_dato'] <= last_month]
else:
    raw_data = raw_data_orig.copy()

In [32]:
# Drop customers that don't have data in the month prior to the targeted
# period or in the month of the target period itself
data_months = sorted(raw_data.fecha_dato.unique(), reverse=True)  # most recent month first
data_months_vec = data_months
for i in [0, 1]:
    # Customers with a record in the i-th most recent month
    month_clients = raw_data.loc[raw_data.fecha_dato == data_months[i], 'ncodpers'].unique()
    raw_data = raw_data[raw_data.ncodpers.isin(month_clients)]
# A parenthesised single-argument print gives the same output on Python 2 and 3
print(raw_data.shape)

(13502875, 48)


In [33]:
# Restrict the raw data optionally to the last max_summary_months
if summary_type == TRAIN:
    if np.isfinite(max_summary_months) and max_summary_months < (len(data_months) - 1):
        raw_data = raw_data[raw_data.fecha_dato >= data_months[max_summary_months]]
else:
    # Validation and test runs take the lag configured for the current period
    if summary_type == VALIDATION:
        period_lag_months = validation_summary_months[train_back_id]
    else:
        period_lag_months = test_summary_months[train_back_id]

    if np.isfinite(period_lag_months) and period_lag_months < (len(data_months)-1):
        # data_months is sorted most-recent-first and indexed from 0, so entry
        # [period_lag_months] keeps the latest month plus period_lag_months
        # earlier months, mirroring the TRAIN branch above.
        # NOTE(review): the previous line referenced the undefined names
        # dataMonths / periodLagMonths with a `+ 1` offset, which looks like a
        # 1-based index left over from another language - confirm the intended
        # window before relying on validation/test output.
        raw_data = raw_data[raw_data.fecha_dato >= data_months[period_lag_months]]
    # Skip stagnant feature

In [34]:
# Pieces of the name used when saving: <back string>__lag_<months - 1>
num_data_months = len(raw_data.fecha_dato.unique())
lag_period_extension = '_'.join(('lag', str(num_data_months - 1)))
if summary_type == TRAIN:
    train_back_string = '_'.join(('back', str(train_back_period)))
else:
    train_back_string = 'nobackstr'
save_extension = '__'.join((train_back_string, lag_period_extension))

### Section 1: Add predictor related features

In [98]:
def month_diff(s):
    """Count the breaks in a sequence of monthly dates.

    Every value of `s` is turned into a running month number
    (year * 12 + month). A break is counted at each position whose month
    number does not exceed the previous one by exactly 1; the first position
    never counts as a break.

    Parameters
    ----------
    s : pandas.Series
        Values exposing `.year` and `.month`, in the order in which they are
        to be compared.

    Returns
    -------
    integer
        Number of positions where the step from the previous value is not 1.
    """
    month_index = s.map(lambda stamp: 12 * stamp.year + stamp.month)
    # The first difference is undefined; treat it as a regular one-month step.
    steps = month_index.diff().fillna(1.0)
    return steps.ne(1.0).sum()

# Parse the month column so month_diff can read year and month from each value
raw_data.fecha_dato = pd.to_datetime(raw_data.fecha_dato, format="%Y-%m-%d")

# Per customer: number of records and number of month gaps
records_by_client = raw_data.groupby(['ncodpers'])
month_features = pd.DataFrame({
    'data_months': records_by_client.size(),
    'gaps': records_by_client.fecha_dato.apply(month_diff),
})

# Both fractions share the same denominator.
# NOTE(review): the "- 2" presumably discounts the two most recent months that
# every remaining customer is required to have - confirm.
months_available = len(data_months) - 2
month_features['months_frac'] = (month_features['data_months'] - 2) / months_available
month_features['gaps_frac'] = month_features['gaps'] / months_available
# Most recent month of the data (first entry of the descending month list)
month_features['last_date'] = data_months_vec[0]

features = month_features