In [1]:
import datetime
from subscriber_learn.utils.s3_read_write import S3ReadWrite
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn import linear_model, feature_selection, preprocessing
from subscriber_learn.cross_validation.pipeline_tools import DummyEncoder, Timer
import logging

In [2]:
# Number of dated snapshots to read (one per fold date).
n_folds = 5
# Spacing, in days, between consecutive snapshot dates.
offset = 7
# Key prefixes (relative to the reader's root) for features and responses.
input_dir = 'input_files/etlv_modified'
response_dir = 'input_files/responses/canceled_within_7_days'

In [3]:
# Most recent snapshot date; earlier snapshots are derived from it by
# stepping back `offset` days at a time inside read_data.
train_end_date = datetime.date(2018, 1, 28)

# Keyword context forwarded to read_data: the date both as a string
# ('ds') and as a date object ('execution_date').
context = dict(
    ds = train_end_date.strftime('%Y-%m-%d'),
    execution_date = train_end_date)

In [4]:
def _read_dated_frames(s3_reader, base_dir, dates, filename):
    """Read one CSV per date under ``base_dir`` and index it by user and date.

    Each file is looked up at ``{base_dir}/{year}/{month}/{day}/{filename}``
    (month and day are not zero-padded). The date is attached as an
    ``input_date`` column and the frame is indexed by
    ``['internal_user_id', 'input_date']``.

    Returns a list of DataFrames in the same order as ``dates``.
    """
    frames = []
    for day in dates:
        csv_name = '{dir}/{year}/{month}/{day}/{filename}'.format(
            dir = base_dir,
            year = day.year,
            month = day.month,
            day = day.day,
            filename = filename)
        frames.append(
            s3_reader.read_from_S3_csv(csv_name = csv_name)
            .assign(input_date = day)
            .set_index(['internal_user_id', 'input_date']))
    return frames


def read_data(s3_reader, n_folds, offset, input_dir, response_dir, **context):
    """Load the feature and response tables for ``n_folds`` snapshot dates.

    Parameters
    ----------
    s3_reader : object exposing ``read_from_S3_csv(csv_name=...)`` that
        returns a DataFrame.
    n_folds : int
        Number of snapshot dates to read.
    offset : int
        Number of days between consecutive snapshot dates.
    input_dir, response_dir : str
        Key prefixes of the feature files and the response files.
    **context
        Must contain ``'execution_date'`` (a ``datetime.date``); snapshots
        are read for that date and the ``n_folds - 1`` dates before it.

    Returns
    -------
    (input_data, response_data) : tuple of DataFrame
        Both indexed by ``(internal_user_id, input_date)``. ``input_data``
        keeps only rows where every validity flag is true, with the flag
        columns dropped; ``response_data`` is selected by (and ordered like)
        ``input_data.index``.

    Raises
    ------
    KeyError
        If ``'execution_date'`` is missing from ``context`` or a required
        column / index label is absent from the data.
    """
    logging.info('Reading data from {}.\nInput: {}\nOutput: {}'.format(
        str(s3_reader), input_dir, response_dir))
    filename = 'part000.csv000'
    # Most recent date first, then stepping back `offset` days per fold.
    dates = [context['execution_date'] - datetime.timedelta(days = offset * i)
        for i in range(n_folds)]

    input_frames = _read_dated_frames(s3_reader, input_dir, dates, filename)
    logging.info('{} input files read'.format(len(input_frames)))

    response_frames = _read_dated_frames(
        s3_reader, response_dir, dates, filename)
    logging.info('{} response files read'.format(len(response_frames)))

    # Validity flags: a row is kept only when all of them are true, and the
    # flags themselves are not used as features.
    check_cols = ['valid_account_creation',
        'valid_prospect_creation',
        'valid_accepted_terms']

    input_data = pd.concat(input_frames)

    # AND the flags together in list order (same result as chaining `&`).
    is_valid = input_data[check_cols[0]]
    for col in check_cols[1:]:
        is_valid = is_valid & input_data[col]

    input_data = (input_data.loc[is_valid, :]
        .drop(check_cols, axis = 1))
    logging.info('Input shape: {}'.format(input_data.shape))

    # Align responses to exactly the rows that survived the filter.
    response_data = (pd.concat(response_frames)
        .loc[input_data.index])
    logging.info('Response shape: {}'.format(response_data.shape))

    return input_data, response_data

In [5]:
# Pull the feature and response tables for every fold date from S3.
input_data, response_data = read_data(
    s3_reader = S3ReadWrite(),
    n_folds = n_folds,
    offset = offset,
    input_dir = input_dir,
    response_dir = response_dir,
    **context)

In [6]:
def get_temporal_cv(X, fold_col = 'input_date'):
    """Return temporal (train, test) index splits for ``X``.

    The values of index level ``fold_col`` are handed to
    ``TemporalCrossValidator`` and the result of its ``split()`` is
    returned unchanged.
    """
    fold_labels = X.index.get_level_values(fold_col)
    return TemporalCrossValidator(fold_labels).split()

class TemporalCrossValidator():
    """Rolling-window cross-validator over ordered time periods.

    Every distinct value in ``time_index`` is one time period. Periods are
    ordered by ``np.unique`` (ascending). Fold ``k`` trains on the
    ``n_weeks`` consecutive periods starting at position ``k`` and tests on
    the period immediately after them.

    Parameters
    ----------
    time_index : array-like
        One period label per row.
    n_weeks : int, default 1
        Number of consecutive periods used for training in each fold.

    Raises
    ------
    ValueError
        If fewer than two folds can be formed, i.e. when
        ``n_periods - n_weeks <= 1``.
    """
    def __init__(self, time_index, n_weeks = 1):
        self.n_weeks = n_weeks
        # `groups` holds, for every row, the position of its period within
        # the sorted `unique_groups`.
        self.unique_groups, self.groups = np.unique(
            time_index, return_inverse=True)
        n_groups = len(self.unique_groups)
        self.n_splits = n_groups - n_weeks
        if self.n_splits <= 1:
            raise ValueError(
                "Found {} time periods in the data, for training folds "
                "consisting of {} time periods of input per fold. "
                "Cross-validation requires 2 or more folds".format(
                n_groups, n_weeks))

    def split(self):
        """Yield ``(train_indices, test_indices)`` positional arrays per fold.

        This is a generator, so the returned object can be iterated once.
        """
        # The original body referenced undefined names (`check_array`,
        # `groups`, `n_weeks`); the instance attributes are used instead.
        for fold_index in range(self.n_splits):
            test_group = fold_index + self.n_weeks
            train_groups = np.arange(fold_index, test_group)
            yield (np.flatnonzero(np.isin(self.groups, train_groups)),
                   np.flatnonzero(self.groups == test_group))

In [7]:
# Elastic-net model whose regularisation is tuned over the temporal folds.
# `cv` receives the generator of (train, test) index arrays produced by
# get_temporal_cv.
estimator = linear_model.ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, 1],
    cv=get_temporal_cv(input_data),
    n_jobs=-1,
    verbose=1,
    random_state=1100,
)

In [18]:
# Preprocessing chain: dummy-encode, median-impute, robust-scale, then drop
# features whose variance does not exceed 0.04.
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22
# (sklearn.impute.SimpleImputer replaces it); this cell needs an older
# scikit-learn as written.
pipeline = make_pipeline(
    DummyEncoder(),
    preprocessing.Imputer(strategy='median'),
    preprocessing.RobustScaler(),
    feature_selection.VarianceThreshold(threshold=.04),
)

In [19]:
# Fit every preprocessing step on the loaded features and responses.
pipeline.fit(input_data, response_data)

Pipeline(memory=None,
     steps=[('dummyencoder', DummyEncoder()), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('variancethreshold', VarianceThreshold(threshold=0.04))])

In [23]:
# Shape of the feature matrix after the full preprocessing pipeline.
pipeline.transform(input_data).shape

(358514, 168)

In [30]:
# Length of the VarianceThreshold support mask.
pipeline.named_steps['variancethreshold'].get_support().shape

(272,)

In [31]:
# `a` was never defined in this notebook (leftover kernel state), so the
# cell failed on a fresh run. NOTE(review): presumably `a` was the fitted
# DummyEncoder step, whose column count matches the support mask — confirm
# that `transformed_columns` lives on that step.
pipeline.named_steps['dummyencoder'].transformed_columns.shape

(272,)

In [32]:
# Number of features flagged as kept in the VarianceThreshold support mask.
pipeline.named_steps['variancethreshold'].get_support().sum()

168