In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from atusfunclib import load_data
from activitylib import ACTINFO
from wlmetrics import *
import pickle

In [26]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn import metrics
from sklearn import model_selection
from sklearn.externals import joblib
from sklearn import base

In [5]:
class BaseResEnsembleEstimator(base.BaseEstimator, base.RegressorMixin):

    def __init__(self, base_est, resd_est):
        # Set base estimator and residual estimator
        self.base_est = base_est
        self.resd_est = resd_est

    def fit(self, X, y):
        # Fit base
        self.base_est.fit(X, y)
        # Calculate residual
        residual = y - self.base_est.predict(X)
        # Fit to residual
        self.resd_est.fit(X, residual)
        return self

    def predict(self, X):
        # Sum of base prediction and residual prediction
        return self.base_est.predict(X) + self.resd_est.predict(X)


class DataFrameSelector(base.BaseEstimator, base.TransformerMixin):

    def __init__(self, feature_names=None, dtype=int):
        self.feature_names = feature_names
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.feature_names:
            return X[self.feature_names].as_matrix().astype(self.dtype)
        else:
            return X.as_matrix().astype(self.dtype)

In [6]:
# Import all csv data
data_import = load_data(loc="data", loc_clean="cleaned_data", loc_codes="code_tables")

In [7]:
# Unpack individual dataframes
df, dfactcodes, dfeducodes, dfinccodes, dfagecodes, \
dfempcodes, dfindcodes, dfraccodes, dfloccodes, dfwhocodes, \
dfdemocodes = data_import

In [8]:
# Convert category columns to float
df['TEAGE_CAT'] = df['TEAGE_CAT'].astype(float)
df['TRERNWA_CAT'] = df['TRERNWA_CAT'].astype(float)

In [9]:
# Split by weekday and weekend/holiday
df_wehol = df[(df['TRHOLIDAY'] == 1) | (df['TRWEEKEND'] == 1)]
df_weday = df[(df['TRHOLIDAY'] == 0) & (df['TRWEEKEND'] == 0)]

In [10]:
del df

In [11]:
# Weights for ratio calculation
weights_p = np.array([1.0, 0.2, 1.0, 0.6, 0.6, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 0.2])
weights_n = np.array([0.0, 1.0, 0.8, 1.0])

In [12]:
# Calculate metrics (i.e. y data)
df_weday_y = pd.DataFrame()
df_wehol_y = pd.DataFrame()

df_weday_y['metric1'] = w_l_balance_weighted_ratio(df_weday, ACTINFO['positiveWL'], ACTINFO['negoccWL'],
                                                   weights_p, weights_n, N=1)

df_weday_y['metric2'] = w_l_balance_workday(df_weday, workid='0501', hours=10)
df_weday_y['metric3'] = w_l_balance_personalcare(df_weday)
df_weday_y['metric4'] = w_l_balance_leisuresocial(df_weday)
df_weday_y['metric5'] = w_l_balance_housework(df_weday, hours=3)

df_wehol_y['metric1'] = w_l_balance_weighted_ratio(df_wehol, ACTINFO['positiveWL'], ACTINFO['negoccWL'],
                                                   weights_p, weights_n, N=1)

df_wehol_y['metric2'] = w_l_balance_workday(df_wehol, workid='0501', hours=10)
df_wehol_y['metric3'] = w_l_balance_personalcare(df_wehol)
df_wehol_y['metric4'] = w_l_balance_leisuresocial(df_wehol)
df_wehol_y['metric5'] = w_l_balance_housework(df_wehol, hours=3)

In [13]:
# Features list
features = ['TEAGE', 'TESEX', 'GEMETSTA', 'GESTFIPS',
            'TELFS', 'TRDPFTPT',
            'TRSPPRES', 'TESPEMPNOT',
            'TESCHENR', 'TESCHLVL', 'PEEDUCA',
            'PTDTRACE',
            'TRCHILDNUM', 'TRNUMHOU',
            'TRMJOCGR', 'TRDTOCC1',
            'TRMJIND1', 'TEIO1COW', 'TRERNWA',
            'TUDIS']

In [14]:
# Features matrix (i.e. X data)
df_weday_X = df_weday[features]
df_wehol_X = df_wehol[features]

In [21]:
# Split train-test indices
rs = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

train_wd, test_wd = rs.split(df_weday).next()
train_wh, test_wh = rs.split(df_wehol).next()

In [22]:
del df_weday
del df_wehol

In [23]:
# Build estimator for age, education, weekly income, child number, household number

# Polynominal features for age
agetrans = Pipeline([
    ('featse1', DataFrameSelector(feature_names=['TEAGE'], dtype=float)),
    ('scaler1', StandardScaler()),
    ('quadrat', PolynomialFeatures(degree=3))
])

lintrans = DataFrameSelector(
    feature_names=['PEEDUCA', 'TRERNWA', 'TRCHILDNUM', 'TRNUMHOU'], dtype=float)

est = Pipeline([
    ('union01', FeatureUnion([
        ('featse2', lintrans),
        ('agetran', agetrans)
    ])),
    ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),
    ('scaler2', StandardScaler()),
    ('ensembl', BaseResEnsembleEstimator(Ridge(alpha=0.5),
                                         RandomForestRegressor(n_estimators=10,
                                                               max_depth=20,
                                                               min_samples_leaf=2,
                                                               random_state=42))),
])

In [None]:
# Grid search CV
gridsearch = GridSearchCV(est, )


In [24]:
y = df_weday_y['metric1'].as_matrix()[ti_wd]
est.fit(df_weday_X.iloc[ti_wd], y)

Pipeline(steps=[('union01', FeatureUnion(n_jobs=1,
       transformer_list=[('featse2', DataFrameSelector(dtype=<type 'float'>,
         feature_names=['PEEDUCA', 'TRERNWA', 'TRCHILDNUM', 'TRNUMHOU'])), ('agetran', Pipeline(steps=[('featse1', DataFrameSelector(dtype=<type 'float'>, feature_names=['TEAGE'])),...stimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)))])

In [25]:
est.score(df_weday_X.iloc[ti_wd], y)

0.75629099218529705

In [None]:
est.score(df_weday_X.iloc[ti_wd], df_weday_y['metric1'].as_matrix()[ti_wd])

In [249]:
est.predict(df_weday_X.head(10))

array([ 4.41985331,  5.455184  ,  0.87719175,  1.50443558,  4.61999474,
        1.46759799,  1.46096391,  4.96981854,  4.80626127,  6.59230801])

In [250]:
y[:10]

array([ 7.25063551,  2.35088422,  0.17225817,  6.95939851,  0.2050389 ,
        0.37194242,  0.97435075,  6.8741985 ,  6.99484999,  6.52502966])