# Data Prep

In [1]:
import numpy as np
import pandas as pd
import math

import vogel.preprocessing as v_prep
import vogel.utils as v_utils
import vogel.utils.stats as v_stats
import vogel.train as v_train

## Generate Data

Creating example Tweedie data, which is a common assumption for insurance loss data.  The response is pure premium ('PP'), which is taken from a Tweedie distribution with mean equal to 'TrueMean'.  TrueMean is deterministically defined based on continuous variables and levels of categorical variables, with some having a stronger effect on the TrueMean than others; the variables are named accordingly.  We generate the Tweedie pure premiums explicitly by first simulating the claim counts, and then simulating the cumulative losses based on those counts, and finally calculating the Pure Premium as Losses / EE.  The losses and counts are being left in the dataset as artificats.

In addition to the strong and weak variables, we are also populating with several variables which have no effect on the TrueMean.  Some of the earned exposures ('EE') are being set to zero, to later simulate filtering abilities of Vogel; this is often encountered in insurance data where a row may represent a single policy, but the earned exposure for a given coverage may be zero if that policy didn't have that coverage.  Also, another variable has been added strictly to further showcase how to filter the data

Furthermore, a random column is being added, which will be used later to split into train and test sets.

In [2]:
np.random.seed(42)
NumRows = 10000

data_df = pd.DataFrame({'id':np.array(list(range(0,NumRows))),
                       'EE':np.random.choice([1,0], NumRows, p=[.9,.1]),
                       'StrongContin':np.random.rand(NumRows),
                       'WeakContin':np.random.rand(NumRows),
                       'Quadratic':np.random.rand(NumRows),
                       'Interact_One':np.random.rand(NumRows),
                       'Interact_Two':np.random.rand(NumRows),
                       'UnimportantContin':np.random.rand(NumRows),
                       'AnotherFilter':np.random.rand(NumRows)
                      })
StrongCategoricalLookup = pd.DataFrame({'StrongCategorical':["A_Medium5","B_Medium3","C_Low1","D_High9","E_Medium7","F_Medium5"],
                                       'Factor':[1,.75,.5,2,1.25,1]
                                       })
WeakCategoricalLookup = pd.DataFrame({'WeakCategorical':["A_medium5","B_Medium4","C_Medium6"],
                                       'Factor':[1,.95,1.05]
                                       })
StrongCategoricalList = ['A_Medium5', 'B_Medium3', 'C_Low1', 'D_High9', 'E_Medium7', 'F_Medium5']
WeakCategoricalList = ["A_medium5","B_Medium4","C_Medium6"]
UnimportantCategoricalList = ["A","B","C"]
data_df['StrongCategorical'] = np.random.choice(StrongCategoricalList, NumRows, p=[.2,.2,.1,.1,.2,.2])
data_df['WeakCategorical'] = np.random.choice(WeakCategoricalList, NumRows)
data_df['UnimportantCategorical'] = np.random.choice(UnimportantCategoricalList, NumRows)
data_df['TrueLogMean']\
= np.log(30) \
+ np.log(3)*data_df.StrongContin \
+ np.log(1.05)*data_df.WeakContin \
+ ((((data_df[['id','StrongCategorical']]).merge(StrongCategoricalLookup,on='StrongCategorical', how='left', sort=False))).Factor).apply(np.log) \
+ ((((data_df[['id','WeakCategorical']]).merge(WeakCategoricalLookup,on='WeakCategorical', how='left', sort=False))).Factor).apply(np.log) \
+ np.log(2)*data_df.Quadratic**2 \
+ np.log(.5)*data_df.Interact_One*data_df.Interact_Two

data_df['TrueMean'] = data_df.TrueLogMean.apply(np.exp)

MyP = 1.35
MyPhi = 72.65
MyMu = data_df.TrueMean

MyA = (2-MyP)/(MyP-1)
MyTheta = (MyPhi*(2-MyP)/(((2-MyP)/(MyP-1))**(2-MyP)))*(((MyMu*(MyP-1))/(2-MyP))**(MyP-1))
MyLambda = (MyMu*(MyP-1))/((2-MyP)*MyTheta)

data_df['Count'] = np.random.poisson(MyLambda*data_df.EE)
data_df['Loss'] = np.random.gamma(shape = data_df.Count*MyA, scale=MyTheta)
data_df['PP'] = data_df.Loss / data_df.EE
data_df['rand'] = np.random.rand(len(data_df))

data_df.head()

Unnamed: 0,id,EE,StrongContin,WeakContin,Quadratic,Interact_One,Interact_Two,UnimportantContin,AnotherFilter,StrongCategorical,WeakCategorical,UnimportantCategorical,TrueLogMean,TrueMean,Count,Loss,PP,rand
0,0,1,0.373641,0.729998,0.638145,0.298912,0.847237,0.741555,0.042661,F_Medium5,C_Medium6,B,4.002821,54.752388,0,0.0,0.0,0.866216
1,1,0,0.332912,0.184512,0.459292,0.094818,0.494517,0.881102,0.828505,A_Medium5,A_medium5,A,3.889659,48.89422,0,0.0,,0.956835
2,2,1,0.176154,0.34664,0.964499,0.126359,0.195466,0.46318,0.249308,F_Medium5,B_Medium4,A,4.188027,65.892649,1,189.598737,189.598737,0.077981
3,3,1,0.607267,0.663281,0.218978,0.180671,0.736642,0.289179,0.283937,F_Medium5,B_Medium4,C,3.990403,54.076672,2,343.660024,343.660024,0.404585
4,4,1,0.476624,0.482089,0.587856,0.203653,0.418678,0.318847,0.226245,F_Medium5,A_medium5,A,4.128777,62.101918,0,0.0,0.0,0.856713


Adding random NaNs to 5% of WeakContin and 10% of UnimportantCategorical to showcase the Vogel imputation ability.

In [3]:
np.random.seed(37)
data_df.iloc[np.random.randint(low=0,high=len(data_df)-1,size=500),3] = np.nan
data_df.head(20)


# np.random.seed(47)
# data_df.iloc[np.random.randint(low=0,high=len(data_df)-1,size=1000),11] = np.nan
# data_df.head(20)

Unnamed: 0,id,EE,StrongContin,WeakContin,Quadratic,Interact_One,Interact_Two,UnimportantContin,AnotherFilter,StrongCategorical,WeakCategorical,UnimportantCategorical,TrueLogMean,TrueMean,Count,Loss,PP,rand
0,0,1,0.373641,0.729998,0.638145,0.298912,0.847237,0.741555,0.042661,F_Medium5,C_Medium6,B,4.002821,54.752388,0,0.0,0.0,0.866216
1,1,0,0.332912,0.184512,0.459292,0.094818,0.494517,0.881102,0.828505,A_Medium5,A_medium5,A,3.889659,48.89422,0,0.0,,0.956835
2,2,1,0.176154,0.34664,0.964499,0.126359,0.195466,0.46318,0.249308,F_Medium5,B_Medium4,A,4.188027,65.892649,1,189.598737,189.598737,0.077981
3,3,1,0.607267,0.663281,0.218978,0.180671,0.736642,0.289179,0.283937,F_Medium5,B_Medium4,C,3.990403,54.076672,2,343.660024,343.660024,0.404585
4,4,1,0.476624,0.482089,0.587856,0.203653,0.418678,0.318847,0.226245,F_Medium5,A_medium5,A,4.128777,62.101918,0,0.0,0.0,0.856713
5,5,1,0.865701,0.738571,0.70021,0.242262,0.594627,0.696948,0.840084,B_Medium3,B_Medium4,C,4.289321,72.916943,1,140.269291,140.269291,0.958254
6,6,1,0.03211,,0.825564,0.25546,0.107265,0.567558,0.978902,B_Medium3,A_medium5,A,3.649114,38.440604,0,0.0,0.0,0.467869
7,7,1,0.643868,0.116547,0.406971,0.455716,0.631584,0.486494,0.755187,C_Low1,B_Medium4,A,3.285103,26.711747,0,0.0,0.0,0.505089
8,8,1,0.762949,0.709568,0.686922,0.509573,0.37355,0.202774,0.823578,B_Medium3,C_Medium6,C,4.230239,68.733626,0,0.0,0.0,0.14932
9,9,1,0.759487,0.230344,0.303201,0.308879,0.33419,0.875456,0.6205,C_Low1,C_Medium6,C,3.594632,36.402315,1,2.809535,2.809535,0.187948


# Data Filtering & Splitting

Within *make_pipeline* there are many things we can do.  Here we split into train and test sets, then we filter to only those rows with EE greater than zero.  We also filter where AnotherFilter is less than .95.  We do this strictly to showcase the syntax for doing so within the *make_pipeline* function.

In [4]:
WEIGHT = ['EE']
TARGET = ['PP']

def train_test_split(x):
    return {'train': x[x['rand'] < 0.8], 'test': x[x['rand'] > .8]}


pipeline_1filter = v_utils.make_pipeline(
    v_prep.QueryTransformer('EE > 0'),
    v_prep.FunctionTransformer(train_test_split)
)


pipeline_2filters = v_utils.make_pipeline(
    v_prep.QueryTransformer('EE > 0 & AnotherFilter < .95'),
    v_prep.FunctionTransformer(train_test_split)
)

samples = pipeline_1filter.fit_transform(data_df)

print('# of Rows after 1 filter:', len(samples['train']) + len(samples['test']))

print('Train # of Rows after 1 filter:', len(samples['train']))
print('Train sum EE after 1 filter:', samples['train'][WEIGHT].sum().values[0])

print('Test # of Rows after 1 filter:', len(samples['test']))
print('Test sum EE after 1 filter:', samples['test'][WEIGHT].sum().values[0])
print("\n")

samples = pipeline_2filters.fit_transform(data_df)

print('# of Rows after 2 filters:', len(samples['train']) + len(samples['test']))

print('Train # of Rows after 2 filters:', len(samples['train']))
print('Train sum EE after 2 filters:', samples['train'][WEIGHT].sum().values[0])

print('Test # of Rows after 2 filters:', len(samples['test']))
print('Test sum EE after 2 filters:', samples['test'][WEIGHT].sum().values[0])

train_y = pd.to_numeric(samples['train'][TARGET].iloc[:, 0])
test_y = pd.to_numeric(samples['test'][TARGET].iloc[:, 0])

train_w = pd.to_numeric(samples['train'][WEIGHT].iloc[:, 0])
test_w = pd.to_numeric(samples['test'][WEIGHT].iloc[:, 0])

# of Rows after 1 filter: 9039
Train # of Rows after 1 filter: 7284
Train sum EE after 1 filter: 7284
Test # of Rows after 1 filter: 1755
Test sum EE after 1 filter: 1755


# of Rows after 2 filters: 8635
Train # of Rows after 2 filters: 6962
Train sum EE after 2 filters: 6962
Test # of Rows after 2 filters: 1673
Test sum EE after 2 filters: 1673


# Feature Selection

Here we create feature groups, which is essentially creating a named bucket for features.  Instead of creating a dict here, you could bring in a json file and convert to a dict.

In [5]:
feature_groups = {
    'interactions' : ['Interact_One', 'Interact_Two'],
    'unimportant' : ['UnimportantContin', 'UnimportantCategorical'],
    'strong' : ['StrongContin', 'StrongCategorical'],
    'weak' : ['WeakContin','WeakCategorical'],
    'continuous' : ['StrongContin', 'WeakContin', 'UnimportantContin'],
    'categorical' : ['StrongCategorical', 'WeakCategorical', 'UnimportantCategorical']
}

The ColumnExtractor will first search the second argument ('feature_groups' here) for the elements in the first argument, and if it doesn't find a match then it will assume that it is a feature (rather than a named group of features).  So, in this case it will notice that 'unimportant' is a named group of features, and will extract that group, but 'StrongContin' is not a group, so it will extract the feature named 'StrongContin'.  Notice that the pipeline is being fit to the *train* set, and applied to both the *train* and *test* sets.  This isn't as important here, but it will be later during one-hot encoding and imputation.

In [6]:
FEATURES = ['unimportant', 'StrongContin', 'weak']

pipeline = v_utils.make_pipeline(
            v_prep.ColumnExtractor(FEATURES, feature_groups)
)

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head(20)

Unnamed: 0,UnimportantContin,UnimportantCategorical,StrongContin,WeakContin,WeakCategorical
2,0.46318,A,0.176154,0.34664,B_Medium4
3,0.289179,C,0.607267,0.663281,B_Medium4
7,0.486494,A,0.643868,0.116547,B_Medium4
8,0.202774,C,0.762949,0.709568,C_Medium6
9,0.875456,C,0.759487,0.230344,C_Medium6
10,0.78102,B,0.886074,0.414477,A_medium5
12,0.367388,C,0.92781,0.135907,B_Medium4
13,0.542132,B,0.332657,0.319777,B_Medium4
15,0.397796,B,0.01408,,A_medium5
16,0.718832,B,0.006958,0.741813,C_Medium6


If, on the other hand, you *don't* want to create a data_dict but rather you only want to explicitly name features, you can just ignore the second argument:

In [7]:
FEATURES = ['UnimportantContin', 'StrongContin']

pipeline = v_utils.make_pipeline(
            v_prep.ColumnExtractor(FEATURES)
)

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head(20)

Unnamed: 0,UnimportantContin,StrongContin
2,0.46318,0.176154
3,0.289179,0.607267
7,0.486494,0.643868
8,0.202774,0.762949
9,0.875456,0.759487
10,0.78102,0.886074
12,0.367388,0.92781
13,0.542132,0.332657
15,0.397796,0.01408
16,0.718832,0.006958


## Label Encoder & FeatureUnion

We can stack even more prep functions into *make_pipeline*.  Here we will take all of our desired numeric features as they are, and then we will one-hot encode our desired categorical features.  The level of each categorical variable with the most observations is assumed to be the base level, so no column is encoded for it.

In [8]:
FEATURES = ['unimportant', 'StrongContin', 'weak']

pipeline = v_utils.make_pipeline(
    v_prep.FeatureUnion([
        ('numeric', v_utils.make_pipeline(
            v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric = True)
        ))
        ,
        ('cats', v_utils.make_pipeline(
                v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric = False)
                , v_prep.LabelEncoder()
            )),
        ])
)

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head(20)

Unnamed: 0,UnimportantContin,StrongContin,WeakContin,UnimportantCategorical__A,UnimportantCategorical__C,WeakCategorical__C_Medium6,WeakCategorical__B_Medium4
0,0.46318,0.176154,0.34664,1.0,0.0,0.0,1.0
1,0.289179,0.607267,0.663281,0.0,1.0,0.0,1.0
2,0.486494,0.643868,0.116547,1.0,0.0,0.0,1.0
3,0.202774,0.762949,0.709568,0.0,1.0,1.0,0.0
4,0.875456,0.759487,0.230344,0.0,1.0,1.0,0.0
5,0.78102,0.886074,0.414477,0.0,0.0,0.0,0.0
6,0.367388,0.92781,0.135907,0.0,1.0,0.0,1.0
7,0.542132,0.332657,0.319777,0.0,0.0,0.0,1.0
8,0.397796,0.01408,,0.0,0.0,0.0,0.0
9,0.718832,0.006958,0.741813,0.0,0.0,1.0,0.0


Alternativly, you may use 'feature_filter' in most of the vogel transformers to only affect specific features.

In [10]:
FEATURES = ['unimportant', 'StrongContin', 'weak']
CAT_FEATURES = ['UnimportantCategorical', 'WeakCategorical']

pipeline = v_utils.make_pipeline(
    v_prep.ColumnExtractor(FEATURES, feature_groups),
    v_prep.LabelEncoder(feature_filter=CAT_FEATURES)
)

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head(20)

Unnamed: 0,UnimportantContin,StrongContin,WeakContin,UnimportantCategorical__A,UnimportantCategorical__C,WeakCategorical__C_Medium6,WeakCategorical__B_Medium4
2,0.46318,0.176154,0.34664,1,0,0,1
3,0.289179,0.607267,0.663281,0,1,0,1
7,0.486494,0.643868,0.116547,1,0,0,1
8,0.202774,0.762949,0.709568,0,1,1,0
9,0.875456,0.759487,0.230344,0,1,1,0
10,0.78102,0.886074,0.414477,0,0,0,0
12,0.367388,0.92781,0.135907,0,1,0,1
13,0.542132,0.332657,0.319777,0,0,0,1
15,0.397796,0.01408,,0,0,0,0
16,0.718832,0.006958,0.741813,0,0,1,0


# NullEncoder & Imputer

Here we go even further by creating an indicator column for each feature that has null values, to indicate which observation was null; this is done using the *NullEncoder* function.  Also, we use *Imputer* to impute the mean value into the null values.  Notice that *Imputer* is run after *NullEncoder*.  If it were run before, then the mean would be imputed and no null values would exist to be encoded by *NullEncoder*.

In [9]:
FEATURES = ['unimportant', 'StrongContin', 'weak']

pipeline = v_utils.make_pipeline(
    v_prep.FeatureUnion([
        ('numeric',
         v_utils.make_pipeline(
             v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric=True),
             v_prep.NullEncoder(),
             v_prep.Imputer())),
        ('cats',
         v_utils.make_pipeline(
             v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric=False),
             v_prep.LabelEncoder())),
    ]))

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head(20)

Unnamed: 0,UnimportantContin,StrongContin,WeakContin,WeakContin_nan,UnimportantCategorical__A,UnimportantCategorical__C,WeakCategorical__C_Medium6,WeakCategorical__B_Medium4
0,0.46318,0.176154,0.34664,0.0,1.0,0.0,0.0,1.0
1,0.289179,0.607267,0.663281,0.0,0.0,1.0,0.0,1.0
2,0.486494,0.643868,0.116547,0.0,1.0,0.0,0.0,1.0
3,0.202774,0.762949,0.709568,0.0,0.0,1.0,1.0,0.0
4,0.875456,0.759487,0.230344,0.0,0.0,1.0,1.0,0.0
5,0.78102,0.886074,0.414477,0.0,0.0,0.0,0.0,0.0
6,0.367388,0.92781,0.135907,0.0,0.0,1.0,0.0,1.0
7,0.542132,0.332657,0.319777,0.0,0.0,0.0,0.0,1.0
8,0.397796,0.01408,0.497474,1.0,0.0,0.0,0.0,0.0
9,0.718832,0.006958,0.741813,0.0,0.0,0.0,1.0,0.0


# Binning & Grouping

Here we will bin continuous variables.  In this case we will create 10 bins for each continuous variable, and replace the value with the weighted average value within that bin; however, for the variable *UnimportantContin* we will override with our own predetermined bins and values.  For the categorical predictor *StrongCategorical* we will group two levels together.  Again, notice the order: we have grouped the levels of the categorical variable together before encoding.  There won't be a resultant column for our grouped levels because it will be the most populous level after grouping and therefore will be the base level.

In [10]:
FEATURES = ['unimportant', 'StrongContin', 'StrongCategorical']

bins = {
    'UnimportantContin': {
        'cutoffs': [-np.inf, 0, .5, .8, np.inf],
        'ids': [0, .5, .8, 1]
    }
}

groups = {
    'StrongCategorical' : {
        'A_Medium5' : 'AF_Medium5',
        'F_Medium5' : 'AF_Medium5'
    }
}

pipeline = v_utils.make_pipeline(
    v_prep.FeatureUnion([
        ('numeric', v_utils.make_pipeline(
            v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric = True)
            , v_prep.NullEncoder()
            , v_prep.Imputer()
            , v_prep.Binning(bin_type='qcut', bins=10, bin_id='wavg', weight=train_w, overrides=bins)
        ))
        ,
        ('cats', v_utils.make_pipeline(
                v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric = False)
                , v_prep.GrouperTransformer(groups)
                , v_prep.LabelEncoder()
            )),
        ])
    , 
)

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head()

Unnamed: 0,UnimportantContin,StrongContin,UnimportantContin_cust,StrongContin_q_g10,UnimportantCategorical__A,UnimportantCategorical__C,StrongCategorical__B_Medium3,StrongCategorical__E_Medium7,StrongCategorical__D_High9,StrongCategorical__C_Low1
0,0.46318,0.176154,0.5,0.1501,1.0,0.0,0.0,0.0,0.0,0.0
1,0.289179,0.607267,0.5,0.657978,0.0,1.0,0.0,0.0,0.0,0.0
2,0.486494,0.643868,0.5,0.657978,1.0,0.0,0.0,0.0,0.0,1.0
3,0.202774,0.762949,0.5,0.754859,0.0,1.0,1.0,0.0,0.0,0.0
4,0.875456,0.759487,1.0,0.754859,0.0,1.0,0.0,0.0,0.0,1.0


If you're curious what the different levels are of categorical variables and what the base level is, you can use *find_label_dicts*

In [11]:
v_utils.find_label_dicts(pipeline)

{'UnimportantCategorical': {'items': ['A', 'C'], 'hold': 'B'},
 'StrongCategorical': {'items': ['B_Medium3',
   'E_Medium7',
   'D_High9',
   'C_Low1'],
  'hold': 'AF_Medium5'}}

Similarly, for continuous variables we can look at what the bin cutoffs are, and what the substituted values are.

In [12]:
pipeline.named_steps['featureunion'].transformer_list[0][1].named_steps['binning'].bin_dict

{'UnimportantContin': {'cutoffs': array([-inf,  0. ,  0.5,  0.8,  inf]),
  'ids': [0, 0.5, 0.8, 1]},
 'StrongContin': {'cutoffs': array([       -inf,  0.10003497,  0.19975487,  0.30266221,  0.40635642,
          0.50331623,  0.60600014,  0.70844668,  0.80347891,  0.90101835,
                 inf]),
  'ids': [0.050533393250960665,
   0.15010040204170377,
   0.250094117709395,
   0.35266728752807974,
   0.4545220812682097,
   0.5529816948425048,
   0.6579784358215297,
   0.7548594716229268,
   0.85297994670564,
   0.9516602055267765]}}

# InteractionGroups & PolynomialTransform

Here, we're going to bin two continuous variables, 

In [13]:
FEATURES = [ 'interactions', 'Quadratic']


pipeline = v_utils.make_pipeline(
    v_prep.ColumnExtractor(FEATURES, feature_groups, want_numeric = True)
    , v_prep.NullEncoder()
    , v_prep.Imputer()
    , v_prep.InteractionGroups(['Interact_One'], features=['Interact_Two', 'Quadratic'])
#     , v_prep.PolynomialTransform(levels=2, poly_type='orthogonal', weight=train_w, drop=True)
)

train_X = pipeline.fit_transform(samples['train'])
test_X = pipeline.transform(samples['test'])
train_X.head()

AttributeError: 'str' object has no attribute 'foramt'

In [None]:
v_prep.InteractionGroups?