In [27]:
import pandas as pd
import numpy as np

In [28]:
# Load the training policies; the first CSV column is used as the row index.
train_policies_complete = pd.read_csv(
    '../data/train_policies_complete.csv',
    index_col=0,
)

In [29]:
class DataValidation:
    """Thin wrapper around a DataFrame offering quick structural summaries.

    The numeric / non-numeric split is taken once, in ``__init__``, from the
    dtypes the frame has at that moment; ``get_categoric`` and ``get_numeric``
    report that snapshot, not the frame's current dtypes.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and snapshot its numeric/non-numeric columns."""
        self._data = df
        self.clean = None  # placeholder; nothing in this class assigns it
        self.raw_numvars = self._data.select_dtypes(include=[np.number])
        self.raw_catvars = self._data.select_dtypes(exclude=[np.number])

    def get_categoric(self):
        """Return the labels of the non-numeric columns, in dataset order."""
        cat_labels = self.raw_catvars.columns
        return cat_labels.values

    def get_numeric(self):
        """Return the labels of the numeric columns, in dataset order."""
        num_labels = self.raw_numvars.columns
        return num_labels.values

    def get_missing(self):
        """Return a DataFrame with one row per column of the wrapped frame.

        Columns of the result:
            col          -- the column label
            pct_missing  -- percentage of null entries, rounded to 5 decimals
        """
        labels = list(self._data.columns)
        # Mean of the boolean null-mask is the fraction of missing entries.
        percentages = [
            round(np.mean(self._data[label].isnull()) * 100, 5)
            for label in labels
        ]
        return pd.DataFrame(data={'col': labels, 'pct_missing': percentages})

In [30]:
# Wrap the loaded training frame; the wrapper holds a reference, not a copy.
train_policy = DataValidation(df=train_policies_complete)

In [31]:
# Print the distinct values of each numeric column, skipping the columns
# listed in `large`.
large = ['credit_score']
for col in train_policy.get_numeric():
    if col in large:
        continue
    col_values = train_policy._data[col].unique()
    print("{}: {}".format(col, col_values))

CAT_zone: [2. 4. 1. 5. 3.]
number_drivers: [2 1 4 3 5 6]
num_loaned_veh: [1 0 2 3]
num_owned_veh: [2 1 3]
num_leased_veh: [0 1 2]
total_number_veh: [3 4 6 2 5 7 1 8]
convert_ind: [0. 1.]


In [32]:
# Print the distinct values of each non-numeric column.
for col in train_policy.get_categoric():
    col_values = train_policy._data[col].unique()
    print("{}: {}".format(col, col_values))

Quote_dt: ['2015-01-28' '2018-09-03' '2016-05-18' ... '2017-06-08' '2017-04-27'
 '2017-10-02']
discount: ['Yes' 'No']
Home_policy_ind: ['Y' 'N']
state_id: ['NY' 'FL' 'MN' 'NJ' 'WI' 'CT' 'GA' 'AL']
quoted_amt: ['$5,153' '$3,090' '$14,917' ... '$6,669' '$271' '$8,428']
Prior_carrier_grp: ['Carrier_1' 'Carrier_4' 'Carrier_3' 'Carrier_5' 'Carrier_2' 'Carrier_8'
 'Carrier_6' 'Carrier_7' 'Other']
Cov_package_type: ['High' 'Medium' 'Low']
policy_id: ['policy_87209' 'policy_91413' 'policy_71845' ... 'policy_67016'
 'policy_30163' 'policy_63982']
primary_parking: ['home/driveway' 'unknown' 'parking garage' 'street']


In [33]:
# Collapse 'Quote_dt' to its year by keeping the first four characters of each
# date string. Dummy encoding is NOT done in this cell.
quote_year = train_policy._data['Quote_dt'].str[:4]
train_policy._data['Quote_dt'] = quote_year

# Last expression: display the distinct years that remain.
train_policy._data['Quote_dt'].unique()

array(['2015', '2018', '2016', '2017'], dtype=object)

In [34]:
# Collapse 'state_id' into a coarser region label. Dummy encoding is NOT done
# in this cell.
region = {
    'WI': 'midwest', 'MN': 'midwest',
    'FL': 'southeast', 'GA': 'southeast', 'AL': 'southeast',
    'NY': 'northeast', 'NJ': 'northeast', 'CT': 'northeast',
}

# Translate on the column itself so the result keeps the frame's row index.
# Building a fresh pd.Series from the looked-up values would give it a new
# 0..n-1 index; column assignment aligns on index labels, and this frame is
# indexed by the CSV's first column, so rows would receive another row's
# region (or NaN).
# replace() leaves values that are not dict keys untouched, so re-running the
# cell is a no-op rather than a KeyError.
train_policy._data['state_id'] = train_policy._data['state_id'].replace(region)

# Fail loudly if a state had no region assigned (the plain dict lookup used
# previously raised KeyError in that situation).
unmapped_states = set(train_policy._data['state_id'].dropna()) - set(region.values())
assert not unmapped_states, "states without a region: {}".format(unmapped_states)

In [35]:
# TODO: regex 'quoted_amt' to verify consistent format, convert to int (not implemented yet; values are still strings such as '$5,153')

In [36]:
# Binary encode 'discount' and 'Home_policy_ind' as 0/1 integers.
binary_codes = {
    'discount': {'No': 0, 'Yes': 1},
    'Home_policy_ind': {'N': 0, 'Y': 1},
}
for col, codes in binary_codes.items():
    # Already-encoded 0/1 values map to themselves, so re-running this cell
    # leaves the columns unchanged (a catch-all "else 1" would turn every row
    # into 1 on the second run).
    lookup = {**codes, 0: 0, 1: 1}
    encoded = train_policy._data[col].map(lookup)

    # map() yields NaN for anything not in the lookup; surface that instead of
    # silently encoding unexpected or missing values as 1.
    not_encoded = encoded.isnull()
    if not_encoded.any():
        unexpected = train_policy._data.loc[not_encoded, col].unique()
        raise ValueError("unexpected values in {!r}: {}".format(col, unexpected))

    train_policy._data[col] = encoded.astype('int64')

In [38]:
# Preview the first five rows of the wrapped frame.
train_policy._data.head(n=5)

Unnamed: 0,Quote_dt,discount,Home_policy_ind,state_id,quoted_amt,Prior_carrier_grp,credit_score,Cov_package_type,CAT_zone,policy_id,number_drivers,num_loaned_veh,num_owned_veh,num_leased_veh,total_number_veh,convert_ind,primary_parking
1,2015,1,1,northeast,"$5,153",Carrier_1,613.0,High,2.0,policy_87209,2,1,2,0,3,0.0,home/driveway
3,2018,0,0,southeast,"$3,090",Carrier_4,631.0,Medium,2.0,policy_91413,2,1,2,1,4,0.0,unknown
5,2016,0,0,midwest,"$14,917",Carrier_3,602.0,Medium,4.0,policy_71845,2,0,1,2,3,0.0,unknown
6,2016,0,0,southeast,"$4,620",Carrier_4,704.0,High,1.0,policy_29027,1,2,2,0,4,0.0,parking garage
7,2017,0,0,southeast,"$11,470",Carrier_5,611.0,High,4.0,policy_75562,4,2,2,2,6,0.0,home/driveway
