In [193]:
import pandas as pd
import numpy as np

In [194]:
# Load the raw train/test policy tables; the first CSV column becomes the row index.
train_policies_complete, test_policies_complete = (
    pd.read_csv('../data/train_policies_complete.csv', index_col=0),
    pd.read_csv('../data/test_policies_complete.csv', index_col=0),
)

In [195]:
class CleanData:
    def __init__(self, df):
        self._data = df
        self.raw_catvars = self._data.select_dtypes(exclude=[np.number])
        self.raw_numvars = self._data.select_dtypes(include=[np.number])
    
    # returns numpy array of categorical variables as they appear in dataset
    def get_categoric(self):
        return self.raw_catvars.columns.values

    # returns numpy array of numerical variables as they appear in dataset
    def get_numeric(self):
        return self.raw_numvars.columns.values

    def credit_level(self, score):
            if score >= 300 and score <= 629:
                return 1 # 'Bad'
            elif score >= 630 and score <= 689:
                return 2 # 'Fair'
            elif score >= 690 and score <= 719:
                return 3 # 'Good'
            else:
                return 4 # 'Excellent'

    # doesn't really "clean data" atm, just groups variables 
    def clean_data(self):
        # group 'credit_score' by rating
        self._data['credit_score'] = self._data['credit_score'].apply(self.credit_level)
        
        # group 'Quote_dt' by year
        self._data['Quote_dt'] = self._data['Quote_dt'].str[:4]

        # group 'state_id' by region
        region = {'WI':'midwest', 'MN':'midwest', 'FL':'southeast', 'GA':'southeast', 'AL':'southeast', 
                    'NY':'northeast', 'NJ':'northeast', 'CT':'northeast'}
        self._data['state_id'] = self._data['state_id'].replace(region)

        # regex 'quoted_amt' to float
        self._data['quoted_amt'] = self._data['quoted_amt'].replace('[\$,]', '', regex=True).astype(float)

        # binary encode 'discount' and 'Home_poilcy_ind'
        self._data['discount'] = self._data['discount'].apply(lambda x: 0 if x == 'No' else 1)
        self._data['Home_policy_ind'] = self._data['Home_policy_ind'].apply(lambda x: 0 if x == 'N' else 1)

        # cast 'Cat_zone' data to int
        self._data['CAT_zone'] = self._data['CAT_zone'].astype('int')

        # ordinally encode 'Cov_package_type' and 'primary_parking'
        cov_map = {'Low':1, 'Medium':2, 'High':3}
        park_map = {'home/driveway':1, 'parking garage':2, 'street':3, 'unknown':4}

        self._data['Cov_package_type'] = self._data['Cov_package_type'].replace(cov_map)
        self._data['primary_parking'] = self._data['primary_parking'].replace(park_map)
        self._data['Cov_package_type'] = self._data['Cov_package_type'].astype('int')
        self._data['primary_parking'] = self._data['primary_parking'].astype('int')

        # set 'policy_id' as row name/index
        self._data = self._data.set_index('policy_id')
    
    def make_dummies(self):
        dummy_cols = ['Quote_dt', 'state_id', 'Prior_carrier_grp']
        self._data = pd.get_dummies(self._data, columns=dummy_cols)

In [196]:
# Wrap each raw policy table in its own CleanData helper.
train_policy, test_policy = (
    CleanData(train_policies_complete),
    CleanData(test_policies_complete),
)

In [197]:
# Group/encode both datasets first, then expand the categorical columns into indicators.
# NOTE(review): train and test are encoded independently, so their indicator columns may differ.
for policy in (train_policy, test_policy):
    policy.clean_data()
for policy in (train_policy, test_policy):
    policy.make_dummies()

In [198]:
# Preview the first five cleaned training rows.
train_policy._data.head(n=5)

Unnamed: 0_level_0,discount,Home_policy_ind,quoted_amt,credit_score,Cov_package_type,CAT_zone,number_drivers,num_loaned_veh,num_owned_veh,num_leased_veh,...,state_id_southeast,Prior_carrier_grp_Carrier_1,Prior_carrier_grp_Carrier_2,Prior_carrier_grp_Carrier_3,Prior_carrier_grp_Carrier_4,Prior_carrier_grp_Carrier_5,Prior_carrier_grp_Carrier_6,Prior_carrier_grp_Carrier_7,Prior_carrier_grp_Carrier_8,Prior_carrier_grp_Other
policy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
policy_87209,1,1,5153.0,1,3,2,2,1,2,0,...,0,1,0,0,0,0,0,0,0,0
policy_91413,0,0,3090.0,2,2,2,2,1,2,1,...,0,0,0,0,1,0,0,0,0,0
policy_71845,0,0,14917.0,1,2,4,2,0,1,2,...,0,0,0,1,0,0,0,0,0,0
policy_29027,0,0,4620.0,3,3,1,1,2,2,0,...,1,0,0,0,1,0,0,0,0,0
policy_75562,0,0,11470.0,1,3,4,4,2,2,2,...,0,0,0,0,0,1,0,0,0,0


In [199]:
# Write both cleaned tables next to the raw inputs (the policy_id index is written as the first column).
for cleaned, destination in (
    (train_policy._data, '../data/train_policies_clean.csv'),
    (test_policy._data, '../data/test_policies_clean.csv'),
):
    cleaned.to_csv(destination)