In [147]:
import pandas as pd
import numpy as np

In [148]:
# Load the train/test policy tables; the first CSV column is used as the row index.
train_policies_complete = pd.read_csv(
    '../data/train_policies_complete.csv',
    index_col=0,
)
test_policies_complete = pd.read_csv(
    '../data/test_policies_complete.csv',
    index_col=0,
)

In [149]:
class CleanData:
    """Wrap a policy DataFrame and encode its columns in place on a private copy.

    The frame passed to the constructor is copied, so the caller's DataFrame is
    never modified; the working copy is exposed as ``_data``.
    ``raw_catvars`` / ``raw_numvars`` are snapshots of the non-numeric / numeric
    columns taken at construction time, i.e. before ``clean_data`` changes any
    dtypes.
    """

    def __init__(self, df):
        # Copy so that clean_data() cannot silently rewrite the caller's frame.
        self._data = df.copy()
        # Set once clean_data() has run; makes repeated calls harmless.
        self._cleaned = False
        self.raw_catvars = self._data.select_dtypes(exclude=[np.number])
        self.raw_numvars = self._data.select_dtypes(include=[np.number])

    # returns numpy array of categorical variables as they appear in dataset
    def get_categoric(self):
        return self.raw_catvars.columns.values

    # returns numpy array of numerical variables as they appear in dataset
    def get_numeric(self):
        return self.raw_numvars.columns.values

    def credit_level(self, score):
        """Map a credit score to an ordinal rating.

        Returns 1 ('Bad') below 630, 2 ('Fair') for 630-689, 3 ('Good') for
        690-719 and 4 ('Excellent') from 720 upwards. The bands are contiguous,
        so fractional scores such as 629.5 land in the lower band, and scores
        under 300 are rated 'Bad' instead of falling through to 'Excellent'.
        A missing score is returned as NaN rather than being rated.
        """
        if pd.isna(score):
            return np.nan
        if score < 630:
            return 1  # 'Bad'
        if score < 690:
            return 2  # 'Fair'
        if score < 720:
            return 3  # 'Good'
        return 4  # 'Excellent'

    @staticmethod
    def _ordinal_encode(column, mapping):
        """Replace each label in ``column`` by its integer code from ``mapping``.

        Raises ValueError naming the offending labels when the column holds a
        value (including a missing one) that ``mapping`` does not cover.
        """
        encoded = column.map(mapping)
        unknown = column[encoded.isna()].unique()
        if len(unknown):
            raise ValueError(
                f"unexpected values in '{column.name}': {list(unknown)}"
            )
        return encoded.astype('int')

    # doesn't really "clean data" atm, just groups variables
    def clean_data(self):
        """Group and encode the columns of ``_data``; returns None.

        The transformations are only valid on raw data (applying them twice
        would, for example, turn every credit rating into 4), so a second call
        on the same instance is a no-op.
        """
        if getattr(self, '_cleaned', False):
            return

        data = self._data

        # group 'credit_score' by rating
        data['credit_score'] = data['credit_score'].apply(self.credit_level)

        # group 'Quote_dt' by year (first four characters of the date string)
        data['Quote_dt'] = data['Quote_dt'].str[:4]

        # group 'state_id' by region; states not listed here are left unchanged
        region = {'WI': 'midwest', 'MN': 'midwest',
                  'FL': 'southeast', 'GA': 'southeast', 'AL': 'southeast',
                  'NY': 'northeast', 'NJ': 'northeast', 'CT': 'northeast'}
        data['state_id'] = data['state_id'].replace(region)

        # strip '$' and ',' from 'quoted_amt' and convert to float
        # (raw string: '\$' is an invalid escape in a normal string literal)
        data['quoted_amt'] = data['quoted_amt'].replace(r'[\$,]', '', regex=True).astype(float)

        # binary encode 'discount' and 'Home_policy_ind'
        # (anything other than 'No' / 'N' is encoded as 1)
        data['discount'] = data['discount'].ne('No').astype(int)
        data['Home_policy_ind'] = data['Home_policy_ind'].ne('N').astype(int)

        # cast 'CAT_zone' data to int
        data['CAT_zone'] = data['CAT_zone'].astype('int')

        # ordinally encode 'Cov_package_type' and 'primary_parking'
        cov_map = {'Low': 1, 'Medium': 2, 'High': 3}
        park_map = {'home/driveway': 1, 'parking garage': 2, 'street': 3, 'unknown': 4}
        data['Cov_package_type'] = self._ordinal_encode(data['Cov_package_type'], cov_map)
        data['primary_parking'] = self._ordinal_encode(data['primary_parking'], park_map)

        self._cleaned = True
        


In [150]:
# Wrap the train and test frames in their own CleanData instances.
train_policy, test_policy = (
    CleanData(raw_frame)
    for raw_frame in (train_policies_complete, test_policies_complete)
)

In [151]:
# Apply the grouping/encoding step to the training wrapper, then preview its first rows.
train_policy.clean_data()
train_policy._data.head(n=5)

Unnamed: 0,Quote_dt,discount,Home_policy_ind,state_id,quoted_amt,Prior_carrier_grp,credit_score,Cov_package_type,CAT_zone,policy_id,number_drivers,num_loaned_veh,num_owned_veh,num_leased_veh,total_number_veh,convert_ind,primary_parking
1,2015,1,1,northeast,5153.0,Carrier_1,Bad,3,2,policy_87209,2,1,2,0,3,0.0,1
3,2018,0,0,northeast,3090.0,Carrier_4,Fair,2,2,policy_91413,2,1,2,1,4,0.0,4
5,2016,0,0,northeast,14917.0,Carrier_3,Bad,2,4,policy_71845,2,0,1,2,3,0.0,4
6,2016,0,0,southeast,4620.0,Carrier_4,Good,3,1,policy_29027,1,2,2,0,4,0.0,2
7,2017,0,0,northeast,11470.0,Carrier_5,Bad,3,4,policy_75562,4,2,2,2,6,0.0,1


In [153]:
# One-hot encode the remaining nominal columns. Only columns that are still
# present are encoded, so re-running this cell leaves the frame unchanged
# instead of raising KeyError on the already-replaced columns.
nominal_cols = [col for col in ('Quote_dt', 'state_id', 'Prior_carrier_grp')
                if col in train_policy._data.columns]
if nominal_cols:
    train_policy._data = pd.get_dummies(train_policy._data, columns=nominal_cols)
train_policy._data.head()