In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the train and test policy tables; the first CSV column becomes the row index.
train_policies_complete = pd.read_csv(
    filepath_or_buffer='../data/train_policies_complete.csv',
    index_col=0,
)
test_policies_complete = pd.read_csv(
    filepath_or_buffer='../data/test_policies_complete.csv',
    index_col=0,
)

In [None]:
class CleanData:
    """Wrap a policy DataFrame and group / encode some of its raw columns.

    The frame passed to the constructor is stored by reference in ``_data``
    (no copy is made), so ``clean_data`` modifies that same DataFrame.
    """

    # state abbreviation -> coarse region used when grouping 'state_id'
    _STATE_REGIONS = {
        'WI': 'midwest', 'MN': 'midwest',
        'FL': 'southeast', 'GA': 'southeast', 'AL': 'southeast',
        'NY': 'northeast', 'NJ': 'northeast', 'CT': 'northeast',
    }

    def __init__(self, df):
        """Store ``df`` and record its non-numeric / numeric column split.

        The split is computed once, here. ``get_categoric`` and ``get_numeric``
        keep reporting these original columns even after ``clean_data`` or
        later reassignment of ``_data``.
        """
        self._data = df
        self.raw_catvars = self._data.select_dtypes(exclude=[np.number])
        self.raw_numvars = self._data.select_dtypes(include=[np.number])

    # returns numpy array of categorical variables as they appear in dataset
    def get_categoric(self):
        """Return the names of the non-numeric columns found at construction."""
        return self.raw_catvars.columns.values

    # returns numpy array of numerical variables as they appear in dataset
    def get_numeric(self):
        """Return the names of the numeric columns found at construction."""
        return self.raw_numvars.columns.values

    def credit_level(self, score):
        """Map a numeric credit score to a rating band.

        Bands: below 630 -> 'Bad', 630-689 -> 'Fair', 690-719 -> 'Good',
        720 and above -> 'Excellent'. The bands are contiguous, so fractional
        scores such as 629.5 fall into the lower band, and scores below 300
        are rated 'Bad'. A missing score (NaN / None) is returned as NaN
        instead of being rated.
        """
        if pd.isna(score):
            return np.nan
        if score < 630:
            return 'Bad'
        if score < 690:
            return 'Fair'
        if score < 720:
            return 'Good'
        return 'Excellent'

    # doesn't really "clean data" atm, just groups variables
    def clean_data(self):
        """Group and encode columns of the wrapped frame, in place.

        Expects the columns 'credit_score', 'Quote_dt', 'state_id',
        'quoted_amt', 'discount' and 'Home_policy_ind' to be present.
        Not re-runnable: a second call on the same frame would try to rate
        the already-grouped 'credit_score' labels and fail.

        Raises:
            KeyError: if 'state_id' contains a non-missing value that has no
                region mapping.
        """
        # group 'credit_score' by rating
        # Series.map keeps the frame's own index. Wrapping the results in a new
        # pd.Series would create a 0..n-1 index, and column assignment aligns on
        # index labels, so a frame with any other index would get NaN / shifted rows.
        self._data['credit_score'] = self._data['credit_score'].map(self.credit_level)

        # group 'Quote_dt' by year (first four characters of the date string)
        self._data['Quote_dt'] = self._data['Quote_dt'].str[:4]

        # group 'state_id' by region; missing states stay missing, unknown ones are an error
        states = self._data['state_id']
        unknown = sorted(set(states.dropna().unique()) - set(self._STATE_REGIONS), key=str)
        if unknown:
            raise KeyError(f"no region mapping for state_id value(s): {unknown}")
        self._data['state_id'] = states.map(self._STATE_REGIONS)

        # regex 'quoted_amt' to float (strip '$' and thousands separators)
        self._data['quoted_amt'] = self._data['quoted_amt'].replace(r'[\$,]', '', regex=True).astype(float)

        # binary encode 'discount' and 'Home_policy_ind'
        # NOTE: anything other than 'No' / 'N' (including a missing value) encodes as 1
        self._data['discount'] = self._data['discount'].apply(lambda x: 0 if x == 'No' else 1)
        self._data['Home_policy_ind'] = self._data['Home_policy_ind'].apply(lambda x: 0 if x == 'N' else 1)


In [None]:
# Wrap each raw frame in its own CleanData helper; the frames are held by reference, not copied.
train_policy = CleanData(df=train_policies_complete)
test_policy = CleanData(df=test_policies_complete)

In [None]:
# Group / encode the training frame's columns. This rewrites columns of the wrapped
# DataFrame in place, so this cell is not re-runnable on an already-cleaned frame:
# reload the data (or restart and run all) before executing it again.
train_policy.clean_data()

In [None]:
# One-hot encode the listed categorical columns; the encoded frame replaces the wrapped one.
train_policy._data = pd.get_dummies(
    data=train_policy._data,
    columns=['Quote_dt', 'state_id', 'Prior_carrier_grp'],
)

In [None]:
# print(train_policy.get_numeric())
# large = ['credit_score']
# for col in train_policy.get_numeric():
#     if col not in large:
#         print("{}: {}".format(col, train_policy._data[col].unique()))

In [None]:
# for col in train_policy.get_categoric():
#     print("{}: {}".format(col, train_policy._data[col].unique()))