In [20]:
# import utility modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# load raw data -- each CSV's first column is used as the row index
policies_complete, drivers_complete, vehicles_complete = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/policies_complete.csv',
        '../data/drivers_complete.csv',
        '../data/vehicles_complete.csv',
    )
)

In [21]:
class DataValidation:
    """Wrap a DataFrame and expose simple data-quality summaries.

    On construction the columns are partitioned by dtype into a numeric
    subset (``raw_numvars``) and a non-numeric subset (``raw_catvars``).
    """

    def __init__(self, df):
        """Store ``df`` and split its columns into numeric / non-numeric.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataset to inspect. It is kept by reference (no copy is made).
        """
        self._data = df
        # every column whose dtype is not a NumPy numeric type counts as categorical
        self.raw_catvars = self._data.select_dtypes(exclude=[np.number])
        self.raw_numvars = self._data.select_dtypes(include=[np.number])

    def get_categoric(self):
        """Return a numpy array of the non-numeric column names, in dataset order."""
        return self.raw_catvars.columns.values

    def get_numeric(self):
        """Return a numpy array of the numeric column names, in dataset order."""
        return self.raw_numvars.columns.values

    def get_missing(self):
        """Return the percentage of missing values for every column.

        Returns
        -------
        pandas.DataFrame
            One row per column of the wrapped dataset, in dataset order, with
            columns ``col`` (the column label) and ``pct_missing`` (share of
            null entries, in percent, rounded to 5 decimal places).
        """
        # One vectorised pass over the whole frame instead of a Python loop.
        # Working on the frame (rather than self._data[label]) also keeps the
        # result correct when column labels are duplicated, because label
        # lookup would then return a DataFrame instead of a single Series.
        pct_missing = self._data.isnull().mean().mul(100).round(5)

        df_missing = pd.DataFrame({
            'col': list(pct_missing.index),
            'pct_missing': pct_missing.to_numpy(),
        })
        return df_missing

In [22]:
# wrap each raw table in a DataValidation helper (policies, drivers, vehicles)
policy_data, driver_data, vehicle_data = map(
    DataValidation,
    (policies_complete, drivers_complete, vehicles_complete),
)

In [29]:
# print the distinct values found in every non-numeric policy column
for col in policy_data.get_categoric():
    distinct_values = policy_data._data[col].unique()
    print(distinct_values)

['2015-01-28' '2018-09-03' '2016-05-18' ... '2017-06-08' '2017-04-27'
 '2017-10-02']
['Yes' 'No']
['Y' 'N']
['NY' 'FL' 'MN' 'NJ' 'WI' 'CT' 'GA' 'AL']
['$5,153' '$3,090' '$14,917' ... '$6,669' '$271' '$8,428']
['Carrier_1' 'Carrier_4' 'Carrier_3' 'Carrier_5' 'Carrier_2' 'Carrier_8'
 'Carrier_6' 'Carrier_7' 'Other']
['High' 'Medium' 'Low']
['policy_87209' 'policy_91413' 'policy_71845' ... 'policy_67016'
 'policy_30163' 'policy_63982']
['Train']
['home/driveway' 'unknown' 'parking garage' 'street']
