In [1]:
import os
import sys
# Put the /notebooks/ directory on the module search path (once) so that
# modules stored there can be imported by later cells.
directory_path = os.path.abspath('/notebooks/')
already_registered = any(entry == directory_path for entry in sys.path)
if not already_registered:
    sys.path.append(directory_path)

import pandas as pd
import numpy as np

In [2]:
# Read the policy CSV (its first column becomes the row index) and remove the
# columns listed in cols_to_drop in a single chained expression.
cols_to_drop = ['zip', 'county_name', 'Agent_cd']
policies_raw = (
    pd.read_csv('../data/policies.csv', index_col=0)
    .drop(columns=cols_to_drop)
)

In [3]:
class DataValidation:
    """Summarise column types and missing-value rates of a raw DataFrame.

    Attributes (all set in ``__init__``):
        raw_data: the DataFrame passed in; stored by reference, not copied.
        complete_data: placeholder, currently always ``None``.
        raw_catvars: sub-frame of ``raw_data`` holding the non-numeric columns.
        raw_numvars: sub-frame of ``raw_data`` holding the numeric columns.
    """

    def __init__(self, df):
        """Store ``df`` and split its columns into numeric and non-numeric.

        Args:
            df: pandas DataFrame to inspect.
        """
        self.raw_data = df
        self.complete_data = None  # implement this later
        # Anything that is not a numpy numeric dtype is treated as categorical.
        self.raw_catvars = self.raw_data.select_dtypes(exclude=[np.number])
        self.raw_numvars = self.raw_data.select_dtypes(include=[np.number])

    def get_categoric(self):
        """Return a numpy array of the non-numeric column names, in dataset order."""
        return self.raw_catvars.columns.values

    def get_numeric(self):
        """Return a numpy array of the numeric column names, in dataset order."""
        return self.raw_numvars.columns.values

    def get_missing(self):
        """Return the percentage of missing values for every column.

        Returns:
            DataFrame with one row per column of ``raw_data`` (in column
            order) and two columns: ``col`` (the column label) and
            ``pct_missing`` (share of null entries, 0-100, rounded to 5
            decimal places).
        """
        # The mean of the boolean null-mask is the fraction of missing entries;
        # computing it for the whole frame at once avoids a per-column Python
        # loop and does not rely on column labels being unique.
        pct_missing = self.raw_data.isnull().mean().mul(100).round(5)
        return pd.DataFrame({
            'col': pct_missing.index.tolist(),
            'pct_missing': pct_missing.to_numpy(),
        })

In [4]:
# Wrap the raw policies frame and print, in order: the non-numeric column
# names, the numeric column names, and the per-column missing percentages.
policy_data = DataValidation(policies_raw)
for summarise in (policy_data.get_categoric,
                  policy_data.get_numeric,
                  policy_data.get_missing):
    print(summarise())

['Quote_dt' 'discount' 'Home_policy_ind' 'state_id' 'quoted_amt'
 'Prior_carrier_grp' 'Cov_package_type' 'policy_id' 'split'
 'primary_parking']
['credit_score' 'CAT_zone' 'number_drivers' 'num_loaned_veh'
 'num_owned_veh' 'num_leased_veh' 'total_number_veh' 'convert_ind']
                  col  pct_missing
0            Quote_dt      0.00000
1            discount      0.00000
2     Home_policy_ind      0.00000
3            state_id      0.00000
4          quoted_amt      0.22782
5   Prior_carrier_grp     10.17046
6        credit_score      0.61023
7    Cov_package_type      1.56625
8            CAT_zone      0.50852
9           policy_id      0.00000
10     number_drivers      0.00000
11     num_loaned_veh      0.00000
12      num_owned_veh      0.00000
13     num_leased_veh      0.00000
14   total_number_veh      0.00000
15        convert_ind     25.00102
16              split      0.00000
17    primary_parking      0.00000


Verify that there are no duplicate policy ids before building the set of dropped ids.

In [5]:
# Tally how often each policy id occurs. dropna=False keeps missing ids in the
# tally, so several rows with a null policy_id are reported as duplicates
# instead of being silently ignored (value_counts drops NaN by default).
unique_id_counts = policy_data.raw_data['policy_id'].value_counts(dropna=False)
duplicates = [count for count in unique_id_counts if count != 1]
assert len(duplicates) == 0, (
    "duplicate policy ids found: "
    f"{unique_id_counts[unique_id_counts != 1].index.tolist()}"
)  # there are no duplicate ids

In [13]:
# Remove rows, then columns, that are entirely empty. dropna returns a new
# frame, so policy_data.raw_data itself is not modified.
df_missing_vals = policy_data.raw_data.dropna(how='all').dropna(how='all', axis=1)

# Boolean Series sharing df_missing_vals' index: True where the row has at
# least one missing value.
is_missing = df_missing_vals.isnull().any(axis=1)

# Collect the policy ids of the flagged rows. Selecting with the mask on the
# same frame keeps each flag paired with its own row; a hand-maintained row
# counter used as an index label is only correct when the index happens to be
# exactly 1..N and no rows were removed by the dropna above.
dropped_policy_ids = set(df_missing_vals.loc[is_missing, 'policy_id'])

Verify that the number of dropped policy ids equals the total number of rows containing missing values.

In [16]:
# Show how many rows are complete (False) vs. have a missing value (True).
is_missing_counts = np.unique(is_missing, return_counts=True)
print(is_missing_counts)

# Count the flagged rows directly: indexing is_missing_counts[1][1] raises
# IndexError whenever only one of False/True occurs in is_missing.
n_rows_with_missing = int(is_missing.sum())
assert len(dropped_policy_ids) == n_rows_with_missing, (
    f"collected {len(dropped_policy_ids)} policy ids "
    f"but {n_rows_with_missing} rows contain missing values"
)  # every flagged row contributed one distinct policy id

(array([False,  True]), array([32176, 16986]))
