In [1]:
# import utility modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# load raw data: one CSV per table, using the first CSV column as the index
policies_raw = pd.read_csv('../data/policies.csv', index_col=0)
drivers_raw = pd.read_csv('../data/drivers.csv', index_col=0)
# NOTE(review): this previously re-read drivers.csv, making vehicles_raw a copy
# of drivers_raw. The filename below is assumed to follow the policies/drivers
# naming pattern -- confirm it matches the file in ../data.
vehicles_raw = pd.read_csv('../data/vehicles.csv', index_col=0)

In [2]:
class DataValidation:
    """Thin wrapper around a raw DataFrame offering basic data-quality summaries.

    Attributes:
        raw_data: the DataFrame passed to the constructor (stored by reference).
        complete_data: always ``None`` for now; reserved for a later step.
        raw_catvars: the non-numeric columns of ``raw_data``.
        raw_numvars: the numeric columns of ``raw_data``.
    """

    def __init__(self, df):
        """Keep ``df`` and split its columns into numeric / non-numeric frames."""
        self.raw_data = df
        self.complete_data = None  # placeholder -- to be implemented later

        # np.number is the split criterion: everything that is not numeric is
        # treated as categorical.
        numeric_kinds = [np.number]
        self.raw_catvars = self.raw_data.select_dtypes(exclude=numeric_kinds)
        self.raw_numvars = self.raw_data.select_dtypes(include=numeric_kinds)

    def get_categoric(self):
        """Return a numpy array of the non-numeric column names, in dataset order."""
        categorical_columns = self.raw_catvars.columns
        return categorical_columns.values

    def get_numeric(self):
        """Return a numpy array of the numeric column names, in dataset order."""
        numeric_columns = self.raw_numvars.columns
        return numeric_columns.values

    def get_missing(self):
        """Return a DataFrame with the percentage of missing values per column.

        The result has one row per column of ``raw_data`` and two columns:
        ``col`` (the column name) and ``pct_missing`` (share of null entries,
        expressed as a percentage and rounded to 5 decimal places).
        """
        frame = self.raw_data
        names = list(frame.columns)

        # Mean of the boolean null-mask is the fraction of missing entries.
        percentages = [
            round(np.mean(frame[name].isnull()) * 100, 5)
            for name in names
        ]

        return pd.DataFrame(data={'col': names, 'pct_missing': percentages})

In [3]:
# wrap each raw table in its own DataValidation helper (same order as loaded)
policy_data, driver_data, vehicle_data = [
    DataValidation(raw_frame)
    for raw_frame in (policies_raw, drivers_raw, vehicles_raw)
]

In [5]:
# display the per-column percentage of missing values for driver_data
driver_data.get_missing()

Unnamed: 0,col,pct_missing
0,policy_id,0.0
1,gender,0.0
2,living_status,0.04704
3,age,0.0
4,safty_rating,0.07244
5,high_education_ind,0.51085


In [None]:
# display the per-column percentage of missing values for vehicle_data
vehicle_data.get_missing()

Membership Constraints:
- ~~state id must be within the set of ratified US states~~ 

Range Constraints:
- credit score must be within standard range (i.e., [300, 850])

Cross-validation Constraints:
- ~~county name must be valid with respect to state id~~

Regular Expressions:
- ~~zip code must follow standard US format~~
- policy ID may follow a certain format
- date must follow a valid format that is consistent
- quoted amount must follow conventional monetary format
    + data type of quote amount may be altered to a more usable format

Uniqueness Constraints:
- customer identifiers must be unique
- all observations must be unique

Data-Type Constraints:
- credit score should be int
- all features should have consistent data-types
- Y/N values should be encoded consistently across the dataset
- all quote amounts must be in integer form

In [4]:
# data-type constraints
# data_types_dict = {'credit_score': int}
# policies_raw = policies_raw.astype(data_types_dict)

# membership constraints

# range constraints
# policies_raw = policies_raw[(policies_raw['credit_score'] >= 300) & (policies_raw['credit_score'] <= 850)]
# print("size after range constraints : {}".format(policies_raw.shape[0]))
