In [1]:
#example:  https://github.com/SauceCat/pydqc/blob/master/test/pydqc_test_on_Zillow.ipynb
#Import Pandas & numpy library
import pandas as pd
import numpy
#install pydqc from github "pip install git+https://github.com/SauceCat/pydqc.git"
#import infer_schema, data_summary, data_compare, data_consist data quality libraries
from pydqc import infer_schema, data_summary, data_compare, data_consist 

In [2]:
# Read the 2016 and 2017 property extracts into pandas DataFrames.
data_2016 = pd.read_csv('data/properties_2016.csv')
data_2017 = pd.read_csv('data/properties_2017.csv')

# Report (rows, columns) for each frame, one per line, then preview the
# first five rows of the 2016 frame as the cell's displayed result.
print(data_2016.shape, data_2017.shape, sep='\n')
data_2016.head(n=5)

(10886, 58)
(10901, 58)


Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015,239695.0,5725.17,,,


In [3]:
# Infer a schema for the 2016 table. sample_size=1.0 is used so that the
# inference is based on the full set of data; results are written under
# 'output/' using the 'properties_2016' file name.
infer_schema.infer_schema(
    data=data_2016,
    fname='properties_2016',
    output_root='output/',
    base_schema=None,
    sample_size=1.0,
    type_threshold=0.5,
    n_jobs=2,
)

In [4]:
# Load the inferred 2016 schema and preview its first five rows.
# Note: the schema file was converted from Excel to CSV by hand, and the
# data types in it were adjusted where appropriate, before this cell runs.
data_2016_schema = pd.read_csv('output/data_schema_properties_2016.csv')
data_2016_schema.head(n=5)

Unnamed: 0,column,type,include,sample_value,sample_num_uni,sample_uni_percentage,sample_min,sample_median,sample_max,sample_std
0,parcelid,numeric,1,"[13008151, 12445474, 12773517, 17142187, 11507...",10886,1.0,10717217.0,12211634.0,163187347.0,3088965.0
1,airconditioningtypeid,numeric,1,"[1.0, 1.0, 1.0, 1.0, 1.0]",4,0.00133,1.0,1.0,13.0,1.158609
2,architecturalstyletypeid,numeric,1,"[7.0, 7.0]",1,0.5,7.0,7.0,7.0,0.0
3,basementsqft,numeric,1,"[516.0, 216.0, 224.0, 516.0, 732.0]",8,1.0,216.0,577.5,782.0,198.9786
4,bathroomcnt,numeric,1,"[2.0, 2.0, 3.0, 2.0, 2.5]",20,0.00184,0.0,2.0,12.0,1.110438


In [5]:
# Infer a schema for the 2017 table, passing the 2016 schema as base_schema
# so the 2017 data can be checked against it. sample_size=0.1 here, unlike
# the 1.0 used for the 2016 run (presumably a 10% sample rate -- confirm
# against the pydqc documentation).
infer_schema.infer_schema(
    data=data_2017,
    fname='properties_2017_sample',
    output_root='output/',
    base_schema=data_2016_schema,
    sample_size=0.1,
    type_threshold=0.5,
    n_jobs=2,
)

In [6]:
# Data summary: build the summary report for the 2016 table from the
# (manually adjusted) 2016 schema.
# See "data_summary_properties_2016.xlsx" in the "output" folder.
data_summary.data_summary(
    table=data_2016,
    table_schema=data_2016_schema,
    fname='properties_2016',
    output_root='output/',
    sample_size=1.0,
    keep_images=False,
    n_jobs=2,
)

In [7]:
# Generate a Python notebook that computes the summary statistics for the
# 2016 table, using the same schema and output folder as the report above.
data_summary.data_summary_notebook(
    table=data_2016,
    table_schema=data_2016_schema,
    fname='properties_2016',
    output_root='output/',
)

In [8]:
# Compare against the 2017 data.
# First, generate the 2017 schema. It is inferred from data_2017 (not
# data_2016) so that the schema saved under the 'properties_2017' name
# actually describes the 2017 table used in the later comparison.
infer_schema.infer_schema(data=data_2017, fname='properties_2017', output_root='output/',
                          sample_size=1.0, type_threshold=0.5, n_jobs=2, base_schema=None)

In [9]:
# Load the 2017 schema, then compare the 2016 and 2017 tables using their
# respective schemas (see "data_compare_properties_2016.xlsx").
# NOTE(review): this reads a CSV; presumably the Excel schema output has to be
# converted to CSV by hand first, as was done for the 2016 schema -- confirm.
data_2017_schema = pd.read_csv('output/data_schema_properties_2017.csv')
data_compare.data_compare(
    table1=data_2016,
    schema1=data_2016_schema,
    table2=data_2017,
    schema2=data_2017_schema,
    fname='properties_2016',
    output_root='output/',
    sample_size=1.0,
    keep_images=False,
    n_jobs=2,
)