### Driver code for data profiling

In [0]:
%run ./Data_profiling_sample

In [0]:
# Explicit schema for the sample data: eleven nullable columns, where id, age
# and income are integers and every other column is a string.
df_schema = StructType([
    StructField(col_name, col_type(), True)
    for col_name, col_type in (
        ("id", IntegerType),
        ("first_name", StringType),
        ("last_name", StringType),
        ("gender", StringType),
        ("age", IntegerType),
        ("education", StringType),
        ("occupation", StringType),
        ("income", IntegerType),
        ("street_address", StringType),
        ("city", StringType),
        ("state", StringType),
    )
])

In [0]:
# Location of the sample CSV handed to create_df.
path = 'dbfs:/FileStore/tables/data_profilng/sample_data.csv'

# Columns passed to get_null_perc (NULL/empty percentage check).
null_cols = ['id', 'first_name', 'last_name', 'gender', 'age', 'education',
             'occupation', 'income', 'street_address', 'city', 'state']

# Columns passed to get_summary_numeric (count, mean, stddev, percentiles).
numeric_cols = ['age', 'income']

# Columns passed to get_distinct_counts and get_distribution_counts.
aggregate_cols = ['occupation', 'state']

# Not referenced by any cell in this notebook; presumably read by the helpers
# loaded via %run to cap result sizes -- TODO confirm.
result_limit = 100

# Column -> regex that a well-formed value must match; passed to
# get_mismatch_perc.
#   age:        one or two digits. A character class cannot express the numeric
#               range 0-99, so `[0-99]` was only `[0-9]` with a repeated 9;
#               the plain digit class says the same thing without misleading.
#   first_name: ASCII letters only (an empty string also matches).
#   gender:     exactly "M", "Male", "F" or "Female" (case-sensitive).
data_quality_cols_regex = {
    'age': '^[0-9]{1,2}$',
    'first_name': '^[a-zA-Z]*$',
    'gender': '^M(ale)?$|^F(emale)?$',
}



In [0]:
# Build the working DataFrame and preview it.
# create_df comes from the %run'd helper notebook; presumably it reads the file
# at `path` using `df_schema` -- confirm against that notebook.
print("sample dataframe")
df = create_df(df_schema, path)
display(df)

sample dataframe


id,first_name,last_name,gender,age,education,occupation,income,street_address,city,state
1,Madaline,Murphy,female,24.0,Doctoral,Botanist,116989,384-7572 Risus Ave,New York City,New York
2,Kristian,Stewart,Male,24.0,Upper secondary,Salesman,45736,Ap #139-5202 Donec Ave,Los Angeles,
3,Aston,Baker,Male,18.0,Master,Photographer,185036,Ap #536-5718 Orci Rd.,Chicago,Illinois
4,Brooke,Holmes,Female,30.0,Primary,Social Worker,66673,"P.O. Box 961, 1525 Eu Rd.",Houston,Texas
5,Jenna,Crawford,Female,28.0,Doctoral,Actor,93689,"P.O. Box 940, 7197 Cursus, St.",Phoenix,
6,Dale,Williams,Male,24.0,Primary,Dancer,183076,"P.O. Box 351, 3083 Nulla Ave",,Pennsylvania
7,Garry,Hall,Male,20.0,Master,Actor,75410,"P.O. Box 659, 6221 Ligula. Road",San Antonio,Texas
8,Darcy,Barnes,Female,30.0,Doctoral,Lawer,155934,"P.O. Box 375, 3299 Velit. Av.",San Diego,California
9,Reid,Sullivan,Male,22.0,Master,Astronomer,55463,"P.O. Box 351, 860 A, Rd.",Dallas,Texas
10,Daniel,Bailey,Male,,Master,Singer,122878,5847 Eget Av.,Austin,Texas


In [0]:
# Check 1 - NULL/empty values.
# The displayed result has one row per column of `null_cols` with its
# NullPercentage.
print("NULL/Empty Percentage for Columns")
null_perc_df = get_null_perc(df, null_cols)
display(null_perc_df)

NULL/Empty Percentage for Columns


Column,NullPercentage
id,0.0%
first_name,0.0%
last_name,0.0%
gender,0.0%
age,20.0%
education,0.0%
occupation,0.0%
income,0.0%
street_address,0.0%
city,17.5%


In [0]:
# Check 2 - numeric summary.
# Count, mean, standard deviation, min/max and quartiles for every column in
# `numeric_cols`.
print("Summary for Numeric Columns")
sum_numeric_df = get_summary_numeric(df, numeric_cols)
display(sum_numeric_df)

Summary for Numeric Columns


summary,age,income
count,32.0,40.0
mean,26.3125,117556.55
stddev,26.182285934849023,47448.1794122377
min,-21.0,40433.0
25%,20.0,74585.0
50%,25.0,116989.0
75%,28.0,156289.0
max,155.0,199284.0


In [0]:
# Check 3 - distinct counts.
# Number of distinct values found in each column of `aggregate_cols`.
print("Distinct Counts for Aggregate Columns")
dis_count_df = get_distinct_counts(df, aggregate_cols)
display(dis_count_df)

Distinct Counts for Aggregate Columns


Column,DistinctCount
occupation,27
state,26


In [0]:
# Check 4 - value distributions.
# get_distribution_counts returns an iterable of DataFrames; each one is shown
# under a heading taken from its first column name.
print("Distribution Count for Aggregate Columns")
distribute_count = get_distribution_counts(df, aggregate_cols)
for distribution_df in distribute_count:
    heading = "Distribution for - " + distribution_df.columns[0]
    print(heading)
    display(distribution_df)

Distribution Count for Aggregate Columns
Distribution for - occupation


occupation,count
Lawer,4
Singer,3
Astronomer,3
Social Worker,3
Police Officer,2
Historian,2
Dancer,2
Actor,2
Driver,1
Florist,1


Distribution for - state


state,count
Texas,6
California,5
,2
North Carolina,2
Arizona,2
Tennessee,2
Colorado,2
Ohio,1
Oregon,1
Pennsylvania,1


In [0]:
# Check 5 - data quality.
# Per-column mismatch percentage, computed from the column -> regex mapping in
# `data_quality_cols_regex`.
print("Data Quality Issue Percentage for Columns")
mismatch_perc_df = get_mismatch_perc(spark, df, data_quality_cols_regex)
display(mismatch_perc_df)

Data Quality Issue Percentage for Columns


Column,MismatchPercentage
age,7.5%
first_name,0.0%
gender,7.5%
