In [None]:
#libraries
import numpy as np
import pandas as pd
import os
import csv
import dataframe_image as dfi

#pandas display options: show every column when a dataframe is rendered
pd.options.display.max_columns = None

# Create folder for file exports

In [None]:
#Create the folder that all exports are written to.
#makedirs(exist_ok=True) is a no-op when the folder already exists, creates any
#missing parent folders, and avoids the gap between "check" and "create" that a
#separate os.path.exists() test leaves open.
outdir = './exports'
os.makedirs(outdir, exist_ok=True)

# Load Data

In [None]:
#Read the full medications dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="dataset.csv")

In [None]:
#Preview the first rows of the loaded data
df.head()

# Identify fully NULL columns and non-varying columns

### Working with fully NULL columns

In [None]:
#Flag each column by whether every one of its values is NULL
all_null_flags = df.isnull().all()
full_null_cols_df = (
    all_null_flags.to_frame()
    .reset_index()
    .rename(columns={'index': 'field name'})
)
#Keep only the rows (one per source column) that were flagged as entirely NULL
full_null_cols_df = full_null_cols_df[full_null_cols_df[0].eq(True)]

#Write the names of the entirely NULL columns to the exports folder
fullname = os.path.join(outdir, 'full_null_cols_list.csv')
full_null_cols_df['field name'].to_csv(fullname, index=False)

### Working with columns where all values are the same

In [None]:
#Identify columns where all values are the same
num_unique_vals_df = pd.DataFrame(df.nunique()).reset_index().rename(columns={'index': 'field name'})

#nunique() skips NaN, so a column holding one value plus missing entries also
#reports 1 distinct value. Such a column does vary (value vs. missing), so a
#column only counts as non-varying when it is also free of NULLs.
#Fully NULL columns report 0 distinct values and are therefore never selected here.
#Both results have one entry per df column, in df.columns order, so the flags
#line up positionally with the rows of num_unique_vals_df.
has_null_flags = df.isnull().any().values
non_varying_values_df = num_unique_vals_df[(num_unique_vals_df[0] == 1) & ~has_null_flags]

#Add column with the non-varying value
col_lst = non_varying_values_df['field name'].tolist()
col_unique_vals_lst = [df[col].unique().tolist() for col in col_lst]

unq_vals_col_df = pd.DataFrame(list(zip(col_lst, col_unique_vals_lst)), columns=['field name', 'unq_val'])

#Export list of non-varying columns together with their single value
fullname = os.path.join(outdir, 'nonvarying_cols_list.csv')
unq_vals_col_df.to_csv(fullname, index=False)

# Create completeness metrics table

In [None]:
#Remove columns (empty, or single values throughout)
null_and_nonvaryingvals_lst = full_null_cols_df['field name'].tolist() + non_varying_values_df['field name'].tolist()
df = df.loc[:, ~df.columns.str.contains('^Unnamed')] #remove unnamed column - redundant index value if exists
#errors='ignore': a listed column may already be gone - e.g. an all-NULL or constant
#'Unnamed: N' column removed by the line above, or every listed column when this cell
#is run a second time - and that must not raise a KeyError.
#Assigning the result (instead of inplace=True) keeps the cell safe to re-run.
df = df.drop(columns=null_and_nonvaryingvals_lst, errors='ignore')

In [None]:
#Count the records (rows) remaining in the dataframe and show the total
num_recs = len(df.index)
print(num_recs)

In [None]:
#Build a lookup of distinct (non-NULL) value counts, one row per remaining column
col_lst = df.columns.tolist()
col_unique_lst = [df[field].nunique() for field in col_lst]

#Index by field name so the counts can be joined onto the completeness table
unq_vals_col_df = pd.DataFrame(
    list(zip(col_lst, col_unique_lst)),
    columns=['field name', 'num_unq_vals'],
).set_index('field name')

In [None]:
#Display the distinct-value count per column
unq_vals_col_df

In [None]:
#Build the completeness table: one row per column with NULL / non-NULL counts,
#their share of all records, and the number of distinct values
compl_df = (
    df.isnull().sum(axis=0).to_frame(name='NULL_CNT')
    .assign(
        NON_NULL_CNT=lambda tbl: num_recs - tbl['NULL_CNT'],
        PERC_NULL=lambda tbl: tbl['NULL_CNT'] / num_recs,
        PERC_NON_NULL=lambda tbl: tbl['NON_NULL_CNT'] / num_recs,
    )
    .join(unq_vals_col_df)
)

#Export the completeness metrics table (field names are written as the index column)
fullname = os.path.join(outdir, 'completeness_tabular_results.csv')
compl_df.to_csv(fullname, index=True)

In [None]:
#Display the completeness metrics table
compl_df