# Track record data quality report

Description: Report the quality of the data file
- Number of missing values
- Number of unique entries for discrete variables
- Basic statistics for continuous variables

#### === import libraries ===

In [None]:
# import libraries
import glob
import sys

import dateutil
# `import dateutil` alone does not load the `parser` submodule that the
# datetime analysis calls, so import it explicitly.
import dateutil.parser

sys.path.append(home_dir+'/lib')

In [None]:
data_source_dir = home_dir+'/ETL/vars/'

# select latest dated file
# The lookup uses glob rather than shelling out to `ls`, so no shell command
# is assembled from a path.  The pattern currently contains no wildcard, so
# at most one file can match; the last entry in sorted order is taken.
pattern = data_source_dir.rstrip('/') + '/input_file.csv'
candidates = sorted(glob.glob(pattern))
if not candidates:
    # fail here with the pattern in the message instead of passing a
    # non-path string on to read_csv
    raise IOError('no input file matches: %s' % pattern)
# end if
input_filepath = candidates[-1]
__nb_logger.info('reading file: %s' % input_filepath)

# actually read
df = pd.read_csv(input_filepath)

### === MAIN ===

#### parameters setting

In [None]:
# if number of filled entries is less than this ratio, remove the feature
# NOTE(review): `thr` is not referenced by any other cell visible in this
# notebook -- confirm whether the filtering step that uses it lives elsewhere.
thr = 0.01

### Report data quality

In [None]:
# Point `input_filepath` at a pipe-delimited text file and load it into `df`.
# Both names are rebound here, replacing whatever an earlier cell assigned.
input_filepath = 'mcs_claims_extraction_san.txt'
df = pd.read_csv(input_filepath, sep='|')

In [None]:
# number of leading rows of each column that are sampled when guessing the
# column's type
test_length = 200
# a sampled column with more than this many distinct non-null values is
# treated as continuous/datetime; otherwise it is treated as discrete
disc_thr = 20

In [None]:
# Start every variable category with its own, independent empty list.
cont_variables = []
disc_variables = []
dt_variables = []
empty_variables = []

In [None]:
# Guess a category for every column from the non-null values found in its
# first `test_length` rows.
dt_keywords = ['DATE', 'DT', 'TIME']
for colname in df.columns:
    sample = df[colname][:test_length].dropna().values

    if len(sample) == 0:
        # the sample holds nothing but missing values
        empty_variables.append(colname)
        continue
    # end if

    if len(set(sample)) <= disc_thr:
        # few distinct values in the sample
        disc_variables.append(colname)
        continue
    # end if

    # many distinct values: datetime when the (upper-cased) column name
    # contains one of the keywords, continuous otherwise
    upper_name = colname.upper()
    if any(keyword in upper_name for keyword in dt_keywords):
        dt_variables.append(colname)
    else:
        cont_variables.append(colname)
    # end if
# end for

In [None]:
# Show the automatic classification, one labelled list per category.
# The separators are written as print('') because, where `print` is a
# statement (Python 2), `print ()` prints an empty tuple "()" rather than a
# blank line; print('') yields a blank line under both interpreters.
print('The following are assumed to be discrete variables:')
print(disc_variables)
print('')
print('The following are assumed to be continuous variables:')
print(cont_variables)
print('')
print('The following are assumed to be datetime variables:')
print(dt_variables)
print('')
print('The following are likely to be empty:')
print(empty_variables)

In [None]:
# human judgments
# Hand-edited overrides: these assignments replace the automatically detected
# lists with the names written here.  A list left empty means the analysis
# loop over that category in the following cells has nothing to process.
disc_variables  = []
cont_variables  = []
empty_variables = []
dt_variables    = []
# filled by the continuous/datetime analysis loops with columns whose values
# could not be converted
other_variables = []

### Analyse discrete variables

In [None]:
# For every discrete column, count its non-null values with count_elements
# and store: the number of distinct entries in the result, the number of rows
# not covered by the counts, and the result itself.
disc_output = {}
n_rows = len(df)
for colname in disc_variables:
    __nb_logger.info('working on %s' % (colname))
    present = df[colname].dropna().values
    counter = count_elements(present, skip_na_test=True)
    n_counted = sum(counter.values())
    disc_output[colname] = {
        '__ucount__': len(counter),
        '__NA__'    : n_rows - n_counted,
        'counter'   : counter
    }
# end for

In [None]:
# Summary table for the discrete variables: one row per variable holding its
# missing-value count and its number of unique entries.
# Rows are built as real lists (a lazy `map` object cannot be used as column
# data on Python 3) and the column labels are passed to the constructor, so
# an empty `disc_output` yields an empty, correctly labelled frame instead of
# a length-mismatch error when renaming columns.
_summary_keys = ['__NA__', '__ucount__']
_names = sorted(disc_output)  # deterministic, alphabetical row order
_df = pd.DataFrame(
    [[disc_output[colname].get(key) for key in _summary_keys] for colname in _names],
    index=_names,
    columns=['missing_count', 'unique_count']
)
_df

### Analyse continuous variables

In [None]:
# For every continuous column, convert the non-null values to float and store
# the missing-value count together with simple_stats of the values.  Columns
# with a value that cannot be converted are recorded in `other_variables`
# and skipped.
cont_output = {}
for colname in cont_variables:
    __nb_logger.info('working on %s' % (colname))
    vals = df[colname].dropna().values
    try:
        # Build a real list: a lazy `map` object (Python 3) would postpone the
        # conversion until after this `try` and does not support len().
        vals = [float(v) for v in vals]
    except ValueError:
        other_variables.append(colname)
        continue
    # end try
    temp = {
        '__NA__': len(df) - len(vals),
        'stats' : simple_stats(vals)
    } # end temp
    cont_output[colname] = temp
# end for

In [None]:
# Summary table for the continuous variables: one row per variable with its
# statistics plus a `missing_count` column.
_df = pd.DataFrame({colname: cont_output[colname]['stats'] for colname in cont_output.keys()}).transpose()
# Collect the missing counts in the frame's own row order.  A list assignment
# is positional, and the constructor may order the dict keys differently from
# plain dict iteration, which would attach counts to the wrong variables.
NAs = [cont_output[colname]['__NA__'] for colname in _df.index]
_df['missing_count'] = NAs
_df

### Analyse datetime variables

In [None]:
# For every datetime column, parse the non-null values, reduce them to their
# year and store the missing-value count together with simple_stats of the
# years.  Columns with a value that cannot be parsed are recorded in
# `other_variables` and skipped.
dt_output = {}
for colname in dt_variables:
    __nb_logger.info('working on %s' % (colname))
    vals = df[colname].dropna().values
    try:
        vals = [dateutil.parser.parse(raw).year for raw in vals] # yearly basis
    except (ValueError, OverflowError, TypeError):
        # besides ValueError, parse() raises OverflowError for dates beyond
        # the platform's integer range and TypeError for non-string input;
        # flag the column in those cases too instead of aborting the loop
        other_variables.append(colname)
        continue
    # end try
    temp = {
        '__NA__': len(df) - len(vals),
        'stats' : simple_stats(vals)
    } # end temp
    dt_output[colname] = temp
# end for

In [None]:
# Summary table for the datetime variables: one row per variable with its
# statistics plus a `missing_count` column.
_df = pd.DataFrame({colname: dt_output[colname]['stats'] for colname in dt_output.keys()}).transpose()
# Collect the missing counts in the frame's own row order.  A list assignment
# is positional, and the constructor may order the dict keys differently from
# plain dict iteration, which would attach counts to the wrong variables.
NAs = [dt_output[colname]['__NA__'] for colname in _df.index]
_df['missing_count'] = NAs
_df

#### print warnings

In [None]:
# List the columns whose values could not be converted during the analysis.
# print is called with a single parenthesised argument so the cell produces
# the same output whether `print` is a statement or a function.
if other_variables:
    print('The following variables contain non-float values. Require attention.')
    print(other_variables)
# end if

### Aggregate all output 

In [None]:
# next time to excel
# Bundle the three per-category result dicts under fixed keys for export.
output = dict(
    discrete=disc_output,
    continuous=cont_output,
    datetime=dt_output
)

### Export to file

In [None]:
# specify output
# The file name embeds the value of utc_timestamp() truncated to an int and
# is placed under working_dir.
_stamp = int(utc_timestamp())
outfilename = working_dir + '/vars/data_summary.%s.pickle' % _stamp
__nb_logger.info('write to output: %s' % outfilename)

In [None]:
# Serialise `output` to `outfilename`.  The file is opened in binary mode,
# which pickle requires on Python 3, and inside a `with` block so the handle
# is flushed and closed even if dumping fails.
with open(outfilename, 'wb') as _out_file:
    pickle.dump(output, _out_file)
# end with