In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>")) # sets width of notebook cell
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the raw bureau_balance table; the relative path is resolved against the
# kernel's current working directory.
bureau_bal = pd.read_csv('../data/raw/bureau_balance.csv')

In [None]:
# Preview the first rows of the loaded table.
bureau_bal.head()

In [None]:
def missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing data for every column of a DataFrame.
        Input:
            df: DataFrame to inspect
        Output:
            DataFrame indexed by the column names of df, ordered by
            descending NaN count, with two columns:
                'Count'     - number of NaN values in the column
                '% of null' - that count as a percentage of the row count
    """
    total_rows = len(df)
    null_counts = df.isna().sum().sort_values(ascending=False)
    null_share = null_counts / total_rows * 100
    # keys= labels the two concatenated Series as the output column names.
    return pd.concat([null_counts, null_share], axis=1, keys=['Count', '% of null'])

def filter_columns(df: pd.DataFrame, excl_dtypes: list, excl_columns: list) -> list:
    """Select the column names of df that survive both exclusion lists.
        Inputs:
            df: pandas dataframe
            excl_dtypes:  dtypes to drop; each column's dtype is tested for
                          membership in this list (e.g. 'float64', 'int64')
            excl_columns: column names to drop regardless of dtype
        Outputs:
            python list of the remaining column names, in df's column order
    """
    kept = []
    for name in df.columns:
        # The dtype test runs first, then the explicit name exclusion.
        if df[name].dtype in excl_dtypes:
            continue
        if name in excl_columns:
            continue
        kept.append(name)
    return kept

def unique_values(arr: list, df: pd.DataFrame) -> dict:
    """Collect the distinct values of each requested column.
        Inputs:
            arr: list of column names to inspect
            df: dataframe that holds those columns
        Outputs:
            dict: maps column_name -> (number_of_distinct_values, list_of_distinct_values)

        Note: the count comes from Series.nunique(), which by default leaves
        NaN out, while the list comes from Series.unique(), which keeps it;
        for a column containing NaN the list is one longer than the count.
    """
    return {
        name: (df[name].nunique(), df[name].unique().tolist())
        for name in arr
    }

In [None]:
# Per-column NaN count and percentage for the loaded table.
missing_values(bureau_bal)

In [None]:
# List the distinct values of every non-numeric column of bureau_bal.

# Columns whose dtype matches one of these are skipped.
dtypes_filter = ['float64','int64']
# Column names that are skipped regardless of dtype.
columns_filter = ['SK_ID_CURR', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH']

# Filter and inspect the same frame: bureau_bal is the only table loaded in the
# cells above, so selecting columns from any other name fails on a fresh kernel.
filtered_cols = filter_columns(bureau_bal, dtypes_filter, columns_filter)
non_continuous = unique_values(filtered_cols, bureau_bal)

# Each value is a (distinct_count, distinct_values) pair.
for col, val in non_continuous.items():
    print('Column {} has {} distinct values.\n{}'.format(col,val[0],val[1]))
    print('-'*200)