# ADA development

- [x] data type detection.
- [x] format column name
- [x] extract datetime elements.
- [x] encapsular en una clase data para prepara el df input
    - [x] devolver datos por nombre de columna.
- [x] binning: bins equiespaciados (default 10).
- [ ] binning: bins por percentiles (default 10).
- [x] relationships iniciales.
- [ ] relationships a partir de una columna (recursivo).

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
# datasets
import seaborn
# relationships
import itertools

## dataset

In [2]:
df = seaborn.load_dataset('titanic')
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [3]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

## DATA: type of columns

In [4]:
from dateutil.parser import parse

class Data():
    def __init__(self, df):
        self.df = df
        self.columns = dict()
        # run
        self.__run()
        # clean
        del df
    
    # get column(s) data
    def get(self, column_name: 'str or list of str', clean = False):   
        if clean:
            return self.df[column_name].dropna().values
        else:
            return self.df[column_name].values

    # run
    def __run(self):
        # check datetime columns and format if there are
        self.__datetime_format()
        # other types
        num_cols = [col for col in self.df.select_dtypes(include=['float64']).columns.values if not col in self.columns['datetime']]
        disc_cols = [col for col in self.df.select_dtypes(include=['int64', 'bool']).columns.values if not col in self.columns['datetime']]
        cat_cols = [col for col in self.df.select_dtypes(include=['object', 'category']).columns.values if not col in self.columns['datetime']]
        # store types
        self.columns['numerical'] = num_cols
        self.columns['discrete'] = disc_cols
        self.columns['categorical'] = cat_cols
        # format column names
        self.__format_column_name()
        
    # datetime detection
    def __datetime_format(self):
        # parse datetime values
        def parse_dt(dt):
            try:
                return parse(dt)
            except:
                return np.nan
        # initialize
        datetime_cols = list()
        # loop of columns
        for col in self.df.columns.tolist():
            data = self.df[col].values
            dt = np.array(list(map(lambda dt: parse_dt(dt), data)))
            # check if column is datetime
            if len(dt)==len(list(filter(lambda v: v==v, dt))):
                print('"%s" is datetime'%col)
                # conversion if it is datetime
                self.df[col] = dt
                # store column name
                datetime_cols.append(col)
            else:
                pass
        # extract datetime elements
        for col in datetime_cols:
            self.data.df['%s_year'] = self.data.df.dt.year
            self.data.df['%s_month'] = self.data.df.dt.month
            self.data.df['%s_day'] = self.data.df.dt.day
            self.data.df['%s_hour'] = self.data.df.dt.hour
            self.data.df['%s_doy'] = self.data.df.dt.dayofyear
            self.data.df['%s_dow'] = self.data.df.dt.dayofweek
        # set type of columns 
        self.columns['datetime'] = datetime_cols
        
    # format column name
    def __format_column_name(self):
        dc = {'numerical':'N', 'discrete':'D', 'categorical':'C', 'datetime':'DT'}
        for col_type, col_names in self.columns.items():
            columns = list()
            for col in col_names:
                col_new = '%s_%s'%(dc[col_type], str.lower(col.replace(' ', '_')))
                columns.append(col_new)
                self.df.rename(columns = {col:col_new}, inplace = True)
            self.columns[col_type] = columns

In [5]:
data = Data(df.copy())

In [6]:
data.columns      

{'datetime': [],
 'numerical': ['N_age', 'N_fare'],
 'discrete': ['D_survived',
  'D_pclass',
  'D_sibsp',
  'D_parch',
  'D_adult_male',
  'D_alone'],
 'categorical': ['C_sex',
  'C_embarked',
  'C_class',
  'C_who',
  'C_deck',
  'C_embark_town',
  'C_alive']}

In [7]:
data.df.head()

Unnamed: 0,D_survived,D_pclass,C_sex,N_age,D_sibsp,D_parch,N_fare,C_embarked,C_class,C_who,D_adult_male,C_deck,C_embark_town,C_alive,D_alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## RELATIONSHIPS

In [16]:
## combinations without replacement
def combinations(l:list(), r:int=2)->list:
    return list(itertools.combinations(l,r=r))

## get bin labels
def get_bin_labels(bins):
    return ['%.3f-%.3f'%(bins[i], bins[i+1]) for i in range(len(bins[:-1]))]

## binning: equispaced bins
def num2cat_linspace(data:'obj', name:str, nbins:int = 10):
    # validate
    assert name in data.df.columns, 'column name is not available.'
    assert 'N_' in name, 'only possible for numerical column.'
    # get data
    X = data.get(name, clean = False)
    # create new column name
    name_new = name.replace('N_', 'C_linspace-%s_'%nbins)
    # calculate bins
    bins = np.linspace(np.nanmin(X), np.nanmax(X), nbins+1, endpoint=True)
    # get labels
    labels = get_bin_labels(bins) #np.arange(1,nbins+1,1)
    # binning
    data.df[name_new] = pd.cut(X, bins = bins, labels = labels)
    # include new column in dtypes
    data.columns['categorical'].append(name_new)
    return data

In [18]:
# numerical to categorical
for col_num in data.columns['numerical']:
    data = num2cat_linspace(data, col_num)
# initialize
drelationships = dict()
# univariate
drelationships['univariate'] = [col for col in data.df.columns.tolist() if not col in data.columns['datetime']]
# bivariate
drelationships['bivariate_N_N'] = combinations(data.columns['numerical'])
drelationships['bivariate_C_C'] = combinations(data.columns['categorical'] + data.columns['discrete'])
drelationships['bivariate_C_N'] = [(col_cat, col_num) for col_cat in (data.columns['categorical'] + data.columns['discrete']) for col_num in data.columns['numerical']]
# display
print('[info] num. of relationships:  univariage = %s / bivariate = %s'%(len(drelationships['univariate']), len(drelationships['bivariate_N_N'] + drelationships['bivariate_C_C'] + drelationships['bivariate_C_N'])))

[info] num. of relationships:  univariage = 17 / bivariate = 276


In [19]:
drelationships 

{'univariate': ['D_survived',
  'D_pclass',
  'C_sex',
  'N_age',
  'D_sibsp',
  'D_parch',
  'N_fare',
  'C_embarked',
  'C_class',
  'C_who',
  'D_adult_male',
  'C_deck',
  'C_embark_town',
  'C_alive',
  'D_alone',
  'C_linspace-10_age',
  'C_linspace-10_fare'],
 'bivariate_N_N': [('N_age', 'N_fare')],
 'bivariate_C_C': [('C_sex', 'C_embarked'),
  ('C_sex', 'C_class'),
  ('C_sex', 'C_who'),
  ('C_sex', 'C_deck'),
  ('C_sex', 'C_embark_town'),
  ('C_sex', 'C_alive'),
  ('C_sex', 'C_linspace-10_age'),
  ('C_sex', 'C_linspace-10_fare'),
  ('C_sex', 'C_linspace-10_age'),
  ('C_sex', 'C_linspace-10_fare'),
  ('C_sex', 'C_linspace-10_age'),
  ('C_sex', 'C_linspace-10_fare'),
  ('C_sex', 'C_linspace-10_age'),
  ('C_sex', 'C_linspace-10_age'),
  ('C_sex', 'C_linspace-10_fare'),
  ('C_sex', 'D_survived'),
  ('C_sex', 'D_pclass'),
  ('C_sex', 'D_sibsp'),
  ('C_sex', 'D_parch'),
  ('C_sex', 'D_adult_male'),
  ('C_sex', 'D_alone'),
  ('C_embarked', 'C_class'),
  ('C_embarked', 'C_who'),
  ('C_

# ANALYSIS UNIVARIATE