# ADA (Automatic Data Analysis) - Univariate


## Infraestructura

- [x] Identificacion de tipo de datos de columnas, renombrado de columnas, factorizacion de valores categoricos.
- [x] Creacion de lista de todas las combinaciones posibles de *queries* con variables categoricas para ser usadas por `df.query(squery)`.
- [x] Estimacion del numero minimo de registros para que se pueda hacer un analisis estadistico con significancia suficiente.
- [ ] Transformacion de variables numericas en categoricas.

## Analisis univariantes

### num 

- [ ] es gaussiana.
- [ ] hay separabilidad. 
    - [ ] univariante: clustering 1D, medidas de informacion.
    - [ ] a traves de variable categorica.

### cat

- [ ] hay separabilidad. 


### num - num

- [ ] correlacion.
- [ ] F-test vs mutual information.
- [ ] misma distribucion.


### cat - cat

- [ ] es la misma distribucion.
- [ ] tabla de contingencia.


In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import itertools
import math
import scipy.stats as st

## function

In [2]:
# CALCULATE Z VALUE
def get_z(confidence_level:float)->float:
    """
    Return the two-sided z value of the standard normal distribution.

    confidence_level -- confidence level expressed in percent (e.g. 95.0).
    return -- z value: the (1 - alpha/2) quantile of the standard normal
              distribution, with alpha = 1 - confidence_level/100.
    """
    # probability left outside the confidence level (both tails together)
    alpha = 1-confidence_level/100.
    # half of alpha sits in each tail, hence the (1 - alpha/2) quantile
    return st.norm.ppf(1-alpha/2)


# CALCULATE THE SAMPLE SIZE
def sample_size(population_size:int, confidence_level:float, confidence_interval:float):
    """
    Estimate the required sample size with Cochran's sample size formula.

    population_size -- the total population size.
    confidence_level -- the selected confidence level in percent.
    confidence_interval -- the selected confidence interval in percent
                           (used as the margin of error of the formula).
    return -- sample size, rounded up, after the correction for a finite
              population; -1 when the z value of the confidence level is 0.
    """
    # margin of error as a fraction
    margin = confidence_interval/100.0

    # number of standard deviations for that confidence level
    z_value = get_z(confidence_level)
    if z_value == 0.0:
        return -1

    # worst-case proportion: p * (1 - p) is largest at p = 0.5
    proportion = 0.5

    # sample size before taking the population size into account
    unbounded = ((z_value**2) * proportion * (1-proportion)) / (margin**2)

    # adjust the sample size for a finite population
    corrected = unbounded / (1 + ((unbounded - 1) / float(population_size)) )

    return int(math.ceil(corrected))

## load data

In [3]:
# fetch the iris dataset through sklearn.datasets
dataset = load_iris()
dataset.keys()
# features as a dataframe, one column per feature name
raw = pd.DataFrame(dataset.data, columns = dataset.feature_names)
# target code -> target name
dclass = dict(enumerate(dataset.target_names))
# class of each record, spelled with its name instead of its code
raw['class'] = dataset.target
raw['class'] = raw['class'].map(dclass)

# Data Preparation

In [4]:
class Columns():
    """
    Group the column names of a dataframe by the dtype of each column.

    num -- names of the 'float64' columns (numerical).
    ord -- names of the 'int64' columns (ordinal).
    cat -- names of the 'object' columns (categorical).
    """
    def __init__(self, df:pd.DataFrame):
        def names_with_dtype(dtype:str):
            # labels of the columns selected for that dtype, as an array
            return df.select_dtypes(include=[dtype]).columns.values

        self.num = names_with_dtype('float64')
        self.ord = names_with_dtype('int64')
        self.cat = names_with_dtype('object')

    def __str__(self):
        # one line per group of columns
        return f'Categorical: {self.cat} \nNumerical: {self.num} \nOrdinal: {self.ord}'

In [5]:
## categorical values conversion
def conversion_categorical_values(df:pd.DataFrame, col:str)->(pd.DataFrame, dict):
    """
    Replace the values of a categorical column by their factorized codes.

    The dataframe is modified in place (and also returned): the column is
    taken out and appended again as the last column, holding the codes
    produced by `pd.factorize` converted to strings.

    df -- dataframe holding the column.
    col -- name of the categorical column to convert.
    return -- tuple (df, dcat); dcat maps each code (as str) to the original
              value it stands for.
    """
    # take the column out of the dataframe instead of parking it under a
    # temporary name, which would clash with a real column called 'original'
    original = df.pop(col)
    # factorize
    codes = pd.Series(pd.factorize(original)[0], index = original.index)
    # as str
    codes = codes.astype(str)
    # put the converted column back (it becomes the last column)
    df[col] = codes
    # create conversor: code -> original value
    dcat = dict(zip(codes, original))
    # return
    return (df, dcat)


## simplify dataset
def data_simplify(raw:pd.DataFrame)->(pd.DataFrame, dict, dict):
    """
    Build a simplified copy of a dataframe: aliased columns, coded categories.

    Columns are renamed to short aliases made of a letter and a position:
    'n<i>' for numerical, 'c<i>' for categorical and 'o<i>' for ordinal
    columns, as classified by `Columns`. Every categorical column is then
    converted with `conversion_categorical_values`.

    raw -- original dataframe.
    return -- tuple (data, dcols_alias_to_name, d_converter_cat_values):
              the simplified dataframe, the mapping alias -> original column
              name, and per categorical alias the mapping code -> original value.
    """
    # all the changes are applied to a copy of the input
    data = raw.copy()
    cols = Columns(data)

    # columns name converters, in the order numerical, categorical, ordinal
    dcols_name_to_alias = dict()
    dcols_alias_to_name = dict()
    for template, names in (('n{}', cols.num), ('c{}', cols.cat), ('o{}', cols.ord)):
        for position, name in enumerate(names):
            alias = template.format(position)
            dcols_name_to_alias[name] = alias
            dcols_alias_to_name[alias] = name

    # rename columns
    data = data.rename(columns = dcols_name_to_alias)

    # convert the values of every categorical column (looked up again, now by alias)
    d_converter_cat_values = dict()
    for col in Columns(data).cat:
        data, d_converter_cat_values[col] = conversion_categorical_values(data, col)

    return (data, dcols_alias_to_name, d_converter_cat_values)

In [6]:
# dataset simplification: simplified copy of `raw`, plus the dictionaries that
# map column aliases and category codes back to the original names and values
data, dcols_alias_to_name, d_converter_cat_values = data_simplify(raw)

# Queries combinations

In [7]:
# add more categorical variables [FOR TESTING]
#data['c1'] = data['n1'].apply(lambda x: str(int(x)))
#data['c2'] = data['n2'].apply(lambda x: str(int(x)))


In [8]:
# Build every query string over the categorical columns, tag each query with
# the number of the query set it belongs to, and count the rows it selects.

# initialize final lists: one entry per query
LIST_QUERIES = list()
LIST_INDEX = list()
# running number of the current query set
n = 0
cols = Columns(data)

## variables combination

# every ordered selection of 1..k categorical columns; the order matters
# because queries are grouped by the conditions on all but the LAST column
per_cols = list()
for i in range(1,len(cols.cat)+1,1):
    per_cols += list(itertools.permutations(list(cols.cat),r=i))

## single queries

# one "<column> == '<value>'" query per distinct value of each categorical column
dsingle_queries = dict()
for c in cols.cat:
    dsingle_queries[c] = [f"{c} == '{cat}'" for cat in sorted(data[c].unique())]

# all the single queries of one column form one query set
for c in dsingle_queries:
    # add single queries
    LIST_QUERIES += dsingle_queries[c]
    # add their indexes
    LIST_INDEX += [n] * len(dsingle_queries[c])
    # next query set
    n += 1

## non single queries

# get combination queries
for iper_cols in [pc for pc in per_cols if len(pc)>1]:
    # exactly one single query per column, taken in the column order of the
    # selection (no substring filtering needed, so aliases such as 'c1' and
    # 'c10' cannot be confused)
    comb = itertools.product(*[dsingle_queries[c] for c in iper_cols])
    # keep, next to each final query, the conditions on all but the last column
    keyed = sorted((' & '.join(ic), ic[:-1]) for ic in comb)
    final_comb = [icomb for icomb, _ in keyed]
    LIST_QUERIES += final_comb

    # estimate their indexes: consecutive queries that only differ in the
    # condition on the last column belong to the same query set
    previous = None
    for _, fixed in keyed:
        if previous is not None and fixed != previous:
            n += 1
        LIST_INDEX.append(n)
        previous = fixed
    # close the last set, so that the next selection of columns does not
    # reuse its number
    if keyed:
        n += 1

# store queries in a df
dfqueries = pd.DataFrame({'query':LIST_QUERIES, 'number':LIST_INDEX})


## add number of records per query

# initialize
LIST_SIZES = list()
# loop of queries (plain loop kept on purpose: the analysis cell further down
# reads `squery` after this loop has finished)
for squery in dfqueries['query'].values:
    LIST_SIZES.append(len(data.query(squery)))
# add new column
dfqueries['sample_size'] = LIST_SIZES

In [9]:
dfqueries

Unnamed: 0,query,number,sample_size
0,c0 == '0',0,50
1,c0 == '1',0,50
2,c0 == '2',0,50


# Filter queries by min size of sample

In [10]:
# estimate the minimum size of a sample for the whole dataset
confidence_level, confidence_interval = 95.0, 5.0
population_sz = len(data)
n_min_sample_size = sample_size(population_sz, confidence_level, confidence_interval)
print("SAMPLE SIZE: %d from %d" %(n_min_sample_size, population_sz))
# FOR TESTING: hard-coded value that overrides the estimate computed above
n_min_sample_size = 50
# filter queries: keep only those selecting at least the minimum number of records
dfqueries = dfqueries[dfqueries['sample_size'] >= n_min_sample_size]
print(f'Number of queries after filtering = {len(dfqueries)}')

SAMPLE SIZE: 109 from 150
Number of queries after filtering = 3


In [11]:
dfqueries

Unnamed: 0,query,number,sample_size
0,c0 == '0',0,50
1,c0 == '1',0,50
2,c0 == '2',0,50


# ANALYSIS

In [23]:
# numbers of the query sets that survived the filtering
numbers_query_sets = sorted(dfqueries['number'].unique())

## get samples

# only the first query set is analysed for now (JUAN)
number = numbers_query_sets[0]

# get queries for this set
queries = dfqueries[dfqueries['number'] == number]['query'].tolist()

# get variables to remove from the analysis (in this case only numerical):
# the columns named on the left-hand side of the conditions of these queries.
# Names are matched exactly and taken from the queries of the selected set,
# not from whatever `squery` was left behind by a previously executed cell.
cols_all = data.columns.tolist()
cols_in_queries = {cond.split(' == ')[0].strip() for q in queries for cond in q.split(' & ')}
cols_remove = [c for c in cols_all if c in cols_in_queries]
cols_num = [c for c in cols.num if c not in cols_remove]

# initialize samples
dsamples = dict()
# get samples per set: rows selected by each query, numerical columns only
for squery in queries:
    dsamples[squery] = data.query(squery)[cols_num]

In [28]:
# column of the samples to summarise
variable = 'n3'
# loop of samples: print the query followed by the mean and the standard
# deviation (np.std with its default arguments) of that column in the sample
for k, dfsample in dsamples.items():
    sample = dfsample[variable].values
    print(k, np.mean(sample), np.std(sample))

c0 == '0' 0.24599999999999997 0.1043264108459598
c0 == '1' 1.3259999999999998 0.19576516544063705
c0 == '2' 2.0260000000000002 0.2718896835115301
