# Probability estimation for numerical / ordinal / categorical variables

In [31]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn
import warnings
# Suppress every warning for the rest of the session so cell outputs stay
# clean. NOTE(review): this also hides deprecation notices from the libraries
# used below.
warnings.filterwarnings('ignore')

## load dataset

In [32]:
# Load the 'titanic' example dataset through seaborn's dataset loader.
df = seaborn.load_dataset('titanic')
# Print the frame summary: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


# PROBABILITY: for categorical / ordinal variables

In [76]:
## Estimate probability of a categorical/ordinal variable
def estimate_probability_non_numerical(df: pd.DataFrame, colname: str, max_categories: int = 50) -> pd.DataFrame:
    """
    Estimate the empirical probability of each category of a categorical/ordinal variable.

    Missing values are dropped first, so each probability is the relative
    frequency of a category among the non-null rows.

    df -- df that includes the variable to be used.
    colname -- name of the column to be used.
    max_categories -- exclusive upper bound on the number of distinct non-null
        values accepted; the column must have fewer than this many
        (default 50, the previously hard-coded limit).
    return -- df with one row per category, ordered by category, with columns
        [colname, 'probability'].
    raises -- AssertionError if colname is not a column of df, or if the column
        has max_categories or more distinct non-null values.
    """
    # Raise explicitly instead of using `assert`: assert statements are removed
    # under `python -O`, which would silently disable this validation. The
    # AssertionError type is kept so existing callers see the same exception.
    if colname not in df.columns:
        raise AssertionError(f'"{colname}" is required.')
    # drop missing values once and reuse the result for both checks and counts
    values = df[colname].dropna()
    if not values.nunique() < max_categories:
        raise AssertionError(f'"{colname}" has too much categories.')
    # relative frequency per category; sort_index orders rows by category,
    # matching what a sorted groupby produces
    probs = values.value_counts(normalize=True, sort=False).sort_index()
    # name the index and the values explicitly, because the default names
    # produced by value_counts differ between pandas versions
    return probs.rename_axis(colname).rename('probability').reset_index()

In [77]:
# Example on a categorical column: per-category probability of 'embark_town'
# (rows where the column is missing are dropped before counting).
colname = 'embark_town'
dfprob = estimate_probability_non_numerical(df, colname)
# bare expression so the notebook renders the resulting table
dfprob

Unnamed: 0,embark_town,probability
0,Cherbourg,0.188976
1,Queenstown,0.086614
2,Southampton,0.724409


In [78]:
# Example on an ordinal column: per-level probability of 'pclass'
# (integer-coded passenger class).
colname = 'pclass'
dfprob = estimate_probability_non_numerical(df, colname)
# bare expression so the notebook renders the resulting table
dfprob

Unnamed: 0,pclass,probability
0,1,0.242424
1,2,0.20651
2,3,0.551066


# PROBABILITY: for numerical variables (estimated PDF)

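> NOTE: The returned values are an approximate probability density function (PDF) estimated with a Gaussian KDE. They are densities, not probabilities: a single value can exceed 1, and only the area under the curve is a probability.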

In [88]:
## Get pdf estimated with KDE for 1D data
def estimate_probability_numerical(df: pd.DataFrame, colname: str, n_points: "int | None" = None) -> pd.DataFrame:
    """
    Estimate the probability density of a numerical variable with a Gaussian KDE.

    Missing values are dropped, a KDE is fitted to the remaining values, and the
    fitted density is evaluated on an evenly spaced grid from the observed
    minimum to the observed maximum.

    df -- df that includes the variable to be used.
    colname -- name of the column to be used.
    n_points -- number of grid points to evaluate the density on; when None
        (default) the grid has as many points as there are non-null values,
        which is the original behaviour.
    return -- df with columns [colname, 'probability'], where colname holds the
        grid positions and 'probability' holds the estimated density at each
        position (a density value, not a probability mass).
    raises -- AssertionError if colname is not a column of df; ValueError if
        n_points is given and is smaller than 1. Errors raised by numpy/scipy
        for data they cannot handle (e.g. no non-null values) are propagated.
    """
    from scipy import stats
    # Raise explicitly instead of using `assert`: assert statements are removed
    # under `python -O`, which would silently disable this validation. The
    # AssertionError type is kept so existing callers see the same exception.
    if colname not in df.columns:
        raise AssertionError(f'"{colname}" is required.')
    if n_points is not None and n_points < 1:
        raise ValueError('n_points must be a positive integer.')
    # get data without missing values
    v = df[colname].dropna().values
    # evaluation grid spanning the observed range
    grid_size = v.shape[0] if n_points is None else n_points
    x = np.linspace(v.min(), v.max(), grid_size)
    # fit the KDE on the data and evaluate it on the grid
    kernel = stats.gaussian_kde(v)
    # store in a df
    return pd.DataFrame({colname: x, 'probability': kernel(x)})

In [96]:
# Example on a numerical column: KDE-estimated density of 'fare', evaluated on
# an evenly spaced grid between the smallest and largest observed fare.
colname = 'fare'
dfprob = estimate_probability_numerical(df, colname)
# bare expression so the notebook renders the resulting table
dfprob

Unnamed: 0,fare,probability
0,0.000000,0.014098
1,0.575651,0.014586
2,1.151302,0.015065
3,1.726952,0.015533
4,2.302603,0.015989
...,...,...
886,510.026597,0.000103
887,510.602248,0.000104
888,511.177898,0.000105
889,511.753549,0.000105
