In [None]:
import os
import pprint
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Directory that holds the input CSV files. Set the KOVERSE_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset the original hard-coded location is used, so existing setups keep
# working. os.path.join(..., '') guarantees a trailing path separator, which
# the data-loading cell relies on when it concatenates directory + file name.
DATA_DIR_NAME = os.path.join(
    os.environ.get('KOVERSE_DATA_DIR', '/Users/karenblakemore/Koverse/data/'),
    '')

# This notebook shows one figure per analysed subpopulation; a threshold of 0
# switches off matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})

In [None]:
def analyze_subpopulation(population, subpopulation, subpopulation_onehot, conditions, col_type, results):
    """Compare one subpopulation with the population and record the outcome.

    Two statistics are computed:

    * a one-sample t-test of ``subpopulation`` against the population mean;
    * the Spearman rank correlation between ``population`` and the membership
      indicator ``subpopulation_onehot``.

    Parameters
    ----------
    population : pandas.Series
        Scoring-column values for every row.
    subpopulation : pandas.Series
        Scoring-column values for the rows selected by ``conditions``.
    subpopulation_onehot : pandas.Series
        Membership indicator with one entry per ``population`` entry
        (the callers in this notebook pass 1 for members and 0 otherwise).
    conditions : str
        Human-readable description of the selection, stored with the row.
    col_type : str
        Kind of column the condition is based on, stored with the row.
    results : pandas.DataFrame
        Rows accumulated so far; may be empty.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the previous rows plus one new row. The input
        ``results`` is not modified. ``score_ttest`` and ``score_rho`` are
        written as 0 placeholders for the caller to fill in.
    """
    population_mean = population.mean()
    t, t_prob = stats.ttest_1samp(subpopulation, population_mean)
    rho, p_value = stats.spearmanr(population, subpopulation_onehot)

    record = {'conditions': conditions,
              'col_type': col_type,
              't': t,
              't_prob': t_prob,
              'rho': rho,
              'p_value': p_value,
              'score_ttest': 0,
              'score_rho': 0,
              'population': population.tolist(),
              'subpopulation': subpopulation.tolist()}
    # Wrapping the dict in a list makes it ONE row; the list-valued entries
    # are stored as single object cells rather than being expanded.
    new_row = pd.DataFrame([record])

    if len(results) == 0:
        # DataFrame.append was removed in pandas 2.0. Concatenating onto an
        # empty frame is itself deprecated, so build the first row directly
        # while keeping the column order the caller declared.
        extra = [col for col in new_row.columns if col not in results.columns]
        return new_row.reindex(columns=list(results.columns) + extra)

    return pd.concat([results, new_row], ignore_index=True)

In [None]:
def population_statistics(population):
    """Print a plain-text statistical summary of the whole population.

    The report consists of a bold "Population" heading, the ``describe()``
    table, the median, and the normalized value counts ordered by value.
    Tables are printed without header or index. Nothing is returned.
    """
    bold, reset = '\033[1m', '\033[0m'

    def as_plain_table(frame):
        # Render the frame as bare rows: no column header, no index column.
        return frame.to_string(header=None, index=None)

    print(bold + '\nPopulation' + reset)

    summary = population.describe().reset_index()
    print(as_plain_table(summary))

    print('median', population.median())

    print('Normalized Value Counts')
    proportions = population.value_counts(normalize=True).sort_index().reset_index()
    print(as_plain_table(proportions))

In [None]:
def subpopulation_statistics(result):
    """Print the test statistics and a summary for one analysed subpopulation.

    ``result`` is one row of the results table (anything indexable by key).
    The keys read are 'subpopulation', 'conditions', 'col_type', 't',
    't_prob', 'rho' and 'p_value'. Normalized value counts are printed only
    when 'col_type' is 'discrete' or 'categorical'. Nothing is returned.
    """
    bold, reset = '\033[1m', '\033[0m'

    values = pd.Series(result['subpopulation'])
    label = result['conditions']
    kind = result['col_type']

    def as_plain_table(frame):
        # Render the frame as bare rows: no column header, no index column.
        return frame.to_string(header=None, index=None)

    # Heading and the two test outcomes.
    print(bold + label + reset)
    print('t-statistic', result['t'], 'two-tailed p-value', result['t_prob'])
    print('rho', result['rho'], 'p-value', result['p_value'])

    # Descriptive statistics of the subpopulation values.
    print(bold + '\nSubpopulation' + reset)
    print(as_plain_table(values.describe().reset_index()))
    print('median', values.median())

    if kind not in ('discrete', 'categorical'):
        return

    print('Normalized Value Counts')
    print(as_plain_table(values.value_counts(normalize=True).sort_index().reset_index()))

In [None]:
def visualizations(result, scoring_col):
    """Overlay the density of a subpopulation on the population density.

    Parameters
    ----------
    result : mapping or pandas.Series
        One row of the results table; the keys 'conditions', 'population'
        and 'subpopulation' are read.
    scoring_col : str
        Name of the scored column; used as the x-axis label when non-empty.

    Plotting is best effort: an ordinary exception is reported on stdout and
    the function carries on, so one degenerate subpopulation does not abort a
    loop over many results. ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately NOT swallowed, so the user can still stop a long run.
    A blank line is always printed at the end. Nothing is returned.
    """
    conditions = result['conditions']
    population = pd.Series(result['population'])
    subpopulation = pd.Series(result['subpopulation'])

    fig = None
    try:
        fig, ax = plt.subplots()
        # kdeplot is the replacement for the deprecated distplot(hist=False).
        sns.kdeplot(population, label='population', ax=ax)
        sns.kdeplot(subpopulation, label=conditions, ax=ax)
        if scoring_col:
            ax.set_xlabel(str(scoring_col))
        ax.legend()
        plt.show()
    except Exception as exc:
        print("visualization failed", type(exc), exc)
    if fig is not None:
        # Release the figure even when drawing failed part-way through.
        plt.close(fig)
    print('\n')

In [None]:
DATA_SET_NAME = 'titanic'

# DATA_DIR_NAME is expected to end with a path separator.
pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')

# Column whose distribution is compared between each subpopulation and the
# whole population.
scoring_col = 'Pclass'

results = pd.DataFrame(columns=['conditions',
                                'col_type',
                                't',
                                't_prob',
                                'rho',
                                'p_value',
                                'score_ttest',
                                'score_rho',
                                'population',
                                'subpopulation'])

# Fill missing scores with the column mean so every row takes part.
pdf[scoring_col] = pdf[scoring_col].fillna(pdf[scoring_col].mean())
population = pdf[scoring_col]
population_statistics(population)

# One entry per subpopulation: (label, column type, boolean membership mask).
# The same mask yields both the subpopulation values and the 0/1 indicator,
# so the two can never drift apart. Comparisons against missing values are
# False, so rows with a missing attribute belong to none of its groups.
subpopulation_specs = [
    ('Survived = 1', 'discrete', pdf['Survived'] == 1),
    ('Survived = 0', 'discrete', pdf['Survived'] == 0),
    ('Embarked = C', 'categorical', pdf['Embarked'] == 'C'),
    ('Embarked = Q', 'categorical', pdf['Embarked'] == 'Q'),
    ('Embarked = S', 'categorical', pdf['Embarked'] == 'S'),
    ('Sex = male', 'categorical', pdf['Sex'] == 'male'),
    ('Sex = female', 'categorical', pdf['Sex'] == 'female'),
    ('Age <= 30', 'continuous', pdf['Age'] <= 30),
    ('Age > 30', 'continuous', pdf['Age'] > 30),
    ('Parch == 0', 'continuous', pdf['Parch'] == 0),
    ('1 <= Parch <= 4', 'continuous', (pdf['Parch'] >= 1) & (pdf['Parch'] <= 4)),
    ('Parch > 4', 'continuous', pdf['Parch'] > 4),
]

for conditions, col_type, mask in subpopulation_specs:
    subpopulation = population[mask]
    subpopulation_onehot = mask.astype(int)
    results = analyze_subpopulation(population, subpopulation, subpopulation_onehot,
                                    conditions, col_type, results)

# Score = effect size weighted by confidence (1 - p); larger is more notable.
results['score_ttest'] = abs(results['t']) * (1 - results['t_prob'])
results['score_rho'] = abs(results['rho']) * (1 - results['p_value'])
results = results.sort_values(by=['score_ttest'], ascending=False)

display(results)

for index, row in results.iterrows():
    # An infinite t (either sign) means the subpopulation has zero variance;
    # there is no meaningful distribution to summarise or plot for it.
    if not np.isinf(row['t']):
        subpopulation_statistics(row)
        visualizations(row, scoring_col)
    