In [None]:
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

import pprint

# Directory that holds the input CSV files. The trailing slash matters: the
# load cell builds the file path by plain string concatenation with the data
# set name.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing this line.
DATA_DIR_NAME = '/Users/karenblakemore/Koverse/data/'

# A threshold of 0 turns off matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})

In [None]:
def analyze_subpopulation(population, subpopulation, subpopulation_onehot, conditions, results):
    """Score one subpopulation against the population and record the outcome.

    Two statistics are computed:

    * a one-sample t-test of ``subpopulation`` against the population mean;
    * a Spearman rank correlation between ``population`` and the membership
      indicator ``subpopulation_onehot``.

    Parameters
    ----------
    population : pandas.Series
        Scoring values for every row (anything exposing ``.mean()`` works).
    subpopulation : array-like
        Scoring values of the rows that belong to the subpopulation.
    subpopulation_onehot : array-like
        Membership indicator with the same length as ``population``.
    conditions : str
        Label describing how the subpopulation was selected.
    results : pandas.DataFrame
        Rows accumulated so far; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``results`` followed by one row with
        the keys ``conditions``, ``t``, ``t_prob``, ``rho`` and ``p_value``.
    """
    population_mean = population.mean()
    t, t_prob = stats.ttest_1samp(subpopulation, population_mean)

    # Spearman needs two samples of equal length, so the population is
    # correlated with the membership indicator, not with the (shorter)
    # subpopulation values.
    rho, p_value = stats.spearmanr(population, subpopulation_onehot)

    record = {
        'conditions': conditions,
        't': t,
        't_prob': t_prob,
        'rho': rho,
        'p_value': p_value,
    }
    # Keep the caller's column order; add any key the frame does not have yet.
    columns = list(results.columns) + [key for key in record if key not in results.columns]
    row = pd.DataFrame([record], columns=columns)

    # DataFrame.append was removed in pandas 2.0. Returning the row directly
    # for an empty frame also avoids concat's warning about empty entries.
    if len(results) == 0:
        return row
    return pd.concat([results, row], ignore_index=True)

In [None]:
def population_statistics(conditions, population, pop_nulls, coltype=None):
    """Print summary statistics for the whole population.

    Parameters
    ----------
    conditions : str
        Label for the analysis. It is not used in the printed output and is
        kept so existing calls keep working.
    population : pandas.Series
        Values to summarise.
    pop_nulls : int
        Number of missing values, printed as given.
    coltype : str, optional
        Pass ``'discrete'`` to also print the normalized value counts. When
        omitted, a notebook-level ``coltype`` variable is used if one exists,
        otherwise the value counts are skipped.
    """
    if coltype is None:
        # Earlier versions read a global named `coltype`; honour it when it is
        # defined instead of failing with NameError when it is not.
        coltype = globals().get('coltype')

    print('\033[1m' + '\nPopulation' + '\033[0m')
    print(population.describe().reset_index().to_string(header=False, index=False))
    print('median', population.median())
    print('null count', pop_nulls)
    if coltype == 'discrete':
        print('Normalized Value Counts')
        shares = population.value_counts(normalize=True).sort_index()
        print(shares.reset_index().to_string(header=False, index=False))

In [None]:
def _print_series_summary(title, series, null_count, coltype):
    """Print describe(), median, null count and, for discrete data, value shares."""
    print('\033[1m' + '\n' + title + '\033[0m')
    print(series.describe().reset_index().to_string(header=False, index=False))
    print('median', series.median())
    print('null count', null_count)
    if coltype == 'discrete':
        print('Normalized Value Counts')
        shares = series.value_counts(normalize=True).sort_index()
        print(shares.reset_index().to_string(header=False, index=False))


def statistics(conditions, population, subpopulation, pop_nulls, subpop_nulls, coltype, sig_map):
    """Print significance results and side-by-side summaries of two samples.

    Parameters
    ----------
    conditions : str
        Label of the comparison, printed in bold as the heading.
    population : pandas.Series
        Values of the full population.
    subpopulation : pandas.Series
        Values of the subpopulation.
    pop_nulls, subpop_nulls : int
        Missing-value counts for each sample, printed as given.
    coltype : str
        ``'discrete'`` additionally prints normalized value counts for both
        samples.
    sig_map : mapping
        Must provide the keys ``'t'``, ``'t_prob'``, ``'w'`` and ``'w_prob'``.

    Raises
    ------
    KeyError
        If ``sig_map`` lacks one of the four keys above.
    """
    # The heading uses `conditions`; the previous body referenced an undefined
    # name here.
    print('\033[1m' + str(conditions) + '\033[0m')
    print('t-statistic', sig_map['t'], 'two-tailed p-value', sig_map['t_prob'])
    print('wilcoxon', sig_map['w'], 'two-sided p-value', sig_map['w_prob'])

    _print_series_summary('Population', population, pop_nulls, coltype)
    # The subpopulation section reports the subpopulation's own median.
    _print_series_summary('Subpopulation', subpopulation, subpop_nulls, coltype)

In [None]:
def visualizations(col, population, subpopulation):
    """Overlay the density curves of the population and a subpopulation.

    Plotting is best-effort: any ordinary exception raised while drawing is
    reported on stdout instead of propagating, so one bad column does not stop
    the notebook.

    Parameters
    ----------
    col : str
        Name of the plotted column; used as the figure title.
    population : array-like
        Values of the full population.
    subpopulation : array-like
        Values of the subpopulation.
    """
    try:
        # kdeplot is seaborn's replacement for the deprecated
        # distplot(..., hist=False).
        sns.kdeplot(population, label='population')
        sns.kdeplot(subpopulation, label='subpopulation')
        plt.title(str(col))
        plt.legend()
        plt.show()
    except Exception as exc:
        # Catch Exception rather than everything so that Ctrl-C
        # (KeyboardInterrupt) still interrupts the cell.
        print("continuous_visualization failed", type(exc))
    print('\n')

In [None]:
DATA_SET_NAME = 'titanic'

pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')

# Column whose distribution is compared between the population and each
# subpopulation.
scoring_col = 'Pclass'
# population_statistics consults this notebook-level name to decide whether to
# print normalized value counts; Pclass takes only a few distinct values.
coltype = 'discrete'

results = pd.DataFrame(columns=['conditions', 't', 't_prob', 'rho', 'p_value'])

# Count missing scores before imputing them, otherwise the count is always 0.
pop_nulls = pdf[scoring_col].isnull().sum()
pdf[scoring_col] = pdf[scoring_col].fillna(pdf[scoring_col].mean())
population = pdf[scoring_col]
population_statistics('all', population, pop_nulls)

# One (label, boolean row mask) pair per subpopulation. Rows with a missing
# value in the tested column compare as False and so fall outside the
# subpopulation.
subpopulation_masks = [
    ('Survived = 1', pdf['Survived'] == 1),
    ('Survived = 0', pdf['Survived'] == 0),
    ('Embarked = C', pdf['Embarked'] == 'C'),
    ('Embarked = Q', pdf['Embarked'] == 'Q'),
    ('Embarked = S', pdf['Embarked'] == 'S'),
    ('Sex = male', pdf['Sex'] == 'male'),
    ('Sex = female', pdf['Sex'] == 'female'),
    ('Age <= 30', pdf['Age'] <= 30),
    ('Age > 30', pdf['Age'] > 30),
    ('Parch == 0', pdf['Parch'] == 0),
    ('1 <= Parch <= 4', (pdf['Parch'] >= 1) & (pdf['Parch'] <= 4)),
    ('Parch > 4', pdf['Parch'] > 4),
]

for conditions, mask in subpopulation_masks:
    subpopulation = pdf.loc[mask, scoring_col]
    # 0/1 membership indicator with one entry per row of the population.
    subpopulation_onehot = mask.astype(int)
    results = analyze_subpopulation(population, subpopulation, subpopulation_onehot, conditions, results)

for result in results:
    