In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import itertools
import math
from collections import Counter

In [2]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    The contingency table of ``x`` against ``y`` is built with
    ``pd.crosstab``; the chi-squared statistic from
    ``scipy.stats.chi2_contingency`` is then turned into Cramér's V using
    the small-sample bias correction (phi² and the table dimensions are
    each shrunk by a term in 1/(n-1)).

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length, as accepted by
        ``pd.crosstab``.

    Returns
    -------
    float
        The corrected association strength. ``0.0`` is returned when the
        statistic is undefined: fewer than two observations, a variable
        with a single level, or a variable with as many levels as there
        are observations (the corrected dimension collapses to zero).
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    # The correction terms divide by (n - 1), and a one-row / one-column table
    # carries no association information, so bail out before dividing by zero.
    if n < 2 or min(r, k) < 2:
        return 0.0
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min((kcorr - 1), (rcorr - 1))
    # rcorr - 1 equals (r - 1)(n - r)/(n - 1): it reaches zero when every
    # observation has its own level (e.g. an identifier column). Previously
    # this produced NaN/inf together with a RuntimeWarning.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

def conditional_entropy(x,y):
    """Empirical conditional entropy of ``x`` given ``y``, in nats.

    Computes ``sum over (x, y) of p(x, y) * log(p(y) / p(x, y))`` where the
    probabilities are relative frequencies of the paired observations.
    Returns ``0`` when there are no observations.
    """
    marginal_y = Counter(y)
    joint = Counter(zip(x, y))
    n_obs = sum(marginal_y.values())
    result = 0
    # Accumulate term by term, in first-seen order of the (x, y) pairs.
    for (_, y_value), joint_count in joint.items():
        p_joint = joint_count / n_obs
        p_given = marginal_y[y_value] / n_obs
        result += p_joint * math.log(p_given / p_joint)
    return result

def uncertainty_coefficient(x,y):
    """Uncertainty coefficient of ``x`` given ``y``.

    This is the fraction of the entropy of ``x`` that is removed by knowing
    ``y``: ``(H(x) - H(x|y)) / H(x)``. The measure is asymmetric in its
    arguments. When ``x`` has zero entropy the function returns ``1``.
    """
    entropy_x_given_y = conditional_entropy(x, y)
    level_counts = Counter(x)
    n_obs = sum(level_counts.values())
    level_probs = [count / n_obs for count in level_counts.values()]
    entropy_x = ss.entropy(level_probs)
    # Guard the division: a zero-entropy x has nothing left to explain.
    if entropy_x == 0:
        return 1
    return (entropy_x - entropy_x_given_y) / entropy_x

def correlation_ratio(categories, measurements):
    """Between-group share of variance of ``measurements`` across ``categories``.

    Returns ``SS_between / SS_total``: the size-weighted squared deviation of
    each category mean from the overall mean, divided by the total squared
    deviation of the individual measurements. Note that no square root is
    taken, so the value is the *squared* correlation ratio (eta squared);
    callers wanting eta itself should apply ``np.sqrt`` to the result.

    Parameters
    ----------
    categories : array-like
        Category label of each observation, as accepted by ``pd.factorize``.
    measurements : array-like of numbers
        Numeric value of each observation, same length as ``categories``.
        Lists, NumPy arrays and pandas Series (with any index) are accepted.

    Returns
    -------
    float
        A value in [0, 1]; ``0.0`` when all category means coincide or when
        there are no observations with a known category.
    """
    fcat, _ = pd.factorize(categories)
    # Use a plain positional array: indexing a pandas Series with integer
    # positions is a *label* lookup and breaks on any non-default index.
    values = np.asarray(measurements, dtype=float)
    # pd.factorize codes missing categories as -1. Such rows belong to no
    # group, so drop them from both the numerator and the denominator.
    observed = fcat >= 0
    fcat = fcat[observed]
    values = values[observed]
    if fcat.size == 0:
        return 0.0
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = values[fcat == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(y_avg_array * n_array) / np.sum(n_array)
    numerator = np.sum(n_array * (y_avg_array - y_total_avg) ** 2)
    denominator = np.sum((values - y_total_avg) ** 2)
    # A zero numerator also covers the all-values-equal case, where the
    # denominator would be zero as well.
    if numerator == 0:
        return 0.0
    return numerator / denominator


In [4]:
# Directory holding the pipe-delimited EDS extracts read below.
DATA_DIR = r'S:\NCATS\Clinical_Profiles\clean_data\EDS'

# Pass the delimiter by keyword: pandas deprecated positional arguments after
# the file path in read_csv and rejects them outright from version 2.0.
df_labs = pd.read_csv(DATA_DIR + r'\jh_eds_labs.txt', sep='|')
df_diagnoses_hpo = pd.read_csv(DATA_DIR + r'\jh_eds_diagnoses_hpo.txt', sep='|')
df_encounter = pd.read_csv(DATA_DIR + r'\jh_eds_encounter.txt', sep='|')
df_meds = pd.read_csv(DATA_DIR + r'\jh_eds_meds.txt', sep='|')

In [5]:
# Combine the four EDS tables into one wide frame, in three steps:
#   1. labs with diagnoses   - inner join (the default) on PatientID only;
#   2. ... with encounters   - outer join on (PatientID, EncounterID);
#   3. ... with medications  - outer join on (PatientID, EncounterID).
#
# NOTE(review): the recorded run of this cell ended in MemoryError. Step 1
# joins on PatientID alone, so every lab row is paired with every diagnosis
# row of the same patient; if both tables hold many rows per patient
# (presumably they do - confirm) the intermediate result grows
# multiplicatively. Consider reducing each table to the needed columns and
# to one row per patient/encounter before merging.
# NOTE(review): steps 2 and 3 require a column named exactly EncounterID in
# the step-1 result. If both the labs and the diagnoses tables carry an
# EncounterID column, pandas will have suffixed them (_x/_y) - TODO confirm
# against the file schemas.
df = (df_labs.merge(df_diagnoses_hpo, on='PatientID')
             .merge(df_encounter, on=['PatientID','EncounterID'], how='outer')
             .merge(df_meds, on=['PatientID','EncounterID'], how='outer'))

MemoryError: 