# Spearman Correlations of Dependent Variable & Independent Values

In [None]:
import pandas as pd
import numpy as np

import csv
import warnings

import scipy.stats as stats

# Directory prefix that the experiment cells concatenate with a data set
# name and '.csv' to build the path passed to pd.read_csv.
DATA_DIR_NAME = '../../data/'

# Upper bound on the number of distinct values a categorical column may have
# before data_preparation drops it (the bound is further capped by row count).
MAX_CARDINALITY = 4000

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

## Prepare Data Set
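1. Drop categorical columns whose cardinality is too high to one-hot encode
2. Impute missing values - median for discrete variables, mean for continuous variables
3. Separate the dependent variable from the independent variables
4. Bin continuous variables into equal-width intervals
5. One-hot encode the independent variables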

In [None]:
def data_preparation(dependent_col_name, pdf, continuous_columns, discrete_columns, categorical_columns):
    """Impute, bin and one-hot encode a data set for correlation analysis.

    Steps, in order:
      1. Drop categorical columns whose number of distinct values is at least
         ``min(MAX_CARDINALITY, number_of_rows)``.
      2. Convert discrete columns to numeric and fill gaps with the median.
      3. Convert continuous columns to numeric and fill gaps with the mean.
      4. Split off the dependent column.
      5. Bin every independent continuous column into 9 equal-width intervals.
      6. One-hot encode every remaining column (``<column>=<value>`` names).

    Args:
        dependent_col_name: Name of the dependent column in ``pdf``.
        pdf: Input ``pandas.DataFrame``.
        continuous_columns: Names of continuous columns (may include the
            dependent column). Not modified.
        discrete_columns: Names of discrete columns (may include the
            dependent column). Not modified.
        categorical_columns: Names of categorical columns, used only for the
            cardinality filter.

    Returns:
        A ``(dependent_col, encoded_pdf)`` tuple: the imputed dependent
        ``Series`` and the one-hot encoded independent ``DataFrame``.
    """
    # Drop columns with high cardinality. ``drop`` returns a new DataFrame,
    # so the column assignments below are made on that new object.
    max_cardinality = min(MAX_CARDINALITY, pdf.shape[0])
    high_cardinality = [col for col in categorical_columns
                        if pdf[col].nunique() >= max_cardinality]
    pdf = pdf.drop(high_cardinality, axis=1)

    # Impute missing discrete values (the dependent column is still included)
    for col in discrete_columns:
        pdf[col] = pd.to_numeric(pdf[col], downcast='integer')
        pdf[col] = pdf[col].fillna(pdf[col].median())

    # Impute missing continuous values (the dependent column is still included)
    for col in continuous_columns:
        pdf[col] = pd.to_numeric(pdf[col])
        pdf[col] = pdf[col].fillna(pdf[col].mean())

    # Separate dependent variable from independent ones. A filtered copy is
    # built instead of calling ``list.remove`` so the caller's lists stay
    # intact and can be reused for another run.
    independent_continuous = [col for col in continuous_columns
                              if col != dependent_col_name]

    dependent_col = pdf[dependent_col_name]
    pdf = pdf.drop([dependent_col_name], axis=1)

    # Bin continuous variables into 9 equal-width intervals
    for col in independent_continuous:
        lowest, highest = pdf[col].min(), pdf[col].max()
        # A constant or all-NaN column has no usable range: the edges would
        # be duplicated/NaN and pd.cut would raise, so leave it unbinned.
        if not lowest < highest:
            continue
        edges = np.linspace(lowest, highest, num=10)
        # include_lowest=True keeps rows equal to the column minimum in the
        # first interval; without it they fall outside every bin (NaN).
        pdf[col] = pd.cut(pdf[col], edges, include_lowest=True)

    # One-hot encode every column, then assemble the result in one concat
    # rather than dropping and concatenating once per column.
    encoded = [pd.get_dummies(pdf[col], prefix=col, prefix_sep='=')
               for col in pdf.columns]
    if encoded:
        pdf = pd.concat(encoded, axis=1)

    return dependent_col, pdf

## Calculate Correlations and Rank Results

In [None]:
# Column layout of the frame returned by calculate_correlations.
CORRELATION_COLUMNS = ['value', 'correlation', 'p_value', 'count', 'score']

def calculate_correlations(dependent_col, pdf):
    """Rank every column of ``pdf`` by Spearman correlation with ``dependent_col``.

    For each column the Spearman rho and its p-value against
    ``dependent_col`` are computed, along with the column sum (``count``).
    The ranking ``score`` is ``abs(rho) * (1 - p_value)``.

    Args:
        dependent_col: Sequence/``Series`` of the dependent variable.
        pdf: ``DataFrame`` whose columns are the independent values.

    Returns:
        A ``DataFrame`` with the columns in ``CORRELATION_COLUMNS``, one row
        per column of ``pdf``, sorted by ``score`` in descending order.
    """
    rows = []

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for idx, (col_name, col) in enumerate(pdf.items()):
        rho, p_value = stats.spearmanr(dependent_col, col)
        row = {'value': col_name, 'correlation': rho, 'p_value': p_value, 'count': col.sum()}
        rows.append(row)

        # print every 100 calculations
        if idx % 100 == 0:
            print(idx, row)

    # Build the frame once from the collected records; 'score' is filled below.
    correlations_pdf = pd.DataFrame(rows, columns=CORRELATION_COLUMNS)
    # astype(float) keeps the arithmetic well-defined even when pdf has no columns.
    correlations_pdf['score'] = (
        correlations_pdf['correlation'].astype(float).abs()
        * (1 - correlations_pdf['p_value'].astype(float))
    )
    return correlations_pdf.sort_values(by=['score'], ascending=False)

## Experiment - Hospital Readmissions with Overall Rating as Dependent Variable

In [None]:
# Experiment: load the hospital readmissions CSV, recode its textual ratings,
# declare the role of each column, then run data_preparation and
# calculate_correlations with 'Hospital overall rating' as the dependent variable.
DATA_SET_NAME = 'hospital_readmissions'
pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')
# Quick look at the raw data: shape, first rows and per-column distinct counts.
# NOTE(review): `display` is not imported in this file - presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
print(pdf.shape)
display(pdf.head())
print(pdf.nunique())

# Recode the national-comparison labels as the digit strings '1'-'3' and
# replace the "no data" placeholders with empty strings. The discrete columns
# are converted with pd.to_numeric inside data_preparation.
# NOTE(review): this relies on '' being treated as missing by that
# conversion/imputation - confirm.
pdf = pdf.replace({'Below the national average': '1',
                   'Same as the national average': '2', 
                   'Above the national average': '3',
                   'Not Available': '',
                   'Too Few to Report': '',
                   'Results are not available for this reporting period': ''
                  }
                 )

# Columns imputed with the mean and then binned by data_preparation.
continuous_columns = ['Excess Readmission Ratio',
                      'Expected Readmission Rate',
                      'Predicted Readmission Rate',
                      'Number of Discharges',                   
                      'Number of Readmissions'
                     ]

# Columns imputed with the median by data_preparation (includes the dependent column).
discrete_columns = ['Effectiveness of care national comparison',
                    'Efficient use of medical imaging national comparison',
                    'Hospital overall rating',
                    'Mortality national comparison',
                    'Patient experience national comparison',
                    'Readmission national comparison',
                    'Safety of care national comparison',
                    'Timeliness of care national comparison'
                   ]       

# Columns checked against the cardinality limit in data_preparation;
# those at or above the limit are dropped before encoding.
categorical_columns = ['Address',
                       'City', 
                       'County Name', 
                       'Effectiveness of care national comparison footnote',
                       'Efficient use of medical imaging national comparison footnote',
                       'Emergency Services',
                       'End Date',
                       'Footnote',
                       'Hospital Name',
                       'Hospital Ownership',
                       'Hospital Type',
                       'Hospital overall rating footnote',
                       'Measure Name',
                       'Meets criteria for meaningful use of EHRs',
                       'Mortality national comparison footnote',
                       'Patient experience national comparison footnote',
                       'Phone Number',
                       'Provider ID',
                       'Provider Number',
                       'Readmission national comparison footnote',
                       'Safety of care national comparison footnote',
                       'Start Date',
                       'State',
                       'Timeliness of care national comparison footnote',
                       'ZIP Code'
                      ]
 
# The variable every one-hot encoded value is correlated against.
dependent_col_name = 'Hospital overall rating'  
    
# Prepare the data, then report the size of the encoded feature matrix.
dependent_col, prepped_pdf = data_preparation(dependent_col_name, pdf, continuous_columns, discrete_columns, categorical_columns)
print(prepped_pdf.shape)

# Rank all encoded values and show the 20 highest-scoring ones.
correlations_pdf = calculate_correlations(dependent_col, prepped_pdf)
display(correlations_pdf.head(20))

## Experiment - Titanic Data Set with Survived as Dependent Variable

In [None]:
# Experiment: load the titanic CSV, declare the role of each column, then run
# data_preparation and calculate_correlations with 'Survived' as the
# dependent variable.
DATA_SET_NAME = 'titanic'

pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')
# Quick look at the raw data: shape, first rows and per-column distinct counts.
# NOTE(review): `display` is not imported in this file - presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
print(pdf.shape)
display(pdf.head())
print(pdf.nunique())

# Columns imputed with the mean and then binned by data_preparation.
continuous_columns = ['Age',
                      'Fare',
                      'Parch',
                      'SibSp'
                     ]

# Columns imputed with the median by data_preparation (includes the dependent column).
discrete_columns = ['Pclass',
                    'Survived'
                   ]
    
# Columns checked against the cardinality limit in data_preparation;
# those at or above the limit are dropped before encoding.
categorical_columns = ['Cabin',
                       'Embarked',
                       'Name',
                       'PassengerId',
                       'Sex',
                       'Ticket'
                      ]

# NOTE(review): drop_columns is defined here but not referenced anywhere else
# in this cell or passed to data_preparation - confirm whether these columns
# were meant to be removed from pdf before preparation.
drop_columns = ['Cabin',
                'Name',
                'PassengerId',
                'Ticket'
               ]

# The variable every one-hot encoded value is correlated against.
dependent_col_name = 'Survived'  
    
# Prepare the data, then report the size of the encoded feature matrix.
dependent_col, prepped_pdf = data_preparation(dependent_col_name, pdf, continuous_columns, discrete_columns, categorical_columns)
print(prepped_pdf.shape)

# Rank all encoded values and show the 20 highest-scoring ones.
correlations_pdf = calculate_correlations(dependent_col, prepped_pdf)
display(correlations_pdf.head(20))