Experiment to determine if mutual information regression and classification provide value over correlations for determining feature importance relative to a dependent variable.

In [None]:
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn.feature_selection import (
    f_classif,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
)

# Directory prefix that the experiment cells concatenate with a data set name
# and '.csv'. The path is machine-specific; change it before running elsewhere.
DATA_DIR_NAME = '/Users/karenblakemore/Koverse/data/'

# A threshold of 0 turns off matplotlib's "too many open figures" warning;
# bivariate_plots opens one figure per group of three plots.
plt.rcParams.update({'figure.max_open_warning': 0})

## Data Preparation

In [None]:
def data_preparation(pdf, drop_columns, continuous_columns, discrete_columns):
    """Return a numeric, NaN-free copy of ``pdf`` ready for scoring.

    Args:
        pdf: Raw input DataFrame. It is not modified; ``drop`` returns a new
            frame that all later assignments operate on.
        drop_columns: Column labels to remove before any other processing.
        continuous_columns: Columns converted with ``pd.to_numeric`` and whose
            missing values are replaced by the column mean.
        discrete_columns: Columns converted with ``pd.to_numeric`` (downcast
            to the smallest integer type where possible) and whose missing
            values are replaced by the column median. This list is not
            modified.

    Returns:
        A ``(prepared_pdf, kept_discrete_columns)`` tuple, where
        ``kept_discrete_columns`` is a new list holding the discrete columns
        that were not dropped for having all-unique values.
    """
    pdf = pdf.drop(drop_columns, axis=1)

    for col in discrete_columns:
        pdf[col] = pd.to_numeric(pdf[col], downcast='integer')
        pdf[col] = pdf[col].fillna(pdf[col].median())

    # Drop discrete columns with all unique values, as this will cause an
    # exception in mutual_info_regression. The survivors are collected in a
    # new list: removing items from the list being iterated skips the element
    # after each removal and also mutates the caller's list.
    row_count = pdf.shape[0]
    all_unique_columns = [
        col for col in discrete_columns if pdf[col].nunique() == row_count
    ]
    kept_discrete_columns = [
        col for col in discrete_columns if col not in all_unique_columns
    ]
    if all_unique_columns:
        pdf = pdf.drop(all_unique_columns, axis=1)

    for col in continuous_columns:
        pdf[col] = pd.to_numeric(pdf[col])
        pdf[col] = pdf[col].fillna(pdf[col].mean())

    return pdf, kept_discrete_columns

## Calculate Scores

In [None]:
SCORE_COLUMNS = ['y', 'x', 'f_test', 'pearson', 'spearman', 'mi']


def _scale_by_max(scores):
    """Return ``scores`` divided by their maximum, unless that maximum is 0."""
    peak = np.max(scores)
    # An all-zero score vector would otherwise become all-NaN via 0/0.
    return scores / peak if peak != 0 else scores


def calculate_scores(pdf, continuous_columns, discrete_columns):
    """Score every column of ``pdf`` against every other column.

    Each column in turn is treated as the dependent variable ``y`` and all
    remaining columns as features. For every (y, x) pair the F-test statistic,
    Pearson correlation, Spearman correlation and mutual information are
    computed. F-test and mutual information are scaled by their per-target
    maximum.

    Args:
        pdf: Prepared, fully numeric DataFrame.
        continuous_columns: Columns treated as regression targets
            (``f_regression`` / ``mutual_info_regression``). Any other column
            is treated as a classification target
            (``f_classif`` / ``mutual_info_classif``).
        discrete_columns: Columns flagged as discrete features for the
            mutual information estimators.

    Returns:
        A DataFrame with the ``SCORE_COLUMNS`` columns, one row per (y, x)
        pair, built by concatenating the per-target results (each sorted by
        descending ``mi``). Empty, with ``SCORE_COLUMNS`` as columns, when
        ``pdf`` has no columns.
    """
    per_target_results = []

    for col in pdf:
        print('Calculating Scores for {}'.format(col))
        y = pdf[col]
        X = pdf.drop([col], axis=1)

        discrete_column_indices = [X.columns.get_loc(x) for x in discrete_columns if x != col]
        # The original note here says an empty index list raises, so fall back
        # to the estimator default ('auto') when there are no discrete features.
        discrete_features = discrete_column_indices if discrete_column_indices else 'auto'

        # discrete_features is passed by keyword because it is keyword-only
        # in current scikit-learn releases.
        if col in continuous_columns:
            mi = mutual_info_regression(X.values, y, discrete_features=discrete_features)
            f_test, _ = f_regression(X.values, y)
        else:
            # Classification target: use the classifier estimator whether or
            # not discrete feature indices are present.
            mi = mutual_info_classif(X.values, y, discrete_features=discrete_features)
            f_test, _ = f_classif(X.values, y)

        f_test = _scale_by_max(f_test)
        mi = _scale_by_max(mi)

        pearson = [stats.pearsonr(y, X[feature].tolist())[0] for feature in X]
        spearman = [stats.spearmanr(y, X[feature].tolist())[0] for feature in X]

        results_pdf = pd.DataFrame(index=range(X.shape[1]), columns=SCORE_COLUMNS)
        results_pdf['y'] = col
        results_pdf['x'] = X.columns.values
        results_pdf['f_test'] = f_test
        results_pdf['pearson'] = pearson
        results_pdf['spearman'] = spearman
        results_pdf['mi'] = mi

        results_pdf = results_pdf.sort_values(by=['mi'], ascending=False)
        # display is the IPython/Jupyter rich-output helper available in notebooks.
        display(results_pdf.head())

        per_target_results.append(results_pdf)

    if not per_target_results:
        return pd.DataFrame(columns=SCORE_COLUMNS)

    # A single concat replaces the per-iteration DataFrame.append, which was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    return pd.concat(per_target_results, sort=True)

## Plot Bivariate Distributions

In [None]:
def bivariate_plots(pdf, scores_pdf, discrete_columns=None):
    """Plot the bivariate distribution of each scored (x, y) pair, three per row.

    Args:
        pdf: DataFrame holding the data for every column named in
            ``scores_pdf``.
        scores_pdf: DataFrame with ``x``, ``y``, ``f_test``, ``pearson``,
            ``spearman`` and ``mi`` columns; rows are plotted in the order
            given.
        discrete_columns: Names of discrete columns, used to pick the plot
            type. Optional; when omitted every column is treated as
            continuous and drawn as a scatter plot.
    """
    if discrete_columns is None:
        discrete_columns = []

    # Number of plots is min of 300 and number of correlations, rounded down to nearest number divisible by 3
    number_of_plots = (min(300, scores_pdf.shape[0]) // 3) * 3

    for i in range(0, number_of_plots, 3):
        _, axis = plt.subplots(ncols=3, figsize=(18, 6))
        plt.subplots_adjust(wspace=.4)  # adjust horizontal space between side-by-side plots

        for j in range(3):
            score_row = scores_pdf.iloc[i + j]
            x_var = score_row['x']
            y_var = score_row['y']
            pair_pdf = pdf[[x_var, y_var]]
            axis[j].set_title(
                'f-test={:.3f} pearson={:.3f} spearman={:.3f} mi={:.3f}'.format(
                    score_row['f_test'],
                    score_row['pearson'],
                    score_row['spearman'],
                    score_row['mi']),
                y=1.04)

            # Both discrete: grouped counts. One discrete: strip plot oriented
            # along the continuous axis. Neither: scatter plot.
            if x_var in discrete_columns and y_var in discrete_columns:
                sns.countplot(x=x_var, hue=y_var, data=pair_pdf, ax=axis[j])
            elif y_var in discrete_columns:
                sns.stripplot(x=x_var, y=y_var, data=pair_pdf, orient='h', ax=axis[j])
            elif x_var in discrete_columns:
                sns.stripplot(x=x_var, y=y_var, data=pair_pdf, orient='v', ax=axis[j])
            else:
                sns.scatterplot(x=x_var, y=y_var, data=pair_pdf, ax=axis[j])
    plt.show()

## Basketball Statistics Experiment
[NBA Players stats since 1950](https://www.kaggle.com/drgilermo/nba-players-stats/home)

### Load, Prep & Calculate Scores

In [None]:
# Load the NBA season statistics and preview the raw table.
DATA_SET_NAME = 'season_stats'
pdf = pd.read_csv(''.join([DATA_DIR_NAME, DATA_SET_NAME, '.csv']), encoding='latin-1')
print(pdf.shape)
display(pdf.head())

# Columns removed before scoring.
drop_columns = ['blank2', 'blanl', 'Player', 'Pos', 'Tm']

# Every remaining statistic is treated as continuous for this data set.
continuous_columns = [
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
    'FG%', '3P%', '2P%', 'eFG%', 'FT%', 'Year', 'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB',
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
discrete_columns = []

# Clean the data, then score each column against all the others.
prepped_pdf, updated_discrete_columns = data_preparation(
    pdf,
    drop_columns,
    continuous_columns,
    discrete_columns,
)
display(prepped_pdf.head())

scores_pdf = calculate_scores(
    prepped_pdf,
    continuous_columns,
    updated_discrete_columns,
)
display(scores_pdf.head())

### Plot Results

In [None]:
# Rank pairs by their strongest score. Pearson and Spearman are signed, so
# compare magnitudes; otherwise a strong negative correlation would rank below
# a weak positive one.
scores_pdf['max_score'] = scores_pdf[['f_test', 'pearson', 'spearman', 'mi']].abs().max(axis=1)
bivariate_plots(prepped_pdf, scores_pdf.sort_values(by=['max_score'], ascending=False).head(100), updated_discrete_columns)

## Movie Experiment
[CSM (Conventional and Social Media Movies) Dataset 2014 and 2015](https://archive.ics.uci.edu/ml/datasets/CSM+%28Conventional+and+Social+Media+Movies%29+Dataset+2014+and+2015)

### Load, Prep & Calculate Scores

In [None]:
# Load the CSM movie data and preview it, including per-column cardinality.
DATA_SET_NAME = 'Movies'
pdf = pd.read_csv(''.join([DATA_DIR_NAME, DATA_SET_NAME, '.csv']), encoding='latin-1')
print(pdf.shape)
display(pdf.head())
print(pdf.nunique())

# Column removed before scoring.
drop_columns = ['Movie']

continuous_columns = [
    'Ratings', 'Gross', 'Budget', 'Screens', 'Sentiment', 'Views',
    'Likes', 'Dislikes', 'Comments', 'Aggregate Followers', 'Sequel',
]
discrete_columns = ['Year', 'Genre']

# Clean the data and report which discrete columns survived preparation.
prepped_pdf, updated_discrete_columns = data_preparation(
    pdf,
    drop_columns,
    continuous_columns,
    discrete_columns,
)
display(prepped_pdf.head())
print(updated_discrete_columns)

# Score each column against all the others.
scores_pdf = calculate_scores(
    prepped_pdf,
    continuous_columns,
    updated_discrete_columns,
)
display(scores_pdf.head())

### Plot Results

In [None]:
# Rank pairs by their strongest score. Pearson and Spearman are signed, so
# compare magnitudes; otherwise a strong negative correlation would rank below
# a weak positive one.
scores_pdf['max_score'] = scores_pdf[['f_test', 'pearson', 'spearman', 'mi']].abs().max(axis=1)
bivariate_plots(prepped_pdf, scores_pdf.sort_values(by=['max_score'], ascending=False), updated_discrete_columns)

## Hospital Readmissions

In [None]:
# Load the hospital readmissions data and preview it, including per-column cardinality.
DATA_SET_NAME = 'hospital_readmissions'
pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')
print(pdf.shape)
display(pdf.head())
print(pdf.nunique())

# Encode the national-comparison labels as ordinal strings and blank out the
# "no data" markers; data_preparation converts the columns to numbers and
# fills the gaps afterwards.
pdf = pdf.replace({'Below the national average': '1',
                   'Same as the national average': '2',
                   'Above the national average': '3',
                   'Not Available': '',
                   'Too Few to Report': '',
                   'Results are not available for this reporting period': ''
                  }
                 )

# Columns removed before scoring. Each label is listed once; the original
# list repeated 'Hospital Name'.
drop_columns = ['Address',
                'City',
                'County Name',
                'Effectiveness of care national comparison footnote',
                'Efficient use of medical imaging national comparison footnote',
                'Emergency Services',
                'End Date',
                'Footnote',
                'Hospital Name',
                'Hospital Ownership',
                'Hospital Type',
                'Hospital overall rating footnote',
                'Measure Name',
                'Meets criteria for meaningful use of EHRs',
                'Mortality national comparison footnote',
                'Patient experience national comparison footnote',
                'Phone Number',
                'Provider ID',
                'Provider Number',
                'Readmission national comparison footnote',
                'Safety of care national comparison footnote',
                'Start Date',
                'State',
                'Timeliness of care national comparison footnote',
                'ZIP Code'
               ]
continuous_columns = ['Excess Readmission Ratio',
                      'Expected Readmission Rate',
                      'Predicted Readmission Rate',
                      'Number of Discharges',
                      'Number of Readmissions',
                     ]
discrete_columns = ['Effectiveness of care national comparison',
                    'Efficient use of medical imaging national comparison',
                    'Hospital overall rating',
                    'Mortality national comparison',
                    'Patient experience national comparison',
                    'Readmission national comparison',
                    'Safety of care national comparison',
                    'Timeliness of care national comparison'
                   ]

# Clean the data and report which discrete columns survived preparation.
prepped_pdf, updated_discrete_columns = data_preparation(pdf, drop_columns, continuous_columns, discrete_columns)
display(prepped_pdf.head())
print(updated_discrete_columns)

# Score each column against all the others.
scores_pdf = calculate_scores(prepped_pdf, continuous_columns, updated_discrete_columns)
display(scores_pdf.head())

In [None]:
# Rank each pair by the largest absolute value among its four scores, then
# plot the 100 highest-ranked pairs.
scores_pdf['max_score'] = (
    scores_pdf[['f_test', 'pearson', 'spearman', 'mi']]
    .abs()
    .max(axis='columns')
)
bivariate_plots(
    prepped_pdf,
    scores_pdf.sort_values(by='max_score', ascending=False).head(100),
    updated_discrete_columns,
)

## Score Comparison

In [None]:
# Synthetic benchmark: y depends linearly on x0, non-linearly (sine) on x1,
# and not at all on x2, plus Gaussian noise.
np.random.seed(0)

X = np.random.rand(1000, 3)
y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000)

pdf = pd.DataFrame({'y': y, 'x0': X[:,0].tolist(), 'x1': X[:,1].tolist(), 'x2': X[:,2].tolist()})

print(pdf.shape)
display(pdf.head())
print(pdf.nunique())

drop_columns = []
continuous_columns = ['y', 'x0', 'x1', 'x2']
discrete_columns = []

prepped_pdf, updated_discrete_columns = data_preparation(pdf, drop_columns, continuous_columns, discrete_columns)
display(prepped_pdf.head())
print(updated_discrete_columns)

scores_pdf = calculate_scores(prepped_pdf, continuous_columns, updated_discrete_columns)
display(scores_pdf.head())

# Plot only the rows where 'y' is the dependent variable. The discrete-column
# list is passed explicitly, and the prepared frame is used as in the other
# experiments.
bivariate_plots(prepped_pdf, scores_pdf[scores_pdf['y'] == 'y'], updated_discrete_columns)