Compare pair-wise correlation algorithms for continuous variables. The algorithms compared are MIC (maximal information coefficient), Spearman, and Pearson. The data set is [NBA Players stats since 1950](https://www.kaggle.com/drgilermo/nba-players-stats/home).

In [None]:
import os
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from minepy import MINE
from sklearn.feature_selection import f_regression, mutual_info_regression

# Directory holding the input CSV and the generated correlations CSV.
# The default is the original author's machine-specific location; set the
# NBA_DATA_DIR environment variable to point somewhere else.
# os.path.join(..., '') guarantees a trailing separator, which the plain
# string concatenation used to build file paths in this notebook relies on.
DATA_DIR_NAME = os.path.join(
    os.environ.get('NBA_DATA_DIR', '/Users/karenblakemore/Koverse/data/'), '')

# A value of 0 for 'figure.max_open_warning' turns off matplotlib's warning
# about too many open figures; plot_correlations creates many figures before
# calling plt.show().
plt.rcParams.update({'figure.max_open_warning': 0})

## Function to calculate correlations

In [None]:
CORRELATION_COLUMNS = ['x', 'y', 'MIC', 'Spearman', 'Pearson', 'MIC-Spearman', 'Spearman-Pearson']


def calculate_coefficients(pdf, filter_nans):
    """Compute MIC, Spearman and Pearson coefficients for every pair of columns.

    Args:
        pdf: DataFrame whose columns are the variables to compare.
        filter_nans: Accepted for backward compatibility but not consulted.
            Rows with a NaN in either column of a pair are always dropped
            before the coefficients are computed.

    Returns:
        DataFrame with one row per column pair and the columns listed in
        CORRELATION_COLUMNS. 'MIC-Spearman' and 'Spearman-Pearson' are filled
        with the placeholder value -1. A pair left with fewer than two
        complete rows gets NaN for all three coefficients.

    Note:
        display() is not defined in this file; it is assumed to be the
        notebook-provided builtin.
    """
    # Initialize MINE parameters
    mine = MINE(alpha=.6, c=15, est="mic_approx")

    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []

    # Calculate MIC, Spearman & Pearson coefficients
    print('Calculating MIC, Spearman & Pearson coefficients')
    for idx, (x_name, y_name) in enumerate(combinations(list(pdf.columns.values), 2)):
        # Drop rows with NaNs in either column of the pair
        pair_pdf = pdf[[x_name, y_name]].dropna()
        x_values = pair_pdf[x_name]
        y_values = pair_pdf[y_name]

        if len(pair_pdf) < 2:
            # pearsonr needs at least two observations; record the pair as
            # undefined instead of aborting the whole run.
            mic = spearman = pearson = float('nan')
        else:
            mine.compute_score(x_values, y_values)
            mic = mine.mic()
            spearman, _ = stats.spearmanr(x_values, y_values)
            pearson, _ = stats.pearsonr(x_values, y_values)

        rows.append({'x': x_name,
                     'y': y_name,
                     'MIC': mic,
                     'Spearman': spearman,
                     'Pearson': pearson,
                     'MIC-Spearman': -1,
                     'Spearman-Pearson': -1})

        # Print every 10th calculation as a progress indicator
        if idx % 10 == 0:
            print(idx, mic, spearman, pearson, x_name, y_name)

    correlations_pdf = pd.DataFrame(rows, columns=CORRELATION_COLUMNS)

    print(correlations_pdf.shape)
    # A bare describe() inside a function is never rendered, so pass it to
    # display() explicitly.
    display(correlations_pdf.describe())
    display(correlations_pdf.head())

    return correlations_pdf

## Function to plot correlations

In [None]:
def plot_correlations(correlations_pdf, filter_nans, data=None):
    """Draw scatter plots, three per figure, for the leading rows of correlations_pdf.

    Rows are plotted in the order they appear in `correlations_pdf`, so sort
    it before calling to choose which pairs are shown.

    Args:
        correlations_pdf: DataFrame with 'x', 'y', 'MIC', 'Spearman' and
            'Pearson' columns; 'x' and 'y' name columns of the raw data.
        filter_nans: If true, rows with a NaN in either plotted column are
            dropped before plotting.
        data: DataFrame holding the raw columns. Defaults to the
            notebook-level `pdf`, which is what this function always used
            before the parameter existed.
    """
    source_pdf = pdf if data is None else data

    # Number of plots is min of 300 and number of correlations, rounded down
    # to the nearest number divisible by 3 so every figure has a full row.
    number_of_plots = (min(300, correlations_pdf.shape[0]) // 3) * 3

    for first in range(0, number_of_plots, 3):
        _, axes = plt.subplots(ncols=3, figsize=(18, 6))
        plt.subplots_adjust(wspace=.4)  # adjust horizontal space between the side-by-side plots

        for offset in range(3):
            # Look the row up once instead of once per field
            row = correlations_pdf.iloc[first + offset]
            x_name, y_name = row['x'], row['y']

            pair_pdf = source_pdf[[x_name, y_name]]
            if filter_nans:
                pair_pdf = pair_pdf.dropna()

            axes[offset].set_title(
                'MIC {:.6f}  Spearman {:.6f}  Pearson {:.6f}'.format(
                    row['MIC'], row['Spearman'], row['Pearson']))

            sns.scatterplot(x=x_name, y=y_name, data=pair_pdf, ax=axes[offset])
    plt.show()

## Function to plot coefficients

In [None]:
def plot_coefficients(correlations_pdf, x, y):
    """Show a KDE joint plot of column `x` against column `y` of correlations_pdf."""
    joint_options = {'x': x, 'y': y, 'data': correlations_pdf, 'kind': "kde"}
    sns.jointplot(**joint_options)
    plt.show()

## Basketball Statistics Experiment
[NBA Players stats since 1950](https://www.kaggle.com/drgilermo/nba-players-stats/home)

### Compute Correlations

In [None]:
# Load data set
DATA_SET_NAME = 'season_stats'
pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')

# Drop string columns
pdf = pdf.drop(['blank2', 'blanl', 'Player', 'Pos', 'Tm'], axis=1)

# A bare `pdf.describe()` is only rendered when it is the last expression of
# a notebook cell, so the summary is passed to display() explicitly.
display(pdf.describe())
print(pdf.shape)
display(pdf.head())

# Compute correlations and save them next to the input file so the plotting
# cells can reload them without recomputing.
correlations_pdf = calculate_coefficients(pdf, False)
correlations_pdf.to_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', index=False)

### Plot Correlations

In [None]:
# Reload the correlations saved by the compute cell
correlations_pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', encoding='latin-1')

# Plot correlations
# Rank by MIC - |Spearman|
mic_minus_spearman = correlations_pdf['MIC'] - correlations_pdf['Spearman'].abs()
correlations_pdf['MIC-Spearman'] = mic_minus_spearman
correlations_pdf = correlations_pdf.sort_values(by=['MIC-Spearman'], ascending=False)
print('MIC-Spearman Plots')
plot_correlations(correlations_pdf, True)

# Rank by |Spearman| - MIC
spearman_minus_mic = correlations_pdf['Spearman'].abs() - correlations_pdf['MIC']
correlations_pdf['Spearman-MIC'] = spearman_minus_mic
correlations_pdf = correlations_pdf.sort_values(by=['Spearman-MIC'], ascending=False)
print('Top Spearman-MIC Plots')
plot_correlations(correlations_pdf, True)

# Rank by MIC
correlations_pdf = correlations_pdf.sort_values(by=['MIC'], ascending=False)
print('Top MIC Plots')
plot_correlations(correlations_pdf, True)

# Rank by |Spearman|: order the index by absolute value, then reindex the frame to match
abs_spearman_order = correlations_pdf['Spearman'].abs().sort_values(ascending=False).index
correlations_pdf = correlations_pdf.reindex(abs_spearman_order)
print('Top Spearman Plots')
plot_correlations(correlations_pdf, True)

### Plot VORP correlations

In [None]:
# Plot VORP correlations ranked by maximum coefficient
correlations_pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', encoding='latin-1')

# Keep only the pairs that involve VORP. The explicit .copy() makes the
# filtered rows an independent frame, so adding a column to it does not
# trigger pandas' SettingWithCopyWarning.
involves_vorp = (correlations_pdf['x'] == 'VORP') | (correlations_pdf['y'] == 'VORP')
correlations_pdf = correlations_pdf[involves_vorp].copy()

# Largest of the three (signed) coefficients for each pair
correlations_pdf['VORP-Max-Coefficient'] = correlations_pdf[['MIC', 'Spearman', 'Pearson']].max(axis=1)
correlations_pdf.sort_values(by=['VORP-Max-Coefficient'], ascending=False, inplace=True)
plot_correlations(correlations_pdf, True)

### Plot Correlation Coefficients

In [None]:
# Reload the saved correlations
correlations_pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', encoding='latin-1')

# One joint plot per coefficient pairing: MIC vs Spearman, then Spearman vs Pearson
coefficient_plots = (
    ('MIC vs Spearman Coefficients', 'MIC', 'Spearman'),
    ('Spearman vs Pearson Coefficients', 'Spearman', 'Pearson'),
)
for plot_title, x_column, y_column in coefficient_plots:
    print(plot_title)
    plot_coefficients(correlations_pdf, x_column, y_column)

## Plot comparison of MIC, Spearman & Pearson for canonical example

In [None]:
from __future__ import division

# RandomState seeded with 0; mvnormal, rotnormal and others draw their samples from it.
rs = np.random.RandomState(seed=0)

def mysubplot(x, y, numRows, numCols, plotNum,
              xlim=(-4, 4), ylim=(-4, 4)):
    """Plot y against x in one cell of a subplot grid, titled with the three coefficients.

    Args:
        x, y: Samples to plot and to score with Pearson, Spearman and MIC.
        numRows, numCols, plotNum: Grid shape and cell position passed to plt.subplot.
        xlim, ylim: Axis limits for the cell.

    Returns:
        The axes object created for the cell.
    """
    pearson_r, _ = stats.pearsonr(x, y)
    spearman_r, _ = stats.spearmanr(x, y)

    estimator = MINE(alpha=0.6, c=15, est="mic_approx")
    estimator.compute_score(x, y)

    # Each coefficient is rounded to one decimal before being formatted into the title
    scores = tuple(np.around(value, 1)
                   for value in (pearson_r, spearman_r, estimator.mic()))

    panel = plt.subplot(numRows, numCols, plotNum, xlim=xlim, ylim=ylim)
    panel.set_title('Pearson=%.1f\nSpearman=%.1f\nMIC=%.1f' % scores, fontsize=12)

    # Strip the frame, axes and ticks so only the point cloud is drawn
    panel.set_frame_on(False)
    for hidden_axis in (panel.axes.get_xaxis(), panel.axes.get_yaxis()):
        hidden_axis.set_visible(False)
    panel.plot(x, y, ',')
    panel.set_xticks([])
    panel.set_yticks([])
    return panel

def rotation(xy, t):
    """Return `xy` right-multiplied by the 2x2 matrix [[cos t, -sin t], [sin t, cos t]]."""
    cos_t, sin_t = np.cos(t), np.sin(t)
    matrix = [[cos_t, -sin_t],
              [sin_t, cos_t]]
    return np.dot(xy, matrix)

def mvnormal(n=1000):
    """Plot n bivariate-normal samples per correlation value in cells 1-7 of the 3x7 grid."""
    correlation_levels = (1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0)
    for cell, cor in enumerate(correlation_levels, start=1):
        sample = rs.multivariate_normal([0, 0], [[1, cor], [cor, 1]], n)
        mysubplot(sample[:, 0], sample[:, 1], 3, 7, cell)

def rotnormal(n=1000):
    """Plot one sample drawn with covariance [[1, 1], [1, 1]] at seven rotation angles in cells 8-14."""
    angles = [0, np.pi/12, np.pi/6, np.pi/4, np.pi/2-np.pi/6,
              np.pi/2-np.pi/12, np.pi/2]
    # The sample is drawn once; every cell shows the same points rotated by a different angle
    base = rs.multivariate_normal([0, 0], [[1, 1], [1, 1]], n)
    for cell, angle in enumerate(angles, start=8):
        turned = rotation(base, angle)
        mysubplot(turned[:, 0], turned[:, 1], 3, 7, cell)

def others(n=1000):
    """Plot seven non-linear / non-functional relationships in cells 15-21 of the 3x7 grid.

    Every sample is drawn from the seeded module-level `rs`, so the whole row
    is reproducible from run to run.

    Args:
        n: Number of points per panel (the last panel uses four clusters of
            n // 4 points each).
    """
    # Cell 15: quartic curve with uniform noise
    x = rs.uniform(-1, 1, n)
    y = 4*(x**2-0.5)**2 + rs.uniform(-1, 1, n)/3
    mysubplot(x, y, 3, 7, 15, (-1, 1), (-1/3, 1+1/3))

    # Cell 16: uniform square rotated by -pi/8
    y = rs.uniform(-1, 1, n)
    xy = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
    xy = rotation(xy, -np.pi/8)
    lim = np.sqrt(2+np.sqrt(2)) / np.sqrt(2)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 16, (-lim, lim), (-lim, lim))

    # Cell 17: the same square rotated by a further -pi/8
    xy = rotation(xy, -np.pi/8)
    lim = np.sqrt(2)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 17, (-lim, lim), (-lim, lim))

    # Cell 18: parabola with uniform noise
    y = 2*x**2 + rs.uniform(-1, 1, n)
    mysubplot(x, y, 3, 7, 18, (-1, 1), (-1, 3))

    # Cell 19: parabola with a random sign per point. randint(0, 2) replaces
    # the deprecated random_integers(0, 1); both draw from {0, 1}.
    signs = np.array([-1, 1])[rs.randint(0, 2, size=n)]
    y = (x**2 + rs.uniform(0, 0.5, n)) * signs
    mysubplot(x, y, 3, 7, 19, (-1.5, 1.5), (-1.5, 1.5))

    # Cell 20: ring (cos/sin of the same angle) with a little uniform noise
    y = np.cos(x * np.pi) + rs.uniform(0, 1/8, n)
    x = np.sin(x * np.pi) + rs.uniform(0, 1/8, n)
    mysubplot(x, y, 3, 7, 20, (-1.5, 1.5), (-1.5, 1.5))

    # Cell 21: four unit-variance clusters. These are drawn from `rs` rather
    # than the unseeded global np.random so this panel is reproducible too.
    cluster_centres = ([3, 3], [-3, 3], [-3, -3], [3, -3])
    identity_cov = [[1, 0], [0, 1]]
    clusters = [rs.multivariate_normal(centre, identity_cov, n // 4)
                for centre in cluster_centres]
    xy = np.concatenate(clusters, axis=0)
    mysubplot(xy[:, 0], xy[:, 1], 3, 7, 21, (-7, 7), (-7, 7))

# Create one 12x9 figure with a white background. Two separate plt.figure()
# calls would open two figures: an empty white one, and a second one that
# receives the subplots but not the facecolor.
plt.figure(figsize=(12, 9), facecolor='white')

# Fill the 3x7 grid row by row
mvnormal(n=800)
rotnormal(n=200)
others(n=800)
plt.tight_layout()
plt.show()