The intent of this experiment is to determine whether the Maximal Information Coefficient (MIC) can be used to identify non-linear relationships.

In [None]:
import pandas as pd
import numpy as np

import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

from minepy import MINE

# Directory holding the input CSV files and the cached correlation results.
# The trailing slash matters: file names are appended by plain string
# concatenation further down.
DATA_DIR_NAME = '/Users/karenblakemore/Koverse/data/'

# The plotting code opens many figures; a threshold of 0 is intended to
# silence matplotlib's "too many open figures" warning.
plt.rcParams['figure.max_open_warning'] = 0

## Function to calculate MIC and Spearman coefficients for a data set.

In [None]:
CORRELATION_COLUMNS = ['x', 'y', 'MIC', 'Spearman']


def calculate_coefficients(pdf, filter_zeros):
    """Compute MIC and Spearman coefficients for every pair of columns.

    Args:
        pdf: DataFrame whose columns are paired up two at a time.
        filter_zeros: If true, rows in which either column of the pair
            equals zero are removed before that pair is scored.

    Returns:
        DataFrame with one row per column pair and the columns 'x', 'y',
        'MIC', 'Spearman' and 'correlation_difference'
        (MIC minus the absolute Spearman coefficient), sorted by
        'correlation_difference' in descending order.
    """
    # Initialize MINE parameters
    mine = MINE(alpha=.6, c=15, est="mic_approx")

    # Rows are gathered in a plain list and turned into a DataFrame once at
    # the end. DataFrame.append copied the whole frame on every call and was
    # removed in pandas 2.0.
    rows = []

    # Calculate MIC and Spearman for all pairs of columns
    for idx, (x_name, y_name) in enumerate(combinations(list(pdf.columns.values), 2)):
        pair_pdf = pdf[[x_name, y_name]]

        # Filter out rows with zero values
        if filter_zeros:
            pair_pdf = pair_pdf.loc[~(pair_pdf == 0).any(axis=1)]

        mine.compute_score(pair_pdf[x_name], pair_pdf[y_name])
        mic = mine.mic()
        rho, _ = stats.spearmanr(pair_pdf[x_name], pair_pdf[y_name])

        rows.append({'x': x_name,
                     'y': y_name,
                     'MIC': mic,
                     'Spearman': rho})

        # Progress output: print every 10th calculation
        if idx % 10 == 0:
            print(idx, mic, rho, x_name, y_name)

    # Passing `columns` keeps the expected layout even when there are no pairs
    correlations_pdf = pd.DataFrame(rows, columns=CORRELATION_COLUMNS)

    # Sort by difference between MIC & Spearman
    correlations_pdf['correlation_difference'] = (
        correlations_pdf['MIC'] - correlations_pdf['Spearman'].abs()
    )
    correlations_pdf = correlations_pdf.sort_values(
        by='correlation_difference', ascending=False
    )

    print(correlations_pdf.shape)
    # NOTE(review): display() is not imported in this file; presumably it is
    # supplied by the IPython/Jupyter session.
    display(correlations_pdf.head())

    return correlations_pdf
            

## Function to plot correlations

In [None]:
def plot_correlations(correlation_pdf, filter_zeros, data=None, max_plots=300):
    """Draw scatter plots for the leading rows of a correlations table.

    Pairs are plotted in the table's existing row order, three per figure.
    Each plot is titled with the pair's MIC and Spearman values.

    Args:
        correlation_pdf: DataFrame with 'x', 'y', 'MIC' and 'Spearman'
            columns, where 'x' and 'y' hold column names of ``data``.
        filter_zeros: If true, rows in which either plotted column equals
            zero are removed before plotting.
        data: DataFrame containing the columns named in 'x' and 'y'.
            Defaults to the module-level ``pdf``.
        max_plots: Upper bound on the number of pairs plotted. The default
            of 300 is the count that used to be hard-coded.
    """
    if data is None:
        data = pdf

    # Never index past the end of the table
    n_plots = min(max_plots, len(correlation_pdf))

    for start in range(0, n_plots, 3):
        _, axes = plt.subplots(ncols=3, figsize=(15, 5))
        plt.subplots_adjust(wspace=.4)  # adjust horizontal space between plots

        for offset, ax in enumerate(axes):
            row_idx = start + offset
            if row_idx >= n_plots:
                # Fewer than three pairs left for the last figure
                ax.set_visible(False)
                continue

            row = correlation_pdf.iloc[row_idx]
            pair_pdf = data[[row['x'], row['y']]]
            if filter_zeros:
                pair_pdf = pair_pdf.loc[~(pair_pdf == 0).any(axis=1)]

            ax.set_title('MIC {:.6f}  Spearman {:.6f}'.format(row['MIC'], row['Spearman']))
            sns.scatterplot(x=row['x'], y=row['y'], data=pair_pdf, ax=ax)

## Assay Matrix Experiment
Compute correlations for pairs of assays relative to chemical reactions. The data set is the [ToxCast & Tox21 Data Spreadsheet](https://www.epa.gov/chemical-research/toxicity-forecaster-toxcasttm-data).

In [None]:
# Read the assay matrix; the first CSV column is used as the row index
DATA_SET_NAME = 'zscore_Matrix_151020'
pdf = pd.read_csv(f'{DATA_DIR_NAME}{DATA_SET_NAME}.csv', index_col=0, encoding='latin-1')
print(pdf.shape)

# Missing values become zero ...
pdf = pdf.fillna(0)

# ... and columns that contain nothing but zeros are discarded
has_nonzero_value = (pdf != 0).any(axis=0)
pdf = pdf.loc[:, has_nonzero_value]

pdf.describe()
print(pdf.shape)
display(pdf.head())

# The coefficients are read from a cached CSV. To regenerate the cache,
# uncomment the two lines below.
#correlations_pdf = calculate_coefficients(pdf, True)
#correlations_pdf.to_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', index=False)
correlations_pdf = pd.read_csv(f'{DATA_DIR_NAME}{DATA_SET_NAME}_correlations.csv', encoding='latin-1')
display(correlations_pdf.head())

# Scatter plots for the loaded pairs.
# NOTE(review): the commented-out calculation passes filter_zeros=True while
# the plots are drawn with filter_zeros=False -- confirm this is intended.
plot_correlations(correlations_pdf, False)

## Basketball Statistics Test
Compute correlations for [NBA Players stats since 1950](https://www.kaggle.com/drgilermo/nba-players-stats/home)

In [None]:
# Load data set
DATA_SET_NAME = 'season_stats'
pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv', encoding='latin-1')

# Impute missing values with the column mean.
# numeric_only=True is required because the string columns are still present
# at this point: since pandas 2.0, DataFrame.mean() no longer skips
# non-numeric columns and raises TypeError on them instead.
pdf = pdf.fillna(pdf.mean(numeric_only=True))

# Drop string columns (and 'blank2' / 'blanl' -- presumably empty placeholder
# columns; verify against the CSV)
pdf = pdf.drop(['blank2', 'blanl', 'Player', 'Pos', 'Tm'], axis=1)
print(pdf.shape)
display(pdf.head())

# Compute correlations or load pre-calculated ones
#correlations_pdf = calculate_coefficients(pdf, False)
#correlations_pdf.to_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', index=False)
correlations_pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '_correlations.csv', encoding='latin-1')
display(correlations_pdf.head())

# Plot correlations
plot_correlations(correlations_pdf, False)