In [1]:
import sys
sys.path.append('..')

from sklearn.datasets import load_diabetes
from sklearn.preprocessing import minmax_scale
import numpy as np
import korr

Load the demo data set.

In [2]:
# Fetch the diabetes data set once (it was previously loaded twice) and
# binarise it: the target and every feature column are min-max scaled and
# then thresholded at 0.5, which yields boolean arrays.
X_raw, y_raw = load_diabetes(return_X_y=True)
y = minmax_scale(y_raw) > 0.5
X = minmax_scale(X_raw, axis=0) > 0.5

Compute the correlations.

In [3]:
%%time
y_rho, y_pval, x_corr, x_pval = korr.slice_yx(*korr.mcc(np.c_[y, X]), ydim=1)

CPU times: user 27.1 ms, sys: 4.24 ms, total: 31.3 ms
Wall time: 29.4 ms


## What does "best" mean?
Find the best correlations, i.e. select them in the following order:

1. The highest `|r|` values that satisfy `|r|>=rlim` and `p<plim`
2. From the remaining, the next-highest `|r|` values that still satisfy `p<plim`
3. From the remaining, the lowest `p` values

## What does "worst" mean?
Find the worst correlations, i.e. select them in the following order:

1. The highest `p` values that satisfy `p>plim` and `|r|<rlim`
2. From the remaining, the next-highest `p` values that still satisfy `|r|<rlim`
3. From the remaining, the lowest `|r|` values

## Find the best and worst
`y_rho` and `y_pval` are 1D vectors

In [4]:
# The bare tuple is the cell's last expression, so both vectors are rendered
# as the cell output.
y_rho, y_pval

(array([ 0.15184347,  0.04412473,  0.3570418 ,  0.35969325,  0.22976149,
         0.08161941, -0.24120479,  0.2036893 ,  0.46442679,  0.14314348]),
 array([1.41131872e-03, 3.53579142e-01, 6.08402217e-14, 3.96349620e-14,
        1.36218919e-06, 8.61712396e-02, 3.95646823e-07, 1.84952766e-05,
        0.00000000e+00, 2.61747636e-03]))

Find the five "best" relationships.

In [5]:
# Pick m=5 entries with the thresholds rlim (applied to |r|) and plim (applied to p).
idx = korr.find_best(y_rho, y_pval, m=5, rlim=0.4, plim=0.01)
# One row per selected entry: index, correlation, p-value rounded to 2 decimals.
best_table = np.column_stack((idx, y_rho[idx], np.round(y_pval[idx], 2)))
print(best_table)

[[ 8.          0.46442679  0.        ]
 [ 3.          0.35969325  0.        ]
 [ 2.          0.3570418   0.        ]
 [ 6.         -0.24120479  0.        ]
 [ 4.          0.22976149  0.        ]]


Find the five "worst" relationships, i.e. the correlations that look unrelated.

In [6]:
# Pick m=5 entries; rlim and plim are left at the defaults of korr.find_worst.
idx = korr.find_worst(y_rho, y_pval, m=5)
# One row per selected entry: index, correlation, p-value rounded to 2 decimals.
worst_table = np.column_stack((idx, y_rho[idx], np.round(y_pval[idx], 2)))
print(worst_table)

[[1.         0.04412473 0.35      ]
 [5.         0.08161941 0.09      ]
 [9.         0.14314348 0.        ]
 [0.         0.15184347 0.        ]
 [7.         0.2036893  0.        ]]


## Process Correlation Matrices
`x_corr` and `x_pval` are 2D matrices.
First flatten these matrices, i.e. convert them to a pair index table.

In [7]:
# Flatten the two matrices into a single DataFrame; the result has the
# columns `i`, `j`, `cor` and `pval` (see the tables displayed further down).
df = korr.flatten(x_corr, x_pval)
# Optional preview of the first rows: df.head()

In [8]:
# Pull the flattened correlations and p-values out of the DataFrame as arrays.
cor_flat = df['cor'].values
pval_flat = df['pval'].values
# Row positions of the ten "best" pairs, followed by the matching rows.
idx = korr.find_best(cor_flat, pval_flat, m=10, rlim=0.4, plim=0.01)
df.iloc[idx]

Unnamed: 0,i,j,cor,pval
30,4,5,0.619078,0.0
36,5,7,0.414115,0.0
33,4,8,0.40432,0.0
42,7,8,0.362236,2.620126e-14
2,0,3,0.332697,2.660872e-12
44,8,9,0.322831,1.143852e-11
32,4,7,0.319981,1.729528e-11
28,3,8,0.315754,3.172407e-11
13,1,6,-0.285395,1.972194e-09
40,6,8,-0.275855,6.650564e-09


In [9]:
# Pull the flattened correlations and p-values out of the DataFrame as arrays.
cor_flat = df['cor'].values
pval_flat = df['pval'].values
# Row positions of the ten "worst" pairs, followed by the matching rows.
idx = korr.find_worst(cor_flat, pval_flat, m=10, rlim=0.01, plim=0.45)
df.iloc[idx]

Unnamed: 0,i,j,cor,pval
31,4,6,0.002757,0.953785
9,1,2,0.010208,0.830073
5,0,6,0.013254,0.780511
1,0,2,-0.016247,0.732666
6,0,7,0.027092,0.56896
11,1,4,0.057247,0.228761
25,3,5,0.065148,0.170795
19,2,5,0.078347,0.099525
4,0,5,0.100039,0.035448
12,1,5,0.111642,0.018918
