# Correlation

## Setup

In [1]:
import pandas as pd
import altair as alt

## Data

### Import data

In [6]:
# Load the possum data set directly from the kirenz/datasets GitHub repository
df = pd.read_csv(
    "https://raw.githubusercontent.com/kirenz/datasets/master/possum.csv"
)

### Data structure

In [7]:
# Display the loaded frame to inspect its rows and columns
df

Unnamed: 0,site,pop,sex,age,head_l,skull_w,total_l,tail_l
0,1,Vic,m,8.0,94.1,60.4,89.0,36.0
1,1,Vic,f,6.0,92.5,57.6,91.5,36.5
2,1,Vic,f,6.0,94.0,60.0,95.5,39.0
3,1,Vic,f,6.0,93.2,57.1,92.0,38.0
4,1,Vic,f,2.0,91.5,56.3,85.5,36.0
...,...,...,...,...,...,...,...,...
99,7,other,m,1.0,89.5,56.0,81.5,36.5
100,7,other,m,1.0,88.6,54.7,82.5,39.0
101,7,other,f,6.0,92.4,55.0,89.0,38.0
102,7,other,m,4.0,91.5,55.2,82.5,36.5


In [8]:
# Print a summary of the columns: dtype and non-null count for each one
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   site     104 non-null    int64  
 1   pop      104 non-null    object 
 2   sex      104 non-null    object 
 3   age      102 non-null    float64
 4   head_l   104 non-null    float64
 5   skull_w  104 non-null    float64
 6   total_l  104 non-null    float64
 7   tail_l   104 non-null    float64
dtypes: float64(5), int64(1), object(2)
memory usage: 6.6+ KB


### Variable lists

In [9]:
# Outcome (response) variable of the analysis
y_label = "head_l"

# Numeric columns used for the pairwise scatter plots
list_numeric = [
    "age",
    "head_l",
    "skull_w",
    "total_l",
    "tail_l",
]

## Analysis

### Descriptive analysis

In [5]:
# Summary statistics, transposed so that every variable becomes one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
site,104.0,3.625,2.349086,1.0,1.0,3.0,6.0,7.0
age,102.0,3.833333,1.909244,1.0,2.25,3.0,5.0,9.0
head_l,104.0,92.602885,3.573349,82.5,90.675,92.8,94.725,103.1
skull_w,104.0,56.883654,3.113426,50.0,54.975,56.35,58.1,68.6
total_l,104.0,87.088462,4.310549,75.0,84.0,88.0,90.0,96.5
tail_l,104.0,37.009615,1.959518,32.0,35.875,37.0,38.0,43.0


### Exploratory analysis

In [12]:
# Scatter-plot matrix: one 150x150 panel for every pair of variables in
# `list_numeric`.

# Axis scale that is not forced to include zero
free_scale = alt.Scale(zero=False)

# Template for a single panel; the actual fields are filled in by `.repeat()`
pair_panel = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X(alt.repeat("column"), type="quantitative", scale=free_scale),
        y=alt.Y(alt.repeat("row"), type="quantitative", scale=free_scale),
    )
    .properties(width=150, height=150)
)

# Last expression of the cell, so the repeated chart is rendered as output
pair_panel.repeat(row=list_numeric, column=list_numeric)

### Correlation

Correlation methods available in the pandas function `.corr()` (selected via the `method` argument):

**Pearson correlation coefficient**: 

`pearson` : Pearson correlation coefficient 

- is only appropriate for numeric variables. 
- can only detect linear relationships.
- can be misleading if there are outliers in the data.


**Rank correlation**: 

`kendall` : Kendall Tau correlation coefficient  
or  
`spearman` : Spearman rank correlation



- is appropriate for both continuous and discrete ordinal variables.
- is better than `pearson` at detecting nonlinear relationships, as long as they are monotonic.
- is less sensitive to outliers.


In [14]:
# Pearson correlation matrix of the numeric columns, rounded to two decimals.
# The non-numeric columns (`pop`, `sex`) are excluded explicitly: since
# pandas 2.0 `DataFrame.corr()` no longer drops them silently and raises a
# ValueError when it meets string data. Selecting the numeric columns first
# works on old and new pandas versions alike.
corr = df.select_dtypes(include='number').corr(method='pearson').round(2)

In [15]:
# Display the rounded correlation matrix
corr

Unnamed: 0,site,age,head_l,skull_w,total_l,tail_l
site,1.0,-0.13,-0.16,-0.08,-0.26,0.38
age,-0.13,1.0,0.32,0.29,0.26,0.12
head_l,-0.16,0.32,1.0,0.71,0.69,0.29
skull_w,-0.08,0.29,0.71,1.0,0.53,0.26
total_l,-0.26,0.26,0.69,0.53,1.0,0.57
tail_l,0.38,0.12,0.29,0.26,0.57,1.0


In [16]:
# Styled view of the correlation matrix: cell backgrounds are shaded with
# the "Blues" colormap
corr_blues = corr.style.background_gradient(
    cmap="Blues",
)

In [17]:
# Render the colour-shaded correlation matrix
corr_blues

Unnamed: 0,site,age,head_l,skull_w,total_l,tail_l
site,1.0,-0.13,-0.16,-0.08,-0.26,0.38
age,-0.13,1.0,0.32,0.29,0.26,0.12
head_l,-0.16,0.32,1.0,0.71,0.69,0.29
skull_w,-0.08,0.29,0.71,1.0,0.53,0.26
total_l,-0.26,0.26,0.69,0.53,1.0,0.57
tail_l,0.38,0.12,0.29,0.26,0.57,1.0


In [18]:
# Correlation of every variable with the response `y_label`, sorted from the
# largest to the smallest coefficient (the response itself is included)
corr_list = corr.loc[:, y_label].sort_values(ascending=False)

In [19]:
# Display the sorted correlations with the response variable
corr_list

head_l     1.00
skull_w    0.71
total_l    0.69
age        0.32
tail_l     0.29
site      -0.16
Name: head_l, dtype: float64