# Correlation

## Setup

In [None]:
import pandas as pd
import altair as alt

## Data

### Import data

In [None]:
# read the possum data set directly from its GitHub URL into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/kirenz/datasets/master/possum.csv'
)

### Data structure

In [None]:
# show the DataFrame (last expression of the cell, so the notebook renders it)
df

In [None]:
# print a summary of the DataFrame: column names, non-null counts and dtypes
df.info()

### Variable lists

In [None]:
# name of the outcome (response) variable
y_label = 'head_l'

# columns shown in the scatterplot matrix (the outcome variable is included)
list_numeric = [
    'age',
    'head_l',
    'skull_w',
    'total_l',
    'tail_l',
]

## Analysis

### Descriptive analysis

In [None]:
# summary statistics, transposed so that every variable becomes one row
df.describe().transpose()

### Exploratory analysis

In [None]:
# scatterplot matrix: one 150x150 panel per (row, column) pair of list_numeric;
# zero=False lets each axis start near the data instead of being forced to 0
alt.Chart(df).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative', scale=alt.Scale(zero=False)),
    alt.Y(alt.repeat("row"), type='quantitative', scale=alt.Scale(zero=False)),
).properties(width=150, height=150).repeat(row=list_numeric, column=list_numeric)

### Correlation

Correlation methods available in the pandas function `.corr()` (selected with the `method` argument):

**Pearson correlation coefficient**: 

`pearson` : Pearson correlation coefficient 

- is only appropriate for numeric variables. 
- can only detect linear relationships.
- can be misleading if there are outliers in the data.


**Rank correlation**: 

`kendall` : Kendall Tau correlation coefficient  
or  
`spearman` : Spearman rank correlation



- is appropriate for both continuous and discrete ordinal variables.
- is better at detecting nonlinear (but monotonic) relationships than `pearson`.
- is less sensitive to outliers.


In [None]:
# Pairwise Pearson correlation matrix, rounded to two decimals.
# Only the numeric columns are passed to .corr(): since pandas 2.0 the method
# no longer drops non-numeric columns silently (numeric_only defaults to False)
# and raises a ValueError on columns it cannot convert to float, e.g. text.
# select_dtypes works on old and new pandas versions alike.
corr = df.select_dtypes(include='number').corr(method='pearson').round(2)

In [None]:
# show the rounded correlation matrix
corr

In [None]:
# colour the cells of the correlation matrix with the 'Blues' colormap
# NOTE(review): Styler.background_gradient presumably needs matplotlib installed — confirm
corr_blues = corr.style.background_gradient(cmap='Blues')

In [None]:
# show the colour-coded correlation matrix
corr_blues

In [None]:
# correlations of all variables with the response (y_label),
# sorted from the largest to the smallest coefficient
corr_list = corr.loc[:, y_label].sort_values(ascending=False)

In [None]:
# show the sorted correlations with the response variable
corr_list