## Correlation

In [2]:
from __future__ import print_function, division

%matplotlib inline

import numpy as np

import brfss

import thinkstats2
import thinkplot

In [3]:
# Load the survey data through the project's brfss module.
# NOTE(review): nrows=None presumably means "read every row" -- confirm
# against brfss.ReadBrfss.
df = brfss.ReadBrfss(nrows=None)

In [4]:
def SampleRows(df, nrows, replace=False):
    """Choose a random sample of rows from a DataFrame.

    Rows are picked by integer position rather than by index label, so the
    result always has exactly `nrows` rows, even when `df.index` contains
    duplicate labels (a label-based lookup would return every row that
    shares a chosen label).

    df: DataFrame
    nrows: number of rows to draw
    replace: whether to sample with replacement

    returns: DataFrame holding the sampled rows

    raises: ValueError (from np.random.choice) if replace is False and
            nrows exceeds the number of rows in df
    """
    positions = np.random.choice(len(df), nrows, replace=replace)
    sample = df.iloc[positions]
    return sample

In [5]:
# Draw 5000 random rows and pull out the height (htm3) and weight (wtkg2)
# columns of that sample.
sample = SampleRows(df, 5000)
heights = sample.htm3
weights = sample.wtkg2

In [8]:
# Keep only the rows where both htm3 and wtkg2 are present; dropna removes
# any row with a missing value in either of the listed columns.
cleaned = df.dropna(subset=['htm3','wtkg2'])

#### Covariance

In [9]:
def Cov(xs, ys, meanx=None, meany=None):
    """Computes Cov(X, Y) from paired values.

    The deviations of xs and ys from their means are combined with a dot
    product, and the result is divided by len(xs).

    xs: sequence of values
    ys: sequence of values
    meanx: optional mean of xs; computed with np.mean when None
    meany: optional mean of ys; computed with np.mean when None

    returns: Cov(X, Y)
    """
    xs, ys = np.asarray(xs), np.asarray(ys)

    # Fall back to the computed mean for whichever centre was not supplied.
    meanx = np.mean(xs) if meanx is None else meanx
    meany = np.mean(ys) if meany is None else meany

    dx = xs - meanx
    dy = ys - meany
    return np.dot(dx, dy) / len(xs)

In [10]:
# Rebind heights and weights to the htm3 / wtkg2 columns of `cleaned`,
# then evaluate their covariance; the bare Cov(...) expression is what the
# cell displays.
heights = cleaned.htm3
weights = cleaned.wtkg2
Cov(heights, weights)

103.33290857697797

In [14]:
def Corr(xs, ys):
    """Computes Corr(X, Y) as covariance over the root of the variance product.

    xs: sequence of values
    ys: sequence of values

    returns: Cov(xs, ys) divided by sqrt(varx * vary)
    """
    xs, ys = np.asarray(xs), np.asarray(ys)

    # NOTE(review): thinkstats2.MeanVar is unpacked here as (mean, variance);
    # confirm against thinkstats2 which variance estimator it uses.
    meanx, varx = thinkstats2.MeanVar(xs)
    meany, vary = thinkstats2.MeanVar(ys)

    # Pass the means along so Cov does not compute them a second time.
    covariance = Cov(xs, ys, meanx, meany)
    scale = np.sqrt(varx * vary)
    return covariance / scale

In [15]:
# Correlation of height and weight, using the Corr function defined in this
# notebook.
Corr(heights, weights)

0.5087364789734771

In [16]:
# Cross-check with NumPy: corrcoef returns the correlation-coefficient
# matrix, whose off-diagonal entries are the height/weight correlation.
np.corrcoef(heights, weights)

array([[1.        , 0.50873648],
       [0.50873648, 1.        ]])

In [17]:
import pandas as pd

In [18]:
def SpearmanCorr(xs, ys):
    """Computes a rank correlation by applying Corr to the ranks of the data.

    Each input is wrapped in a pandas Series and replaced by its ranks
    (Series.rank), and the two rank sequences are passed to Corr.

    xs: sequence of values
    ys: sequence of values

    returns: Corr of the ranks of xs and the ranks of ys
    """
    ranked = [pd.Series(values).rank() for values in (xs, ys)]
    return Corr(*ranked)

In [19]:
# Rank-based correlation of height and weight, using the SpearmanCorr
# defined just above (ranks fed to Corr).
SpearmanCorr(heights, weights)

0.5405846262320476

In [20]:
def SpearmanCorr(xs, ys):
    """Computes Spearman's rank correlation using pandas.

    Both inputs are wrapped in pandas Series and the work is delegated to
    Series.corr with method='spearman'.

    xs: sequence of values
    ys: sequence of values

    returns: the value returned by Series.corr
    """
    left, right = pd.Series(xs), pd.Series(ys)
    return left.corr(right, method='spearman')

In [21]:
# Same computation with the redefined, pandas-based SpearmanCorr, so the
# result can be compared with the rank-then-Corr version.
SpearmanCorr(heights, weights)

0.5405846262324122

In [22]:
# Correlation of height with the natural log of weight, computed on the
# rows of `cleaned`.
Corr(cleaned.htm3, np.log(cleaned.wtkg2))

0.5317282605983465