In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

# Extract Radon Data and do EDA

In [None]:
# Load the radon measurements from a CSV file given by a relative path.
df = pd.read_csv(filepath_or_buffer='../data/radon.csv')

In [None]:
# Preview the response (log_radon), the grouping column (county) and the predictor (floor).
df.loc[:, ['log_radon', 'county', 'floor']].head(8)

In [None]:
# Count the non-null idnum entries per county and show the first ten counties.
df.groupby('county')[['idnum']].count().head(10)

__Note__: the number of observations varies widely from county to county.

In [None]:
def plot_county_data(X_county, y_county, dither_delta=0.05, rng=None):
    """Scatter one county's observations on the current axes with horizontal jitter.

    The 'floor' values are shifted by uniform noise so that overlapping points
    become visible.

    Parameters
    ----------
    X_county : DataFrame-like
        Must provide a 'floor' column with one entry per observation.
    y_county : sequence
        Response values, same length as ``X_county``.
    dither_delta : float, optional
        Total width of the jitter interval; the noise is drawn uniformly from
        ``[-dither_delta / 2, dither_delta / 2)``. Defaults to 0.05.
    rng : numpy.random.Generator, optional
        Random generator used for the jitter. Pass a seeded generator to get
        reproducible figures. When omitted, NumPy's global random state is
        used (the historical behaviour).
    """
    n_points = len(y_county)
    draws = np.random.rand(n_points) if rng is None else rng.random(n_points)
    # Centre the uniform [0, 1) draws around zero and scale to the jitter width.
    dither = dither_delta * draws - dither_delta / 2.
    plt.plot(X_county['floor'] + dither, y_county, 'ko', alpha=0.5)

In [None]:
# The eight counties shown in every 2x4 comparison figure, in panel order.
counties_to_plot = [
    'LAC QUI PARLE',
    'AITKIN',
    'KOOCHICHING',
    'DOUGLAS',
    'CLAY',
    'STEARNS',
    'RAMSEY',
    'ST LOUIS',
]

# Complete Pooled Linear Regression
Fit a single linear model for all the data. 

In [None]:
def plot_county_lm(lm, county_name, linestyle='b-', linewidth=3):
    """Draw the regression line of a fitted single-feature model on the current axes.

    Parameters
    ----------
    lm : fitted estimator
        Must expose ``predict``; it is evaluated on 100 evenly spaced values
        of its single feature between 0 and 1.
    county_name : str
        Used as the panel title.
    linestyle : str, optional
        Matplotlib format string for the line.
    linewidth : float, optional
        Width of the line.
    """
    floor_grid = np.linspace(0, 1, 100)
    # Reuse the column name the model saw during fit. An unnamed (integer 0)
    # column makes scikit-learn warn about feature names on every predict().
    feature_names = getattr(lm, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == 1:
        xx = pd.DataFrame({feature_names[0]: floor_grid})
    else:
        # Model was fitted without named features: pass a plain 2-D array.
        xx = floor_grid.reshape(-1, 1)
    yy = lm.predict(xx)
    plt.plot(floor_grid, yy, linestyle, linewidth=linewidth)
    plt.ylim([-1, 3])
    plt.title(county_name)
    plt.grid(True)

In [None]:
# Grouping labels, single predictor (kept as a one-column frame) and response.
clusters = df['county']
X = df[['floor']]
y = df['log_radon']

In [None]:
# Only the floor feature is used as a predictor; the intercept is fitted
# as well because fit_intercept=True (the default).
global_lm = LinearRegression(fit_intercept=True)
global_lm.fit(X=X, y=y)

In [None]:
# Complete pooling: the same global regression line is drawn in every county panel.
fig, axes = plt.subplots(nrows=2, ncols=4, sharex='all', sharey='all', figsize=(16, 8))
# Calling plt.ylabel right after plt.subplots labels only the current axes,
# i.e. the last one created (bottom-right). The y axis is shared, so label
# the left-most panel of each row instead.
for row_axes in axes:
    row_axes[0].set_ylabel('log(radon)')
for ax, county_name in zip(axes.flat, counties_to_plot):
    plt.sca(ax)
    county_mask = (clusters == county_name)
    X_county = X[county_mask]
    y_county = y[county_mask]
    plot_county_lm(global_lm, county_name)
    plot_county_data(X_county, y_county)

# No Pooling Linear Regression
Fit a separate model per cluster.

In [None]:
# No pooling: fit an independent regression per plotted county and keep each
# fitted model in county_lm (keyed by county name) for the later comparisons.
county_lm = {}
fig, axes = plt.subplots(nrows=2, ncols=4, sharex='all', sharey='all', figsize=(16, 8))
# Calling plt.ylabel right after plt.subplots labels only the current axes,
# i.e. the last one created (bottom-right). The y axis is shared, so label
# the left-most panel of each row instead.
for row_axes in axes:
    row_axes[0].set_ylabel('log(radon)')
for ax, county_name in zip(axes.flat, counties_to_plot):
    plt.sca(ax)
    county_mask = (clusters == county_name)
    X_county = X[county_mask]
    y_county = y[county_mask]
    lm = LinearRegression()
    lm.fit(X_county, y_county)
    county_lm[county_name] = lm
    # Thin dashed blue: pooled model for reference; thick red: this county's own fit.
    plot_county_lm(global_lm, county_name, 'b--', 1)
    plot_county_lm(lm, county_name, 'r-')
    plot_county_data(X_county, y_county)

# Linear Regression with dummies
Fit a single linear model to all the data, with one dummy variable per county to account for *county* effects.

In [None]:
def plot_county_lm_dummy(lm, counties, county_name, linestyle='k-', linewidth=3):
    """Draw the dummy-variable model's regression line for one county on the current axes.

    Parameters
    ----------
    lm : fitted estimator
        Model whose design matrix has one indicator column per entry of
        ``counties`` (in that order) followed by a 'floor' column.
    counties : sequence of str
        County names in the column order used when fitting ``lm``.
    county_name : str
        County to draw; its indicator column is set to 1. Also used as the
        panel title.
    linestyle : str, optional
        Matplotlib format string for the line.
    linewidth : float, optional
        Width of the line.

    Raises
    ------
    ValueError
        If ``county_name`` is not one of ``counties``.
    """
    county_columns = list(counties)
    if county_name not in county_columns:
        # Previously this silently added an extra column and failed later
        # inside predict() with a shape mismatch.
        raise ValueError('unknown county: {!r}'.format(county_name))
    floor_grid = np.linspace(0, 1, 100)
    nx = pd.DataFrame(np.zeros(shape=(len(floor_grid), len(county_columns))),
                      columns=county_columns)
    nx[county_name] = 1
    # Name the predictor column 'floor' (not the integer 0) so every column
    # name is a string and matches the frame used at fit time. scikit-learn
    # rejects frames whose column names mix str and int.
    nx['floor'] = floor_grid
    yy = lm.predict(nx)
    plt.plot(floor_grid, yy, linestyle, linewidth=linewidth)
    plt.ylim([-1, 3])
    plt.title(county_name)
    plt.grid(True)

In [None]:
# One 0/1 indicator column per county, in order of first appearance.
# Building the frame once from a dict avoids growing it column by column,
# and .astype(int) replaces the element-wise Python lambda.
counties = df['county'].unique()
dummies = pd.DataFrame({
    county: (df['county'] == county).astype(int)
    for county in counties
})

In [None]:
# Design matrix: the county indicator columns followed by the floor predictor,
# matched row by row on the index.
DX = pd.merge(dummies, X, how='inner', left_index=True, right_index=True)

In [None]:
# No global intercept is fitted: every county has its own indicator column,
# so each indicator's coefficient acts as that county's intercept.
dummy_lm = LinearRegression(fit_intercept=False)
dummy_lm.fit(X=DX, y=y)

In [None]:
# Compare the dummy-variable model (thick black) with the per-county fits
# (dashed red) and the pooled fit (dashed blue).
fig, axes = plt.subplots(nrows=2, ncols=4, sharex='all', sharey='all', figsize=(16, 8))
# Calling plt.ylabel right after plt.subplots labels only the current axes,
# i.e. the last one created (bottom-right). The y axis is shared, so label
# the left-most panel of each row instead.
for row_axes in axes:
    row_axes[0].set_ylabel('log(radon)')
for ax, county_name in zip(axes.flat, counties_to_plot):
    plt.sca(ax)
    county_mask = (clusters == county_name)
    X_county = X[county_mask]
    y_county = y[county_mask]
    plot_county_lm_dummy(dummy_lm, counties, county_name)
    plot_county_lm(county_lm[county_name], county_name, 'r--', 1)
    plot_county_lm(global_lm, county_name, 'b--', 1)
    plot_county_data(X_county, y_county)

 # Linear Mixed Effects Modelling with Random Intercept
 
 Use mixed-effects modelling to fit a random intercept for each cluster while learning a single slope shared by all clusters.

In [None]:
# Re-select the response and the one-column predictor frame from df.
y = df['log_radon']
X = df[['floor']]

In [None]:
# Mixed model with log_radon regressed on floor and the observations grouped by county.
md = smf.mixedlm(formula="log_radon ~ floor", data=df, groups=df['county'])
mdf = md.fit()

In [None]:
# Show the fitted parameters; the plotting helper later reads the
# 'Intercept' and 'floor' entries.
mdf.params

In [None]:
# Look up the 'Group' entry of the random effects stored for AITKIN county.
mdf.random_effects['AITKIN']['Group']

In [None]:
def plot_county_lme(lme, county_name, linestyle='g-', linewidth=3):
    """Draw one county's line from a random-intercept mixed model on the current axes.

    The line is the fixed intercept plus the fixed floor slope times x, shifted
    by the county's 'Group' random effect, evaluated at 100 points in [0, 1].

    Parameters
    ----------
    lme : fitted mixed-model result
        Must expose ``params`` (with 'Intercept' and 'floor') and
        ``random_effects`` (keyed by county, each with a 'Group' entry).
    county_name : str
        County whose random effect is applied; also the panel title.
    linestyle : str, optional
        Matplotlib format string for the line.
    linewidth : float, optional
        Width of the line.
    """
    floor_grid = np.linspace(0, 1, 100)
    fixed_intercept = lme.params['Intercept']
    fixed_slope = lme.params['floor']
    county_offset = lme.random_effects[county_name]['Group']
    fitted = fixed_intercept + fixed_slope * floor_grid + county_offset
    plt.plot(floor_grid, fitted, linestyle, linewidth=linewidth)
    plt.title(county_name)
    plt.grid(True)
    plt.ylim([-1, 3])

In [None]:
# Compare the random-intercept model (thick green) with the dummy-variable
# model (dashed black), the per-county fits (dashed red) and the pooled fit
# (dashed blue).
fig, axes = plt.subplots(nrows=2, ncols=4, sharex='all', sharey='all', figsize=(16, 8))
# Calling plt.ylabel right after plt.subplots labels only the current axes,
# i.e. the last one created (bottom-right). The y axis is shared, so label
# the left-most panel of each row instead.
for row_axes in axes:
    row_axes[0].set_ylabel('log(radon)')
for ax, county_name in zip(axes.flat, counties_to_plot):
    plt.sca(ax)
    county_mask = (clusters == county_name)
    X_county = X[county_mask]
    y_county = y[county_mask]
    plot_county_lme(mdf, county_name)
    plot_county_lm_dummy(dummy_lm, counties, county_name, 'k--', 1)
    plot_county_lm(county_lm[county_name], county_name, 'r--', 1)
    plot_county_lm(global_lm, county_name, 'b--', 1)
    plot_county_data(X_county, y_county)

# Linear Mixed Effects Modelling with Random Slope

In [None]:
# Same fixed-effects formula and grouping as the random-intercept model, with
# re_formula="~floor" supplied for the random-effects part.
md_rs = smf.mixedlm(
    formula="log_radon ~ floor",
    data=df,
    groups=df['county'],
    re_formula="~floor",
)
mdf_rs = md_rs.fit()

In [None]:
# Show the random effects stored for AITKIN county in the random-slope fit;
# the plotting helper reads its 'Group' and 'floor' entries.
mdf_rs.random_effects['AITKIN']

In [None]:
def plot_county_random_slope(lme, county_name, linestyle='m-', linewidth=3):
    """Draw one county's line from a random-slope mixed model on the current axes.

    The line combines the fixed 'Intercept' and 'floor' parameters with the
    county's 'Group' (offset) and 'floor' (slope) random effects, evaluated at
    100 points in [0, 1].

    Parameters
    ----------
    lme : fitted mixed-model result
        Must expose ``params`` (with 'Intercept' and 'floor') and
        ``random_effects`` (keyed by county, each with 'Group' and 'floor').
    county_name : str
        County whose random effects are applied; also the panel title.
    linestyle : str, optional
        Matplotlib format string for the line.
    linewidth : float, optional
        Width of the line.
    """
    floor_grid = np.linspace(0, 1, 100)
    county_effects = lme.random_effects[county_name]
    fixed_part = lme.params['Intercept'] + lme.params['floor'] * floor_grid
    # Keep the original summation order: fixed line, then offset, then slope term.
    fitted = fixed_part + county_effects['Group'] + county_effects['floor'] * floor_grid
    plt.plot(floor_grid, fitted, linestyle, linewidth=linewidth)
    plt.title(county_name)
    plt.grid(True)
    plt.ylim([-1, 3])

In [None]:
# Compare the random-slope model (thick magenta) with the random-intercept
# model (dashed green), the dummy-variable model (dashed black), the
# per-county fits (dashed red) and the pooled fit (dashed blue).
fig, axes = plt.subplots(nrows=2, ncols=4, sharex='all', sharey='all', figsize=(16, 8))
# Calling plt.ylabel right after plt.subplots labels only the current axes,
# i.e. the last one created (bottom-right). The y axis is shared, so label
# the left-most panel of each row instead.
for row_axes in axes:
    row_axes[0].set_ylabel('log(radon)')
for ax, county_name in zip(axes.flat, counties_to_plot):
    plt.sca(ax)
    county_mask = (clusters == county_name)
    X_county = X[county_mask]
    y_county = y[county_mask]
    plot_county_random_slope(mdf_rs, county_name)
    plot_county_lme(mdf, county_name, 'g--', 1)
    plot_county_lm_dummy(dummy_lm, counties, county_name, 'k--', 1)
    plot_county_lm(county_lm[county_name], county_name, 'r--', 1)
    plot_county_lm(global_lm, county_name, 'b--', 1)
    plot_county_data(X_county, y_county)