In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error
%matplotlib inline

**Challenge 1**

Generate (fake) data that is linearly related to log(x).

You are making this model up. It is of the form B0 + B1*log(x) + epsilon. (You are making up the parameters.)

Simulate some data from this model.

Then fit two models to it:

quadratic (second degree polynomial)
logarithmic (log(x))
(The second one should fit really well, since it has the same form as the underlying model!)

In [5]:
dat = np.random.randint(1,10)*np.log(np.linspace(1,100))+np.random.randint(10,50)+np.random.normal()

In [6]:
# Setup DataFrame

df = pd.DataFrame(dat, columns=['Y'])
df['int'] = [1 for x in list(range(len(dat)))]
df['X1'] = [x for x in list(range(1,len(dat)+1))]

In [7]:
y = df['Y']
x = df.drop(['Y'],1)

In [8]:
# Quadratic fit

x2 = x**2

lsm = sm.OLS(y,x2)
fit = lsm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.589
Model:,OLS,Adj. R-squared:,0.581
Method:,Least Squares,F-statistic:,68.83
Date:,"Tue, 07 Feb 2017",Prob (F-statistic):,7.88e-11
Time:,16:31:03,Log-Likelihood:,-101.87
No. Observations:,50,AIC:,207.7
Df Residuals:,48,BIC:,211.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
int,37.0210,0.404,91.550,0.000,36.208 37.834
X1,0.0029,0.000,8.296,0.000,0.002 0.004

0,1,2,3
Omnibus:,42.511,Durbin-Watson:,0.095
Prob(Omnibus):,0.0,Jarque-Bera (JB):,142.188
Skew:,-2.323,Prob(JB):,1.3300000000000002e-31
Kurtosis:,9.831,Cond. No.,1730.0


In [9]:
# Logarithmic fit

logx = df.drop(['Y'], 1)
logx['X1'] = np.log(x['X1'])

lsm = sm.OLS(y,logx)
fit = lsm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.995
Model:,OLS,Adj. R-squared:,0.995
Method:,Least Squares,F-statistic:,10430.0
Date:,"Tue, 07 Feb 2017",Prob (F-statistic):,8.290000000000001e-58
Time:,16:31:04,Log-Likelihood:,10.554
No. Observations:,50,AIC:,-17.11
Df Residuals:,48,BIC:,-13.28
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
int,29.7886,0.100,299.357,0.000,29.589 29.989
X1,3.2818,0.032,102.149,0.000,3.217 3.346

0,1,2,3
Omnibus:,70.054,Durbin-Watson:,0.591
Prob(Omnibus):,0.0,Jarque-Bera (JB):,823.022
Skew:,-3.555,Prob(JB):,1.9200000000000002e-179
Kurtosis:,21.56,Cond. No.,12.0


**Challenge 2**

Generate (fake) data from a model of the form B0 + B1*x + B2*x^2 + epsilon. (You are making up the parameters.)

Split the data into a training and test set.

Fit a model to your training set. Calculate mean squared error on your training set. Then calculate it on your test set.

(You could use sklearn.metrics.mean_squared_error.)

In [31]:
logx_2 = logx.drop(['int'],1)

In [32]:
lr1 = make_pipeline(LinearRegression())
X_train, X_test, y_train, y_test = train_test_split(logx_2, y, test_size=.25)
lr1.fit(X_train, y_train)
lr1.score(X_test, y_test)

0.99344396298073245

In [39]:
y_pred_train = lr1.predict(X_train)
y_pred_test = lr1.predict(X_test)

In [40]:
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

In [41]:
mse_train

0.04727610320978403

In [42]:
mse_test

0.016577234138040978

**Challenge 3**

For the data from two (above), try polynomial fits from 0th (just constant) to 7th order (highest term x^7). Over the x axis of model degree (8 points), plot:

training error
test error
R squared
AIC

In [21]:
for degree in range(8):
    est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    est.fit(logx, y)
    plt.scatter(logx['X1'], y)
    plt.plot(logx['X1'], y)
plt.show()

IndexError: index 1 is out of bounds for axis 0 with size 1

**Challenge 4**

For the data from two (above), fit a model to only the first 5 of your data points (m=5). Then to first 10 (m=10). Then to first 15 (m=15). In this manner, keep fitting until you fit your entire training set. For each step, calculate the training error and the test error. Plot both (in the same plot) over m. This is called a learning curve.

In [29]:
est = make_pipeline(PolynomialFeatures(1), LinearRegression())
est.fit(logx, y)
est.steps[1][1].coef_
# y_pred = [(est.steps[1][1].coef_[1]*x + lr1.steps[0][1].intercept_) for x in X_train['X1']]

array([ 0.        ,  0.        ,  3.28183237])