In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Linear Regression

Below, we have a simple dataset dealing with mammalian sleep time.  

```
This is an updated and expanded version of the mammals sleep dataset. Updated sleep times and weights were taken from V. M. Savage and G. B. West. A quantitative, theoretical framework for understanding mammalian sleep. Proceedings of the National Academy of Sciences, 104 (3):1051-1056, 2007.
```

Let's use this to examine the relationship between brain weight and body weight in these critters.

In [None]:
# Read the mammals data; the first CSV column is used as the row index.
critters = pd.read_csv('data/mammals.csv', index_col=0)

In [None]:
# Preview the first five rows of the loaded frame.
critters.head(5)

In [None]:
# Scatter of the raw `bodywt` column (x) against the raw `brainwt` column (y).
critters.plot.scatter(x='bodywt', y='brainwt')

In [None]:
# Raw weights: the joint relationship plus each variable's distribution.
# The explicit Axes interface lets every panel carry its own title and
# axis labels, so the figure can be read without the surrounding code.
fig, (ax_scatter, ax_body, ax_brain) = plt.subplots(1, 3, figsize=(10, 4))

ax_scatter.scatter(critters.bodywt, critters.brainwt)
ax_scatter.set(title='brainwt vs. bodywt', xlabel='bodywt', ylabel='brainwt')

ax_body.hist(critters.bodywt)
ax_body.set(title='bodywt distribution', xlabel='bodywt', ylabel='count')

ax_brain.hist(critters.brainwt)
ax_brain.set(title='brainwt distribution', xlabel='brainwt', ylabel='count')

# Keep the per-panel labels from overlapping; `;` hides the return value.
fig.tight_layout();

In [None]:
# Natural-log transform of both weight columns, stored next to the raw values.
critters['log_bodywt'] = np.log(critters['bodywt'])
critters['log_brainwt'] = np.log(critters['brainwt'])

In [None]:
# Log-transformed weights: the joint relationship plus each distribution.
# The explicit Axes interface lets every panel carry its own title and
# axis labels, so the figure can be read without the surrounding code.
fig, (ax_scatter, ax_body, ax_brain) = plt.subplots(1, 3, figsize=(10, 4))

ax_scatter.scatter(critters.log_bodywt, critters.log_brainwt)
ax_scatter.set(title='log_brainwt vs. log_bodywt',
               xlabel='log_bodywt', ylabel='log_brainwt')

ax_body.hist(critters.log_bodywt)
ax_body.set(title='log_bodywt distribution', xlabel='log_bodywt', ylabel='count')

ax_brain.hist(critters.log_brainwt)
ax_brain.set(title='log_brainwt distribution', xlabel='log_brainwt', ylabel='count')

# Keep the per-panel labels from overlapping; `;` hides the return value.
fig.tight_layout();

### Similar Model

Use the code below as a starting place to examine additional correlations between the dependent variables `sleep_rem` and `awake`.


In [None]:
# Columns to transform; extend this list with any other skewed columns.
log_columns = ['bodywt', 'brainwt']  # add other relationships in need of transformation

# Work on a copy so `critters` itself is left untouched, then replace the
# listed columns with their base-10 logarithms.
log_critters = critters.copy()
log_critters[log_columns] = np.log10(log_critters[log_columns])

### SciKitLearn Example



In [None]:
from sklearn import feature_selection


# Single-feature design matrix (one column of log body weights) and target.
X = critters[['log_bodywt']].values
y = critters['log_brainwt']

# Fit log brain weight on log body weight and keep the in-sample predictions.
lm = LinearRegression().fit(X, y)
predictions = lm.predict(X)

# Second element of f_regression's result holds the per-feature p-values.
pvals = feature_selection.f_regression(X, y)[1]
coefficients, y_int = lm.coef_, lm.intercept_
r2 = lm.score(X, y)

print('P-Values: ', pvals)
print('Coefficients: ', coefficients)
print('Intercept: ', y_int)
print('R2 Score: ', r2)

In [None]:
# Left panel: data with the fitted line. Right panel: histogram of the
# residuals (prediction minus observation).
plt.figure(figsize = (10, 5))
plt.subplot(121)
plt.scatter(critters.log_bodywt, critters.log_brainwt)
plt.plot(critters.log_bodywt, predictions)

plt.subplot(122)
plt.hist(predictions - critters.log_brainwt)
print('R2 Score: {:.4f}'.format(lm.score(X, y)))
# RMSE has to compare the observed targets with the model's predictions;
# passing (X, y) would only measure how far the feature is from the target.
print('RMSE: {:.4f}'.format(np.sqrt(mean_squared_error(y, predictions))))
skew = pd.DataFrame([i for i in (predictions - critters.log_brainwt)]).skew()
print('Residual Skew: ', skew, '\nPvalues : ',pvals)

### Modularizing the Evaluation


In [None]:
def get_linear_model_metrics(X, y, algo):
    """Fit ``algo`` on ``(X, y)``, print summary metrics and plot residuals.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``feature_selection.f_regression`` and to
        ``algo.fit`` / ``algo.predict`` / ``algo.score``.
    y : array-like
        Target values; must support ``y - predictions``.
    algo : estimator
        Object exposing ``fit``, ``predict``, ``score``, ``coef_`` and
        ``intercept_``. It is fitted in place.

    Returns
    -------
    The same ``algo`` object, after fitting.
    """
    pvals = feature_selection.f_regression(X, y)[1]
    algo.fit(X, y)
    # Every metric is read from the estimator that was passed in, never from
    # a module-level model, so the report always describes `algo` itself.
    predictions = algo.predict(X)
    residuals = y - predictions
    print('P-Values: ', pvals)
    print('Coefficients: ', algo.coef_)
    print('Intercept: ', algo.intercept_)
    print('R2 Score: ', algo.score(X, y))
    plt.hist(residuals)
    return algo

In [None]:
# Bind a fresh, unfitted estimator to the global name `lm`, then fit it and
# report its metrics through the helper. The helper's return value (the
# estimator) is the cell's displayed output.
lm = LinearRegression()
get_linear_model_metrics(X = X, y = y, algo = lm)

### Intercept

In [None]:
# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so the
# single query point 0 is passed as one row with one column.
lm.predict([[0]])

In [None]:
# Same single feature and target as before, as a one-column 2-D array.
X = critters[['log_bodywt']].values
y = critters['log_brainwt']

# fit_intercept=False constrains the fitted line to pass through the origin.
lm = LinearRegression(fit_intercept=False)
lm.fit(X, y)

In [None]:
# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so the
# single query point 0 is passed as one row with one column.
lm.predict([[0]])

In [None]:
# Predict with the current `lm` so the line, residuals and metrics below all
# describe this model rather than an earlier fit.
predictions2 = lm.predict(X)

plt.figure(figsize = (10, 5))
plt.subplot(121)
plt.scatter(critters.log_bodywt, critters.log_brainwt)
plt.plot(critters.log_bodywt, predictions2)

plt.subplot(122)
plt.hist(predictions2 - critters.log_brainwt)
print('R2 Score: {:.4f}'.format(lm.score(X, y)))
# RMSE has to compare the observed targets with the model's predictions;
# passing (X, y) would only measure how far the feature is from the target.
print('RMSE: {:.4f}'.format(np.sqrt(mean_squared_error(y, predictions2))))
skew = pd.DataFrame([i for i in (predictions2 - critters.log_brainwt)]).skew()
print('Residual Skew: ', skew)

### Using `LinearRegression`

We have now seen that the data in its earliest state did not allow for the best linear regression fit.  Attempt to generate two more models using the log-transformed data to see how this changes the model's performance.  Update X and y to match the log-transformed data and add True and False to the loop so we examine each scenario.

In [None]:
# Log-transformed feature (as a one-column 2-D array) and target.
X = critters.log_bodywt.values.reshape(-1, 1)
y = critters.log_brainwt

# Fit once with an intercept and once without, reporting metrics for each.
loop = [True, False]
for boolean in loop:
    print('y-intercept:', boolean)
    lm = LinearRegression(fit_intercept=boolean)
    get_linear_model_metrics(X, y, lm)
    # Render this scenario's residual histogram before starting the next one,
    # otherwise both histograms are drawn onto the same axes.
    plt.show()
    print()

### Base Regression Classes

Compare the earlier model to implementations of the `Lasso, Ridge` and `ElasticNet` regularized models.  

In [None]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

In [None]:
# One default-configured instance of each regularized linear model.
lasso, ridge, enet = Lasso(), Ridge(), ElasticNet()

### CitiBike Example

In [None]:
# Load the bike-share data from a path relative to the notebook's working directory.
bikes = pd.read_csv('data/bikeshare.csv')

In [None]:
# Preview the first five rows of the loaded frame.
bikes.head(5)

In [None]:
from pandas.plotting import scatter_matrix
import seaborn as sns

In [None]:
# Pairwise scatter plots for the `temp`, `atemp` and `casual` columns;
# the trailing `;` hides the returned array of Axes.
scatter_matrix(bikes.loc[:, ['temp', 'atemp', 'casual']]);

In [None]:
# Pairwise correlations among `temp`, `atemp` and `casual`, shown both as a
# printed table and as a heatmap.
correlations = bikes.loc[:, ['temp', 'atemp', 'casual']].corr()
print(correlations)
sns.heatmap(correlations)

In [None]:
# Target: the `casual` column. Candidate feature sets: each temperature
# column on its own, then both together.
y = bikes.loc[:, 'casual']
x_sets = (['temp'], ['atemp'], ['temp', 'atemp'])

In [None]:
# Start from a fresh default estimator: the `lm` left over from the earlier
# intercept experiments may have been built with fit_intercept=False, which
# would silently force every model here through the origin.
lm = LinearRegression()
for x in x_sets:
    lm.fit(bikes[x], y)
    print('Score for ', x, '{:.3f}'.format(lm.score(bikes[x], y)), '\ncoefficients', lm.coef_, '\n')

Even though the two-variable model (temp + atemp) explains more variance than either variable on its own, and both variables are considered significant (p-values approaching 0), we can see that together, their coefficients are wildly different. This can introduce error in how we explain models.

What happens if we use a second variable that isn't highly correlated with temperature, like humidity?

### Multicollinearity with dummy variables


There can be a similar effect from a feature set that is a singular matrix, which is when there is a clear linear relationship among the columns of the matrix (for example, a full set of dummy columns always sums to 1 in every row).

Run through the following code on your own. What happens to the coefficients when you include all weather situations instead of just including all except one?

In [None]:
# Fresh estimator, plus one indicator column per distinct `weathersit` value.
lm = LinearRegression()
weather = pd.get_dummies(bikes['weathersit'])

In [None]:
# Fit on every weather indicator column at once, then report the score and
# the fitted coefficients (`fit` returns the estimator, so it can be chained).
print(lm.fit(weather, y).score(weather, y))
print(lm.coef_)

In [None]:
# Same model restricted to the indicator columns labelled 1, 2 and 3
# (`fit` returns the estimator, so it can be chained into `score`).
print(lm.fit(weather.loc[:, [1, 2, 3]], y).score(weather.loc[:, [1, 2, 3]], y))
print(lm.coef_)

### Problem

1. Add the dummy variables for the weather situations to the DataFrame.  
2. Find at least two additional features that are not correlated with current features but could be strong indicators for predicting guest riders.
3. Fit a model to these features, describe the results.

---

**Extra**

Generate your model using a `train_test_split`, and evaluate it on the test set.

Do this five times and determine the quality of the average of these models.

---