In [None]:
import pandas as pd

from sklearn import metrics
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, \
    cross_val_predict

In [None]:
# Load the numeric Lobsters story features from the shared data directory.
stories = pd.read_csv(filepath_or_buffer='../data/lobsters_numeric.csv')

In [None]:
# Peek at the first five rows to sanity-check the load.
stories.head(n=5)

In [None]:
# Promote the 'Unnamed: 0' column to the index.
# The guard keeps this cell idempotent: on a second run the column has already
# become the index, and an unguarded set_index would raise KeyError.
if 'Unnamed: 0' in stories.columns:
    stories = stories.set_index('Unnamed: 0')

### Setting up test and train data

In [None]:
# Show the docstring for train_test_split. `help()` is plain Python, so the
# output is stored with the cell and the notebook still runs when exported to
# a script (IPython's trailing `?` does neither).
help(train_test_split)

In [None]:
# Target: the story score. Features: every remaining column as a NumPy array.
y = stories.score
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and then
# removed in pandas 1.0; `.values` is available on old and new pandas alike.
X = stories.drop(['score'], axis=1).values

In [None]:
# Hold out a third of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Training our Model

In [None]:
# Fit a random forest on the training split.
# The forest is seeded (same seed as the train/test split) so that the metrics,
# feature importances and the saved model are reproducible across runs.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

In [None]:
# Predict scores for the held-out rows and plot them against the true scores.
y_pred = rfr.predict(X_test)

plt.scatter(y_test, y_pred)
plt.xlabel("Scores: $Y_i$")
# Raw strings: the LaTeX "\hat" would otherwise be an invalid Python escape
# sequence (a warning today, slated to become an error). The rendered label
# text is unchanged.
plt.ylabel(r"Predicted scores: $\hat{Y}_i$")
plt.title(r"Scores vs Predicted scores: $Y_i$ vs $\hat{Y}_i$")

In [None]:
# Mean squared error of the predictions on the held-out split.
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Explained-variance score of the predictions on the held-out split.
metrics.explained_variance_score(y_true=y_test, y_pred=y_pred)

### Looking at our model with cross validation

In [None]:
# Cross-validated predictions over the training split (cv=10), produced by a
# fresh, unfitted forest. The forest is seeded so the plot below is repeatable.
rfr_cv = RandomForestRegressor(random_state=42)
predicted = cross_val_predict(rfr_cv, X_train, y_train, cv=10)

fig, ax = plt.subplots()
ax.scatter(y_train, predicted, edgecolors=(0, 0, 0))
# Dashed red diagonal: where the points would fall if predicted == measured.
ax.plot([y_train.min(), y_train.max()],
        [y_train.min(), y_train.max()], 'r--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

### Something is definitely fishy... 

Here's an example from the scikit-learn documentation of how a cross-validated prediction chart for linear regression "normally" looks.

![](http://scikit-learn.org/stable/_images/sphx_glr_plot_cv_predict_001.png)

### Investigating which features might be causing the overfitting

In [None]:
# Display the fitted forest's `estimators_` attribute (per the scikit-learn
# docs, the collection of fitted sub-estimators, i.e. the individual trees).
rfr.estimators_

In [None]:
# Display the fitted forest's per-feature importances; zipped against the
# feature column names further down.
rfr.feature_importances_

In [None]:
# Histogram of the importance values to see how they are distributed.
pd.Series(data=rfr.feature_importances_).hist()

In [None]:
# Feature column labels: every column except the target.
x_cols = stories.drop(labels='score', axis='columns').columns

In [None]:
# Print every feature whose importance exceeds 0.1, paired with its name.
dominant = (
    (name, weight)
    for name, weight in zip(x_cols, rfr.feature_importances_)
    if weight > .1
)
for name, weight in dominant:
    print(name, weight)

### Your Turn

- Is this a good set of features? Do we possibly have leakage? Why or why not?

### Save our model

In [None]:
# Persist the fitted random forest to disk.
try:
    # Standalone joblib: `sklearn.externals.joblib` was deprecated and later
    # removed from scikit-learn.
    import joblib
except ImportError:
    # Older scikit-learn environments that only ship the vendored copy.
    from sklearn.externals import joblib

# The fitted model in this notebook is `rfr`; the previous `lr` was never
# defined and raised NameError on a fresh kernel.
joblib.dump(rfr, '../data/lobsters_rfr_score_model_overfit.pkl')