## Notebook 5 - Fit Regression Models on the Tweet Metadata
The purpose of this notebook is to train regression models on the non-text features of the tweets for the first layer of the model stack.

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.externals import joblib

### Train models with the following features:
1. Year tweet was created
1. Month tweet was created
1. Day of week tweet was created
1. Hour tweet was created
1. Tweet has a hashtag
1. Tweet has a mention
1. Tweet has a url

In [None]:
# Load the persisted layer1 splits: feature frames first, then the targets.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
df_layer1_train, df_layer1_test = (
    joblib.load('../data/df_layer1_train.pkl'),
    joblib.load('../data/df_layer1_test.pkl'),
)
y_layer1_train, y_layer1_test = (
    joblib.load('../data/y_layer1_train.pkl'),
    joblib.load('../data/y_layer1_test.pkl'),
)

In [None]:
# Restrict both layer1 frames to the metadata columns used as model inputs.
cols = [
    'year',
    'month',
    'hour',
    'weekday',
    'has_hashtag',
    'has_mention',
    'has_url',
]
X_layer1_train, X_layer1_test = df_layer1_train[cols], df_layer1_test[cols]

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# function to fit and score a regressor
def fit_and_score(data, regr, return_regr=False):
    '''
    Fits the regressor to the training features and prints its train and test scores.

    The feature matrices are handed to the regressor exactly as given. No text
    vectorization or dimensionality reduction is applied: this notebook models the
    non-text tweet metadata, and no tfidf/svd objects are defined here.

    Parameters:
        data - iterable containing X_train, X_test, y_train, y_test (in that order)
        regr - instantiated regressor exposing fit(X, y) and score(X, y)
        return_regr - boolean, option to return the fit regressor (default: False)

    Returns: optional, the regressor fit to the training data (None otherwise)

    Raises: ValueError if data does not unpack into exactly four elements
    '''
    # Unpack once so the rest of the body reads by name and a malformed
    # data bundle fails immediately instead of part-way through fitting.
    X_train, X_test, y_train, y_test = data

    print('Regressor: {}'.format(regr))

    regr.fit(X_train, y_train)
    print('Train score: {}'.format(regr.score(X_train, y_train)))

    # Score on the held-out split with the same, untransformed feature layout.
    print('Test score: {}'.format(regr.score(X_test, y_test)))

    if return_regr:
        return regr

In [None]:
# Bundle the splits in the positional order fit_and_score indexes them:
# train features, test features, train targets, test targets.
data = (
    X_layer1_train,
    X_layer1_test,
    y_layer1_train,
    y_layer1_test,
)

In [None]:
# Candidate first-layer regressors; anything not passed explicitly stays at its default.
bayes = BayesianRidge(verbose=True)
knr = KNeighborsRegressor(n_jobs=-1)
dtr = DecisionTreeRegressor()
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21 in
# favour of `max_iter` -- confirm the installed version before upgrading.
sgd = SGDRegressor(n_iter=5000, verbose=1)

# Evaluation order for the comparison below.
models = [sgd, dtr, knr, bayes]

In [None]:
# Fit each candidate regressor on the layer1 data and print its train/test scores.
for model in models:
    fit_and_score(data, model)
    # blank line to separate the printed report of one model from the next
    print()

### Boost the Top Regressors