In [1]:
# Question Characteristics: 
# Tags, Length of Post, Network Characteristics of user, (Language used)
# Outcome: Time till First Answer, Time till first (good) Answer
# (One week from today, how many answers expected to receive? how many good answers expected to receive?)

In [102]:
import dill
import os
import random
import pandas as pd
from pandas import Series
from sklearn import base, model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import math

from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file

In [62]:
os.chdir('/Users/longy/Documents/TDI/capstone')
df_tag = dill.load(open('df_tag.pkd', 'r'))

In [64]:
data = df_tag[np.isfinite(df_tag['fa_time'])]
X = data.loc[:, data.columns != 'fa_time']
y = data.fa_time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
names = list(X_train.columns.values[12:])

In [92]:
class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, col_names):
        self.col_names = col_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.col_names]

class EnsembleTransformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, base_estimator, residual_estimator):
        self.base_estimator = base_estimator
        self.residual_estimator = residual_estimator
    def fit(self, X, y):
        self.base_estimator.fit(X, y)
        y_err = y-self.base_estimator.predict(X)
        self.residual_estimator.fit(X, y_err)
        return self
    def transform(self, X):
        all_ests = [self.base_estimator] + [self.residual_estimator]
        return np.array([est.predict(X) for est in all_ests]).T

In [145]:
scaler = StandardScaler()
cst = ColumnSelectTransformer(names)
x_train = cst.transform(X_train)

In [146]:
md_range = np.arange(2,22,2)
msl_range = np.arange(20,35,5)
est = RandomForestRegressor()
rf = model_selection.GridSearchCV(
    est, 
    {'max_depth': md_range,
    'min_samples_leaf': msl_range},
    cv = 10,
    n_jobs = 2,
    scoring = 'neg_mean_squared_error'
)
rf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'max_depth': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20]), 'min_samples_leaf': array([20, 25, 30])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [147]:
rf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=30, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [150]:
tag_rf_best_est = rf.best_estimator_
tag_rf_est = Pipeline([
    ('csv', ColumnSelectTransformer(names)),
#     ('scale', scaler),
    ('rf', tag_rf_best_est)
])

In [151]:
tag_rf_est.fit(X_train, y_train)
dill.dump(tag_rf_est, open('tag_rf_est.pkd', 'w'))
tag_rf_est = dill.load(open('tag_rf_est.pkd', 'r'))
y_pred = tag_rf_est.predict(X_test)

In [161]:
#Length of Post
alpha_range = np.arange(0,1.01,0.05)
cst = ColumnSelectTransformer(['q_body_len'])
x_train = cst.transform(X_train)

In [165]:
est = RandomForestRegressor()
rf = model_selection.GridSearchCV(
    est, 
    {'max_depth': md_range,
    'min_samples_leaf': msl_range},
    cv = 10,
    n_jobs = 2,
    scoring = 'neg_mean_squared_error'
)
rf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'max_depth': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20]), 'min_samples_leaf': array([20, 25, 30])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [167]:
body_len_rf_best_est = rf.best_estimator_
body_len_rf_best_est

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=25, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [168]:
body_len_est = Pipeline([
    ('csv', ColumnSelectTransformer(['q_body_len'])),
    ('en', body_len_en_best_est)
])
body_len_est.fit(X_train, y_train)
dill.dump(body_len_est, open('body_len_est.pkd', 'w'))
body_len_est = dill.load(open('body_len_est.pkd', 'r'))
body_len_est.predict(X_test)

array([202149.49113533, 203978.33618465, 203259.86134385, ...,
       202655.68931863, 204675.81663075, 202802.65008152])

In [169]:
tag_rf_est.predict(X_test)

array([283711.98396899, 155323.16584223, 127487.2965601 , ...,
       283711.98396899, 244979.05827614, 283711.98396899])