# Predictive models

A set of supporting code snippets for the presentation.

### Plant energy output dataset

Example datasets:

`plant.csv`: records of the energy output of an electricity generator with respect to different parameters.

`concrete.csv`: records of concrete strength with respect to time and the amount of cement in the mixture.

In [3]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(True)

import numpy as np
import pandas as ps

# Load the dataset: the first two columns are used as inputs and the last
# column as the target.
csv = ps.read_csv('concrete.csv')
# DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
# both old and new pandas versions.
XY = csv.values
X, y = XY[:, :2], XY[:, -1]

# Raw data points as a 3-D scatter plot.
data = [
    go.Scatter3d(
        x=X[:, 0], y=X[:, 1], z=y,
        mode='markers', marker={'size': 3})
]

layout = go.Layout(
    title='Dataset',
    autosize=True,
    margin=dict(l=65, r=50, b=65, t=90),
    # A plain dict is used for the scene instead of the deprecated go.Scene.
    scene=dict(
        xaxis=dict(title=csv.columns[0]),
        yaxis=dict(title=csv.columns[1]),
        zaxis=dict(title=csv.columns[-1]),
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [4]:
import numpy as np
import pandas as ps
from time import time

# Choice of models inspired by
# https://arxiv.org/pdf/1708.05070.pdf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

def render_model(model, X, y, resolution=37):
    """Evaluate ``model`` on a regular grid spanning the domain of a dataset.

    Parameters
    ----------
    model : object with a ``predict`` method that accepts a 2-D array with
        two columns and returns one prediction per row.
    X : 2-D array; only columns 0 and 1 are read, to determine the grid
        bounds.
    y : unused; kept so existing callers keep working.
    resolution : number of grid points along each axis (default 37).

    Returns
    -------
    (X1, X2, Z) : three ``(resolution, resolution)`` arrays holding the grid
        coordinates of both inputs and the model prediction at each point.
    """
    X = np.asarray(X)
    axis_1 = np.linspace(X[:, 0].min(), X[:, 0].max(), resolution)
    axis_2 = np.linspace(X[:, 1].min(), X[:, 1].max(), resolution)

    X1, X2 = np.meshgrid(axis_1, axis_2)

    # Predict for the whole grid in a single call instead of one call per
    # grid point; rows are ordered like X1.ravel(), so reshaping restores
    # the grid layout.
    grid_points = np.column_stack([X1.ravel(), X2.ravel()])
    Z = np.asarray(model.predict(grid_points), dtype=float).reshape(X1.shape)

    return X1, X2, Z

# rendering function
def fnc(model_class='lin', C=1.0, gamma=1.0, n_neighbors=1,
        n_estimators=10, max_depth=1, min_samples_split=0.5,
        learning_rate=0.01):
    """Fit one model family on ``concrete.csv`` and render the result.

    The selected estimator is fitted through ``GridSearchCV`` (with a
    single-point grid) on 75% of the data, scored on the remaining data, and
    drawn with plotly as a prediction surface over the data points.
    Additional output per model class: 'lin' prints the model weights,
    'svm' highlights and counts the support vectors, 'tree' displays the
    fitted tree via graphviz.

    Parameters
    ----------
    model_class : one of 'lin', 'knn', 'svm', 'tree', 'gbrt'.
    C : base-10 exponent of the ``C`` parameter ('lin' and 'svm').
    gamma : base-10 exponent of the ``gamma`` parameter ('svm').
    n_neighbors : ``n_neighbors`` of the k-NN regressor ('knn').
    n_estimators : ``n_estimators`` of gradient boosting ('gbrt').
    max_depth : ``max_depth`` of the decision tree ('tree').
    min_samples_split : ``min_samples_split`` of the decision tree ('tree').
    learning_rate : base-10 exponent of the ``learning_rate`` ('gbrt').

    Raises
    ------
    KeyError
        If ``model_class`` is not one of the supported names.
    """
    # One single-point parameter grid per model family. The 'model' entry
    # replaces the placeholder 'model' step of the pipeline built below.
    grids = {
        'lin': {
            'model': [LinearSVR(max_iter=100000)],
            'model__C': [10 ** C],
        },
        'knn': {
            'model': [KNeighborsRegressor()],
            'model__n_neighbors': [n_neighbors],
        },
        'svm': {
            'model': [SVR(epsilon=10.0)],
            'model__C': [10 ** C],
            'model__gamma': [10.0 ** gamma],
        },
        'tree': {
            'model': [DecisionTreeRegressor()],
            'model__max_depth': [max_depth],
            'model__min_samples_split': [min_samples_split],
        },
        'gbrt': {
            'model': [GradientBoostingRegressor()],
            'model__n_estimators': [n_estimators],
            'model__learning_rate': [10 ** learning_rate],
        },
    }
    grid = grids[model_class]

    # 'tree' and 'lin' run on unscaled inputs; every other model gets a
    # RobustScaler in front.
    # NOTE(review): presumably so that printed weights / tree thresholds stay
    # in the original feature units -- confirm.
    steps = [('model', GradientBoostingRegressor())]
    if model_class not in ('tree', 'lin'):
        steps.insert(0, ('scale', RobustScaler()))
    pipe = Pipeline(steps)

    search = GridSearchCV(
        estimator=pipe,
        param_grid=[grid],
        n_jobs=-1,
    )

    # read data; DataFrame.as_matrix() was removed in pandas 1.0, .values
    # works in both old and new pandas versions
    csv = ps.read_csv('concrete.csv')
    XY = csv.values

    # split data into inputs and outputs; exactly the first two columns are
    # inputs because render_model() predicts from two features and the axis
    # labels / feature names below use csv.columns[:2]
    X, y = XY[:, :2], XY[:, -1]

    # split data into training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

    start_time = time()

    # cross-validate and refit on the training data
    search.fit(X_train, y_train)

    # evaluate model; best_score_ is the cross-validation score, not a
    # training score
    fitting_time = time() - start_time
    val_score = search.best_score_
    test_score = search.score(X_test, y_test)

    print("Model fit time: %s, val. score: %s, test score: %s" % (fitting_time, val_score, test_score))

    # final estimator of the refitted pipeline, used by the model-specific
    # outputs below
    best_model = search.best_estimator_.steps[-1][-1]

    # rendering code
    Xp, Yp, Zp = render_model(search, X, y)

    import plotly.offline as py
    import plotly.graph_objs as go
    py.init_notebook_mode(True)

    data = [
        go.Scatter3d(
            x=X[:, 0], y=X[:, 1], z=y,
            mode='markers', marker={'size': 1}),
        go.Surface(
            x=Xp, y=Yp, z=Zp
        )
    ]

    if model_class == 'svm':
        # highlight the support vectors; support_ holds indices into the
        # training data the estimator was fitted on
        support = best_model.support_
        data.append(
            go.Scatter3d(
                x=X_train[support, 0], y=X_train[support, 1],
                z=y_train[support],
                mode='markers', marker={'size': 3}),
        )

    # 3d rendering done here using plot.ly; a plain dict is used for the
    # scene instead of the deprecated go.Scene
    layout = go.Layout(
        title=model_class,
        autosize=True,
        margin=dict(l=1, r=1, b=40, t=30),
        scene=dict(
            xaxis=dict(title=csv.columns[0]),
            yaxis=dict(title=csv.columns[1]),
            zaxis=dict(title=csv.columns[-1]),
        )
    )

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

    if model_class == 'lin':
        # print the weights of the trained linear model
        print('Model weights: %s' % dict(zip(csv.columns[:2], best_model.coef_)))

    if model_class == 'svm':
        # print the number of support vectors of the trained SVM
        print('Support vectors: %s' % len(best_model.support_))

    if model_class == 'tree':
        # tree rendering done here
        import graphviz
        from sklearn.tree import export_graphviz
        from IPython.display import display

        dot_data = export_graphviz(
            best_model, out_file=None,
            feature_names=list(csv.columns[:2]), label='all',
            filled=True, rounded=True, impurity=False,
            special_characters=True)

        graph = graphviz.Source(dot_data)
        display(graph)

# Silence warnings for clean output: warnings.warn is swapped for a no-op,
# so calls made through warnings.warn after this point do nothing.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments, do nothing, and return None."""
    return None


warnings.warn = warn

# interactive part done here
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import FloatSlider, IntSlider

# short-hand slider factories used by the interactive cells below
def fr(x, y):
    """Return a FloatSlider ranging from x to y with continuous_update off."""
    return FloatSlider(min=x, max=y, continuous_update=False)


def ir(x, y):
    """Return an IntSlider ranging from x to y with continuous_update off."""
    return IntSlider(min=x, max=y, continuous_update=False)


# all interactive cell outputs: one widget group per model class, each
# forwarding its slider values to fnc()
interact(
    lambda C: fnc('lin', C=C),
    C=fr(-6, 5),
);
interact(
    lambda n_neighbors: fnc('knn', n_neighbors=n_neighbors),
    n_neighbors=ir(1, 100),
);
interact(
    lambda C, gamma: fnc('svm', C=C, gamma=gamma),
    C=fr(-3, 4),
    gamma=fr(-3, 4),
);
interact(
    lambda max_depth, min_samples_split: fnc(
        'tree', max_depth=max_depth, min_samples_split=min_samples_split),
    max_depth=ir(1, 16),
    min_samples_split=fr(0.01, 1.0),
);
interact(
    lambda n_estimators, learning_rate: fnc(
        'gbrt', n_estimators=n_estimators, learning_rate=learning_rate),
    n_estimators=ir(1, 100),
    learning_rate=fr(-4, 4),
);