In [None]:
import jarvis

# Pass this notebook's file name to jarvis before the experiment is created.
jarvis.setNotebookName('JarvisParameterTuning.ipynb')

# `ex` is the experiment handle that every later cell uses to declare actions, artifacts and literals.
ex = jarvis.Experiment('JarvisParameterTuning.ipynb')

# NOTE(review): presumably selects git as the backend that versions this experiment — confirm against the jarvis docs.
ex.groundClient('git')

In [None]:
import sklearn.linear_model as linear_model
import sklearn
import seaborn as sns
import pandas as pd
import numpy as np

# Data Loading

Here I am using a built-in dataset to keep the example quick. In practice I would probably download the data from an external source.

In [None]:
@jarvis.func
def crawl():
    """Load seaborn's built-in 'titanic' dataset and return it."""
    passengers = sns.load_dataset('titanic')
    return passengers

# Wrap the loader in an experiment action, then name the artifact that action produces.
doCrawl = ex.action(crawl)
titanic_data = ex.artifact('titanic.pkl', doCrawl)

In [None]:
# Preview the raw dataset artifact.
# NOTE(review): peek() presumably materializes the artifact before showing it — confirm against the jarvis docs.
titanic_data.peek()

# Data Processing

I need to extract some binary features: `pd.get_dummies` replaces each categorical column with 0/1 indicator columns.

In [5]:
@jarvis.func
def featurize(df):
    """Return `df` passed through `pd.get_dummies` with default arguments.

    With its defaults, pandas converts object/string/category columns into
    indicator columns and leaves the remaining columns untouched.
    """
    encoded = pd.get_dummies(df)
    return encoded

# NOTE(review): the saved output under this cell reports an AttributeError for `jarvis.Action`,
# a name this cell never references directly. It may be stale or raised inside `ex.action`;
# re-run on a fresh kernel to find out.
doFeaturize = ex.action(featurize, [titanic_data])
ft_titanic_data = ex.artifact('ft_titanic.pkl', doFeaturize)

AttributeError: module 'jarvis' has no attribute 'Action'

In [6]:
# Preview the featurized artifact; the callback handed to peek() calls head() on its argument.
# NOTE(review): the saved NameError under this cell presumably follows from the failed cell
# above never binding `ft_titanic_data` — clear the outputs after a clean run.
ft_titanic_data.peek(lambda x: x.head())

NameError: name 'ft_titanic_data' is not defined

# Make the training matrices

In [None]:
@jarvis.func
def separateLabels(df):
    """Split a featurized frame into a feature matrix and a label vector.

    Rows containing any missing value are discarded first. The labels are the
    'survived' column; the features are all remaining columns cast to float.

    Returns:
        The pair (X, Y) as arrays taken from `.values`.
    """
    complete_rows = df.dropna()
    labels = complete_rows['survived'].values
    feature_frame = complete_rows.drop(['survived'], axis=1)
    features = feature_frame.values.astype('float')
    return features, labels

# One action backs two artifacts.
# NOTE(review): presumably they map to the returned (X, Y) pair in order — confirm against the jarvis docs.
doSepLabels = ex.action(separateLabels, [ft_titanic_data])
X_ft_titanic_data = ex.artifact('x_ft_titanic.pkl', doSepLabels)
Y_ft_titanic_data = ex.artifact('y_ft_titanic.pkl', doSepLabels)

# Train Test Split

In [None]:
@jarvis.func
def trainTestSplit(X, Y, test_size, random_state):
    """Partition X and Y into train and test subsets with scikit-learn.

    Args:
        X, Y: feature matrix and label vector to split together.
        test_size: forwarded to `train_test_split` as `test_size`.
        random_state: forwarded to `train_test_split` as `random_state`.

    Returns:
        The tuple (X_tr, X_te, Y_tr, Y_te).
    """
    from sklearn.model_selection import train_test_split
    parts = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    X_tr, X_te, Y_tr, Y_te = parts
    return X_tr, X_te, Y_tr, Y_te

# The split is driven by two literals: test_size=0.1 and random_state=42.
split_inputs = [X_ft_titanic_data, Y_ft_titanic_data, ex.literal(0.1), ex.literal(42)]
doTrTeSplit = ex.action(trainTestSplit, split_inputs)
# Four artifacts declared on the same action, in the same order as the returned tuple.
X_tr = ex.artifact('tr_x_ft_titanic.pkl', doTrTeSplit)
X_te = ex.artifact('te_x_ft_titanic.pkl', doTrTeSplit)
Y_tr = ex.artifact('tr_y_ft_titanic.pkl', doTrTeSplit)
Y_te = ex.artifact('te_y_ft_titanic.pkl', doTrTeSplit)

# Model Development

First cut at model development

In [None]:
@jarvis.func
def trainModel(X_tr, Y_tr, n_estimators, min_samples_split):
    """Fit a random forest classifier on the training data and return it.

    Only `n_estimators` and `min_samples_split` are forwarded to the
    classifier; every other setting keeps its scikit-learn default.
    """
    from sklearn.ensemble import RandomForestClassifier
    forest_options = {
        'n_estimators': n_estimators,
        'min_samples_split': min_samples_split,
    }
    # NOTE(review): no random_state is passed, so repeated runs can produce different forests.
    forest = RandomForestClassifier(**forest_options)
    forest.fit(X_tr, Y_tr)
    return forest

# First configuration: n_estimators=10, min_samples_split=2.
doTrainModel = ex.action(trainModel, [X_tr, Y_tr, ex.literal(10), ex.literal(2)])
model = ex.artifact('model.pkl', doTrainModel)

In [None]:
@jarvis.func
def scoreModel(model, X_tr, X_te, Y_tr, Y_te):
    """Report the model's score on the training and test sets.

    Returns:
        A one-element tuple holding a two-line string: the train accuracy
        line followed by the test accuracy line.
    """
    report_lines = [
        "Train Accuracy: {}".format(model.score(X_tr, Y_tr)),
        "Test Accuracy: {}".format(model.score(X_te, Y_te)),
    ]
    return ('\n'.join(report_lines), )

# The score report is declared as a text artifact.
doScoreModel = ex.action(scoreModel, [model, X_tr, X_te, Y_tr, Y_te])
output = ex.artifact('output.txt', doScoreModel)

In [None]:
# Print the score report; ''.join(x) concatenates whatever pieces peek() hands to the callback.
output.peek(lambda x: print(''.join(x)))

**Error!!!** 

The accuracy is too high! We must have a feature that contains the label.

In [None]:
# List the column names of the featurized frame (after dropna()) to look for features that duplicate the label.
ft_titanic_data.peek(lambda x: x.dropna().columns)

Notice that the **alive_no** and **alive_yes** columns appear to contain the same data as **survived**. We need to drop these columns.

# Re-make the training matrices

In [None]:
@jarvis.func
def separateLabels(df):
    """Split a featurized frame into features and labels, excluding the leaking columns.

    This intentionally redefines the earlier `separateLabels` in this notebook:
    besides 'survived', the 'alive_no' and 'alive_yes' columns are removed from
    the features. Rows containing any missing value are discarded first.

    Returns:
        The pair (X, Y) as arrays taken from `.values`.
    """
    complete_rows = df.dropna()
    labels = complete_rows['survived'].values
    excluded_columns = ['survived', 'alive_no', 'alive_yes']
    feature_frame = complete_rows.drop(excluded_columns, axis=1)
    features = feature_frame.values.astype('float')
    return features, labels

# Rebind the action and both artifacts so later cells pick up the corrected feature matrix.
doSepLabels = ex.action(separateLabels, [ft_titanic_data])
X_ft_titanic_data = ex.artifact('x_ft_titanic.pkl', doSepLabels)
Y_ft_titanic_data = ex.artifact('y_ft_titanic.pkl', doSepLabels)

# Train Test Split (Again)

In [None]:
# Re-declare the split on the re-made X/Y artifacts, with the same literals (0.1, 42) as the first split cell.
doTrTeSplit = ex.action(trainTestSplit, [X_ft_titanic_data, Y_ft_titanic_data, ex.literal(0.1), ex.literal(42)])
# Rebind all four split artifacts so the training and scoring cells below use the new split.
X_tr = ex.artifact('tr_x_ft_titanic.pkl', doTrTeSplit)
X_te = ex.artifact('te_x_ft_titanic.pkl', doTrTeSplit)
Y_tr = ex.artifact('tr_y_ft_titanic.pkl', doTrTeSplit)
Y_te = ex.artifact('te_y_ft_titanic.pkl', doTrTeSplit)

# Model Development (Again)

Second cut at model development, now using the features with the leaking columns removed.

In [None]:
# Re-declare training on the rebound X_tr/Y_tr, with the same literals (10, 2) as the first training cell.
doTrainModel = ex.action(trainModel, [X_tr, Y_tr, ex.literal(10), ex.literal(2)])
model = ex.artifact('model.pkl', doTrainModel)

In [None]:
# Re-declare scoring so `output` depends on the rebound model and split artifacts.
doScoreModel = ex.action(scoreModel, [model, X_tr, X_te, Y_tr, Y_te])
output = ex.artifact('output.txt', doScoreModel)

In [None]:
# Print the score report for the corrected pipeline; ''.join(x) concatenates the pieces peek() hands to the callback.
output.peek(lambda x: print(''.join(x)))

In [None]:
# NOTE(review): pull() presumably executes the pipeline behind `output` and persists output.txt — confirm against the jarvis docs.
output.pull()

# Model selection through search

**To be continued after Aggregation is implemented ...**