In [None]:
import pandas as pd
from IPython.display import display as ipd

# Load Dataset

In [None]:
from sklearn.datasets import load_boston

# Load the Boston housing bunch; the code below reads its feature matrix
# (.data), column names (.feature_names) and regression target (.target).
boston = load_boston()

# Build a single frame: named feature columns followed by a trailing
# 'target' column. Later cells slice features/target by position
# (iloc[:, :-1] / iloc[:, -1]), so 'target' must stay the last column.
data = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    target=pd.Series(boston.target)
)

# Peek at five randomly chosen rows (unseeded, so they differ per run).
ipd(data.sample(n=5))

In [None]:
# Per-column summary statistics of the dataset (features and 'target' alike).
data.describe()

# Baseline Training (without ModelDB)

In [None]:
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module is deprecated and no longer exists in
# scikit-learn >= 0.20. The grid-search cell further down already uses
# sklearn.model_selection, so this keeps the notebook consistent.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Split features (every column but the last) from the target (last column).
# An integer test_size is an absolute count: 10 rows are held out for testing.
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1], data.iloc[:, -1], test_size=10, random_state=42)

# Create and fit the plain scikit-learn regression (no ModelDB involved yet).
linreg = LinearRegression()
linreg.fit(x_train, y_train)

# Predict the held-out rows; the cell displays the mean absolute error.
test_pred = linreg.predict(x_test)
mean_absolute_error(y_test, test_pred)

# Integrate ModelDB

In [None]:
# Shell escape: clone the ModelDB repository into ./modeldb next to the notebook.
# Re-running this cell when ./modeldb already exists makes git report an error.
!git clone https://github.com/mitdbg/modeldb

In [None]:
# Shell escape: execute the Python client's setup.py from the cloned repository.
# NOTE(review): the script is invoked without a command (e.g. `install`) and
# relies on being executable -- confirm that this actually installs the
# `modeldb` package that the following cells import.
!./modeldb/client/python/setup.py

In [None]:
# Disabled alternative that pinned a specific client release:
#   import pkg_resources
#   pkg_resources.require("modeldb==0.0.1a31")
import modeldb.sklearn_native.ModelDbSyncer as mdb

# Project and experiment are looked up or created by the ModelDB client.
project = mdb.NewOrExistingProject(
    name="ModelDB Evaluation",
    author="Nico",
    description="using Bosten Housing Dataset")
experiment = mdb.NewOrExistingExperiment(
    name="Simple model training",
    description="")

# The syncer ties project, experiment and a fresh experiment run together;
# later cells call syncer.sync() to push what they recorded.
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("Linear Regression"))

In [None]:
import modeldb.sklearn_native.ModelDbSyncer as mdb
from modeldb.sklearn_native import SyncableMetrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Same split as the baseline cell (10 held-out rows, seed 42), but made through
# ModelDB's *_sync wrapper instead of calling scikit-learn directly.
x_train, x_test, y_train, y_test = mdb.cross_validation.train_test_split_sync(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=10,
    random_state=42)

# Fit and predict via the *_sync variants of the estimator methods.
linreg = mdb.linear_model.LinearRegression()
linreg.fit_sync(x_train, y_train)
test_pred = linreg.predict_sync(x_test)

# Mean absolute error on the held-out rows, computed through SyncableMetrics.
# NOTE(review): the fifth argument is the complete feature matrix as a numpy
# array, not x_test -- confirm this is what compute_metrics expects.
mae = SyncableMetrics.compute_metrics(
    linreg, mean_absolute_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')

# Sync with the backend service.
syncer.sync()

## Test some other models

In [None]:
# Start a separate experiment run for the Ridge model.
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("Ridge"))

# Fit on the training split and predict the held-out rows via the *_sync calls.
model = mdb.linear_model.Ridge()
model.fit_sync(x_train, y_train)
test_pred = model.predict_sync(x_test)

# Compute MAE and MSE through SyncableMetrics. The MAE result is not stored
# in a variable in this cell; only the MSE result is kept.
SyncableMetrics.compute_metrics(
    model, mean_absolute_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')
mse = SyncableMetrics.compute_metrics(
    model, mean_squared_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')

# Push the recorded events to the backend.
syncer.sync()

In [None]:
# Start a separate experiment run for the Lasso model.
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("Lasso"))

# Fit on the training split and predict the held-out rows via the *_sync calls.
model = mdb.linear_model.Lasso()
model.fit_sync(x_train, y_train)
test_pred = model.predict_sync(x_test)

# Compute MAE and MSE for the held-out predictions through SyncableMetrics.
mae = SyncableMetrics.compute_metrics(
    model, mean_absolute_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')
mse = SyncableMetrics.compute_metrics(
    model, mean_squared_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')

# Push the recorded events to the backend.
syncer.sync()

In [None]:
# Unlike the Ridge/Lasso cells, this one keeps the existing syncer object and
# only switches it to a new experiment run.
syncer.set_experiment_run(mdb.NewExperimentRun("ElasticNet"))

# Fit on the training split and predict the held-out rows via the *_sync calls.
model = mdb.linear_model.ElasticNet()
model.fit_sync(x_train, y_train)
test_pred = model.predict_sync(x_test)

# Compute MAE and MSE for the held-out predictions through SyncableMetrics.
mae = SyncableMetrics.compute_metrics(
    model, mean_absolute_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')
mse = SyncableMetrics.compute_metrics(
    model, mean_squared_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')

# Push the recorded events to the backend.
syncer.sync()

## Grid Search

In [None]:
import sklearn
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.linear_model` is available as an attribute.
import sklearn.linear_model
from sklearn import svm, datasets
from sklearn.metrics import mean_squared_error
# train_test_split comes from sklearn.model_selection, the same module as
# GridSearchCV; the former sklearn.cross_validation module is deprecated and
# no longer exists in scikit-learn >= 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split

# Plain scikit-learn split: 10 held-out rows, fixed seed. This rebinds
# x_train / x_test / y_train / y_test for the cells that follow.
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1], data.iloc[:, -1], test_size=10, random_state=42)

# NOTE(review): n_jobs is a parallelism setting rather than a model
# hyperparameter, so the candidates presumably differ only in runtime.
tuned_parameters = {
    'n_jobs': (1, 2, 3)
}

model = sklearn.linear_model.LinearRegression()

# Exhaustive search over the grid, then predict the held-out rows.
clf = GridSearchCV(model, tuned_parameters)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

# List which result tables the search exposes; the cell then displays the
# mean squared error of the held-out predictions.
print(sorted(clf.cv_results_.keys()))
mean_squared_error(y_test, predictions)

### With ModelDB

In [None]:
# Score the fitted search object on the held-out rows. With cells run top to
# bottom, `clf` is the plain GridSearchCV built without a `scoring` argument.
# NOTE(review): presumably this falls back to the estimator's own default
# score (R^2 for LinearRegression) -- confirm.
clf.score(x_test,y_test)

In [None]:
# Grid-search results go into their own experiment and experiment run.
experiment = mdb.NewOrExistingExperiment(name="Grid Search", description="")
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("ElasticNet"))

model = mdb.linear_model.ElasticNet()
parameters = {
    'alpha': (10, 5, 1, 0.5, 0.1),
    'l1_ratio': (1, 0.75, 0.5, .25, 0)
}

# Mean absolute error is a loss (lower is better). make_scorer defaults to
# greater_is_better=True, which would make the search pick the parameter
# combination with the LARGEST error. Passing greater_is_better=False makes
# the scorer return the negated MAE, so maximising the score minimises the error.
scorer = sklearn.metrics.make_scorer(mean_absolute_error, greater_is_better=False)

# error_score is the score assigned when fitting a candidate fails. Scores are
# now negated errors (<= 0), so the penalty must be negative as well;
# a positive 100 would rank every failed fit above all successful ones.
clf = mdb.GridSearchCV(model, parameters, cv=5, scoring=scorer, error_score=-100)

# Fit the gridsearch
clf.fit_sync(x_train, y_train)

# Test-set metrics for the search object itself are left disabled:
#test_pred = clf.predict(x_test)
# Compute various metrics on the testing set
#mae = SyncableMetrics.compute_metrics(clf, mean_absolute_error, y_test, test_pred, data.iloc[:,:-1].values,"predictionCol", 'target')
#mse = SyncableMetrics.compute_metrics(clf, mean_squared_error, y_test, test_pred, data.iloc[:,:-1].values,"predictionCol", 'target')
syncer.sync()

### Best Estimator

In [None]:
# NOTE(review): the run is labelled "Lasso", yet with cells run in order the
# estimator below comes from the ElasticNet grid search -- confirm the label.
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("Lasso"))

# Take the estimator the grid search selected, fit it again on the training
# split through fit_sync, and predict the held-out rows.
model = clf.best_estimator_
model.fit_sync(x_train, y_train)
test_pred = model.predict_sync(x_test)

# Compute MAE and MSE for the held-out predictions through SyncableMetrics.
mae = SyncableMetrics.compute_metrics(
    model, mean_absolute_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')
mse = SyncableMetrics.compute_metrics(
    model, mean_squared_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')

# Push the recorded events to the backend.
syncer.sync()

# Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import decomposition

# Open a dedicated experiment run, as the other model cells do. Without it
# the pipeline would be synced into whatever run the previous cell left
# active on `syncer` (the "Lasso" run when cells are executed in order).
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("Pipeline"))

# Creating the pipeline: PCA followed by a linear regression.
# NOTE(review): the regression step is registered under the name 'logistic'
# although it is a LinearRegression; the name is kept unchanged here.
pca = decomposition.PCA()
lr = mdb.linear_model.LinearRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', lr)])

# Fit the pipeline
pipe.fit_sync(x_train, y_train)

# Predictions use the plain predict(), not a *_sync variant.
test_pred = pipe.predict(x_test)

# Compute various metrics on the testing set
SyncableMetrics.compute_metrics(
    pipe, mean_absolute_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')
SyncableMetrics.compute_metrics(
    pipe, mean_squared_error, y_test, test_pred,
    data.iloc[:, :-1].values, "predictionCol", 'target')

syncer.sync()

# Both: Grid Search over a Pipeline

In [None]:
# `datasets` is imported here explicitly so the cell does not depend on the
# import made in the earlier grid-search cell.
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
# NOTE(review): sklearn.grid_search is the old home of GridSearchCV. It is
# left untouched because the wildcard import below may rebind GridSearchCV /
# Pipeline -- confirm which class provides fit_sync before switching this to
# sklearn.model_selection.
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# NOTE(review): wildcard import kept as in the original; presumably it brings
# the ModelDB-aware classes into scope -- confirm before replacing it with
# explicit names.
from modeldb.sklearn_native.ModelDbSyncer import *

# Open a dedicated experiment run. Without it the search would be synced into
# whatever run an earlier cell left active on `syncer`.
syncer = mdb.Syncer(project, experiment, mdb.NewExperimentRun("Pipeline Grid Search"))

# First 1000 samples of the digits dataset (a classification task, unlike the
# regression cells above).
digits = datasets.load_digits()
x = digits.data[:1000]
y = digits.target[:1000]

# Grid keys use the `<step name>__<parameter>` form to address pipeline steps.
parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet')
}

pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# 'precision_weighted' is exactly the string the original assembled with
# '%s_weighted' % 'precision'; it is written out directly for readability.
clf = GridSearchCV(
    pipeline, parameters, cv=None, scoring='precision_weighted')

clf.fit_sync(x, y)
syncer.sync()
