In [1]:
# "Import" the bootstrap code
%run "little_bootstrap"
%matplotlib inline

# Import the necessary libraries for code in this notebook. 
import os
import copy
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns 
import pandas as pd
from sklearn.model_selection import GridSearchCV  
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.datasets import load_diabetes, load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from utils import *
from model import *
from config import Config
import sys
import torch
import torch.optim as optim
from torch import nn

# Load datasets

In [3]:
# Read the training and testing data.
# The data directory can be overridden with the BOLB_DATA_DIR environment
# variable; it defaults to the original location so existing setups still work.
DATA_DIR = os.environ.get('BOLB_DATA_DIR', '/Users/jamesmashiyane/Desktop/BoLb')
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), header=None)
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), header=None)

# Column 1 is used as the feature and column 2 as the target.
# Selecting the feature column with a list ([1]) keeps X two-dimensional,
# shape (n_samples, 1). scikit-learn estimators require a 2-D X; a plain
# `iloc[:, 1]` gives a 1-D array that SVC.fit / predict reject.
Xtrn = np.asarray(train.iloc[:, [1]])
Ytrn = np.asarray(train.iloc[:, 2])
Xtest = np.asarray(test.iloc[:, [1]])
Ytest = np.asarray(test.iloc[:, 2])

In [None]:
# Hyper-parameter grid for the cross-validated grid search:
# one sub-grid for the RBF kernel (gamma and C) and one for the linear kernel (C only).
tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]

In [None]:
# Tune an SVC on the full training set with a 5-fold cross-validated grid
# search over `tuned_parameters`, ranking candidates by micro-averaged F1.
clf_full = GridSearchCV(
    estimator=SVC(C=1),
    param_grid=tuned_parameters,
    scoring='f1_micro',
    n_jobs=2,
    cv=5,
)
clf_full.fit(Xtrn, Ytrn)

In [None]:
# Score the tuned model on the complete test set. GridSearchCV.score applies
# the scorer configured on the search. The result is kept as the reference
# ("true") F1 value that the bootstrap histograms are compared against.
true_f1 = clf_full.score(X=Xtest, y=Ytest)
print('F1 score on full data set after tuning the parameters', true_f1)

In [None]:
# Now repeat the parameter optimization on randomly drawn subsets of the
# training data of different sizes, and run a bootstrap for each tuned model.
# The histograms plotted for each run give the bounds on the estimated F1 scores.

In [None]:
# For each training fraction: tune an SVC on a random subset of the training
# data, then bootstrap its F1 score on the test set and plot the distribution
# of estimates against the full-data reference score `true_f1`.
for train_frac in [0.2, 0.5]:
    print('\nRunning bootstrap on {}/10ths of the training and test data'.format(int(train_frac*10)))

    # Keep exactly `train_frac` of the training data for tuning.
    # `train_size` is used here on purpose: with `test_size=train_frac` the
    # retained training part would be the complement (1 - train_frac), i.e.
    # 8/10ths for the 0.2 run, contradicting the message printed above.
    # NOTE: the bootstrap below is run on the complete test set (Xtest, Ytest).
    X_train, _, Y_train, _ = train_test_split(
        Xtrn, Ytrn, train_size=train_frac, random_state=0
    )

    clf_search = GridSearchCV(SVC(), tuned_parameters, cv=5, n_jobs=2, scoring='f1_micro')
    clf_search.fit(X_train, Y_train)

    # GridSearchCV.score does not use frequencies, so the scoring below works
    # directly with the plain SVC refit on the best parameters.
    clf = clf_search.best_estimator_

    def f1_with_freqs(x, y, freq):
        """Micro-averaged F1 of `clf` on (x, y), weighting each sample by `freq`."""
        return metrics.f1_score(
            y_true=y,
            y_pred=clf.predict(x),
            sample_weight=freq,
            average='micro',
        )

    # Bootstrap configuration. The attributes are defined by LBOB in
    # little_bootstrap; their exact semantics are not visible here
    # (presumably subsample fraction, number of subsamples and trials
    # per subsample - TODO confirm against little_bootstrap).
    lbob = LBOB()
    lbob.use_freqs = True
    lbob.sample_size = 0.6
    lbob.n_subsamples = 8
    lbob.n_trials = 30
    # To use the standard SVC scoring (mean accuracy) instead:
    #lbob.set_score_func(lambda x, y, freq: clf.score(x, y, freq))
    # To use an F1 score
    lbob.score_func = f1_with_freqs

    lbob_big_boot(lbob, Xtest, Ytest)
    print('Independent estimate of F1 Score {:0.5}'.format(lbob.scores.mean()))
    lbob_histogram(lbob, actual=true_f1)