**This Jupyter Notebook is a very early draft, but I am adding it to GitHub to track changes over time.**

# Resources
- Guido and Müller, *Introduction to Machine Learning with Python*, Oct 2016 [[book](https://shop.oreilly.com/product/0636920030515.do)]. You can view this book online for free through UW's license with Safari Online. Visit http://lib.uw.edu while logged in with a UW account and search for the book. You should see a "View It" option, which gives you instantaneous online access to the full book.
- Andreas Müller, *Applied Machine Learning*, Spring 2019 [[website](https://www.cs.columbia.edu/~amueller/comsw4995s19/schedule/)]
- Scikit Learn, *User Guide*, [[website](https://scikit-learn.org/stable/user_guide.html)]

## TODO
- generate timeseries data and show full pipeline (smoothing, feature extraction, classification)

# K-Fold

In [10]:
# Some general includes we need
import numpy as np
import pandas as pd

In [11]:
# Some utility functions

def print_folds(cross_validator, input_features, class_labels, trial_num):
    """Print which samples land in the training and test set of every fold.

    Each sample is printed as "<class label> <trial number>".

    Parameters
    ----------
    cross_validator : object with a ``split(X, y)`` method that yields
        ``(train_indices, test_indices)`` pairs (e.g. scikit-learn's KFold).
    input_features : the samples to split; passed to ``split`` as ``X``.
    class_labels : per-sample class labels, indexable by the yielded
        indices; passed to ``split`` as ``y``.
    trial_num : per-sample trial numbers, indexable by the yielded indices.
    """
    # Use the arguments that were passed in rather than notebook globals,
    # so the function prints the folds of whichever splitter the caller gives it
    splits = cross_validator.split(input_features, class_labels)
    for fold_cnt, (train_idx, test_idx) in enumerate(splits):
        print("FOLD", fold_cnt)
        for heading, indices in (("TRAINING SET:", train_idx),
                                 ("TEST SET:", test_idx)):
            print("\t" + heading)
            for i in indices:
                print("\t\t{} {}".format(class_labels[i], trial_num[i]))

In [12]:
# Build a tiny toy dataset: six samples with two features each
feature_names = ["feature1", "feature2"]
X = np.array([
    [1, 2],
    [3, 4],
    [1, 2],
    [9, 11],
    [1, 2],
    [3, 4],
])

# One class label and one trial number per row of X
class_labels = np.array(["At Rest"] * 3 + ["Baseball Throw"] * 3)
trial_num = np.array([1, 2, 3, 1, 2, 3])

# Gather features, trial numbers and labels in one DataFrame so the
# whole dataset can be shown as a table
df = pd.DataFrame(X, columns=feature_names).assign(
    TrialNum=trial_num,
    ClassLabel=class_labels,
)

display(df)




Unnamed: 0,feature1,feature2,TrialNum,ClassLabel
0,1,2,1,At Rest
1,3,4,2,At Rest
2,1,2,3,At Rest
3,9,11,1,Baseball Throw
4,1,2,2,Baseball Throw
5,3,4,3,Baseball Throw


In [15]:
from sklearn.model_selection import KFold

# Plain k-fold: split the samples, in their original order, into this many folds
num_folds = 3
skf = KFold(n_splits=num_folds, shuffle=False)

print(skf)  # show the splitter's settings
print_folds(
    cross_validator=skf,
    input_features=X,
    class_labels=class_labels,
    trial_num=trial_num,
)

KFold(n_splits=3, random_state=None, shuffle=False)
FOLD 0
	TRAINING SET:
		At Rest 3
		Baseball Throw 1
		Baseball Throw 2
		Baseball Throw 3
	TEST SET:
		At Rest 1
		At Rest 2
FOLD 1
	TRAINING SET:
		At Rest 1
		At Rest 2
		Baseball Throw 2
		Baseball Throw 3
	TEST SET:
		At Rest 3
		Baseball Throw 1
FOLD 2
	TRAINING SET:
		At Rest 1
		At Rest 2
		At Rest 3
		Baseball Throw 1
	TEST SET:
		Baseball Throw 2
		Baseball Throw 3


In [16]:
# Same number of folds as before, but shuffle the samples before splitting
skf = KFold(shuffle=True, n_splits=num_folds)

print(skf)  # show the splitter's settings
print_folds(
    cross_validator=skf,
    input_features=X,
    class_labels=class_labels,
    trial_num=trial_num,
)

# But we still have a problem because our training and test sets are imbalanced

KFold(n_splits=3, random_state=None, shuffle=True)
FOLD 0
	TRAINING SET:
		At Rest 2
		At Rest 3
		Baseball Throw 2
		Baseball Throw 3
	TEST SET:
		At Rest 1
		Baseball Throw 1
FOLD 1
	TRAINING SET:
		At Rest 1
		At Rest 2
		At Rest 3
		Baseball Throw 1
	TEST SET:
		Baseball Throw 2
		Baseball Throw 3
FOLD 2
	TRAINING SET:
		At Rest 1
		Baseball Throw 1
		Baseball Throw 2
		Baseball Throw 3
	TEST SET:
		At Rest 2
		At Rest 3


In [None]:
# So, we use stratified k-fold to balance our classes between folds
# (StratifiedKFold is not imported by any earlier cell, so import it here)
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
print_folds(skf, X, class_labels, trial_num)


In [None]:
# See: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# A notebook cell only echoes its last expression, so the shapes have to be
# printed explicitly or they are silently discarded
print(iris.data.shape, iris.target.shape)

# Last expression of the cell: echoed as the cell's output
iris.feature_names

In [None]:
# Pandas
# See: 
# - https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
# - DataFrames from Python structures: https://pbpython.com/pandas-list-dict.html
import pandas as pd

# Turn the scikit-learn iris dataset into a pandas DataFrame:
# one column per feature plus a 'target' column holding the class ids
data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    target=data.target,
)
df

# Visualizing Features and Relationships

## Pairplots

In [None]:
# Draw scatterplots for joint relationships and histograms for univariate distributions
# See: https://seaborn.pydata.org/generated/seaborn.pairplot.html
import seaborn as sns
sns.set(color_codes=True, style="ticks")

# From here on `iris` is seaborn's copy of the iris data (it replaces the
# scikit-learn object loaded in an earlier cell)
iris = sns.load_dataset("iris")

# Pairwise plot grid of the dataset's variables
g = sns.pairplot(data=iris)

In [None]:
# Colour the plot elements by the categorical `species` column
g = sns.pairplot(data=iris, hue="species")

In [None]:
# Change the colors and use markers, and plot distributions (KDE) along the
# diagonal rather than histograms
g = sns.pairplot(
    data=iris,
    hue="species",
    diag_kind="kde",
    palette="husl",
    markers=["o", "s", "D"],
)

# Parameter Tuning

In [None]:
# Parameter estimation using grid search with cross validation
# From: https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Load the Digits dataset
digits = datasets.load_digits()

# To apply a classifier to this data, we need to flatten each image so
# that the data becomes a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset into two equal parts: one for tuning, one for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Candidate hyper-parameter grids to search with cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

# Run one full grid search per metric and report the results of each
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # The search optimizes the macro-averaged variant of the metric
    scoring = '%s_macro' % score
    clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring=scoring)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # The reported spread is twice the standard deviation over the folds
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality.