In [1]:
import os

class MCI_Constants(object):
    """Central place for the directory names, CSV file names, classifier
    model paths and default selections used throughout this notebook.

    Everything is exposed at class level; the class is used as a namespace
    and does not need to be instantiated.
    """
    # ------------------------------------------------------------------
    #    Directories
    # ------------------------------------------------------------------
    DATASETS_MCI = "datasets/mci"
    OUTPUT = "output"

    # ------------------------------------------------------------------
    #     Training files
    # ------------------------------------------------------------------
    YA_SINGLE_BREAKFAST_SCOOP = "YA-Single-Breakfast-Scoop-Aggregate.csv"
    YA_SINGLE_BREAKFAST_SELECT = "YA-Single-Breakfast-Select-Aggregate.csv"
    YA_SINGLE_BREAKFAST_SELECT_STIR_SCOOP = "YA-Single-Breakfast-Select-Stir-Scoop-Aggregate.csv"
    YA_SINGLE_BREAKFAST_STIR = "YA-Single-Breakfast-Stir-Aggregate.csv"

    # ------------------------------------------------------------------
    #     Testing files
    # ------------------------------------------------------------------
    OA_BREAKFAST_SCOOP = "OA-Breakfast-Scoop-Aggregate.csv"
    OA_BREAKFAST_SELECT = "OA-Breakfast-Select-Aggregate.csv"
    OA_BREAKFAST_STIR = "OA-Breakfast-Stir-Aggregate.csv"
    YA_SINGLE_LUNCH_SCOOP = "YA-Single-Lunch-Scoop-Aggregate.csv"
    YA_SINGLE_LUNCH_SELECT = "YA-Single-Lunch-Select-Aggregate.csv"
    YA_SINGLE_LUNCH_STIR = "YA-Single-Lunch-Stir-Aggregate.csv"

    # ------------------------------------------------------------------
    #     Input files
    # ------------------------------------------------------------------
    OA_BREAKFAST = "OA-Breakfast.csv"
    YA_SINGLE_LUNCH = "YA-Single-Lunch.csv"

    # ------------------------------------------------------------------
    #     Classifier model file paths
    # ------------------------------------------------------------------
    # These helpers take no instance/class state, so they are declared as
    # static methods. Without the decorator they only worked when called on
    # the class itself and raised TypeError when called on an instance.
    @staticmethod
    def get_nearest_neighbors_clf_path(task_name):
        """Return the relative path of the nearest-neighbors model file for ``task_name``."""
        return os.path.join("models", task_name, "knnCLF.joblib.pkl")

    @staticmethod
    def get_random_forest_clf_path(task_name):
        """Return the relative path of the random-forest model file for ``task_name``."""
        return os.path.join("models", task_name, "rfCLF.joblib.pkl")

    @staticmethod
    def get_svm_clf_path(task_name):
        """Return the relative path of the SVM model file for ``task_name``."""
        return os.path.join("models", task_name, "svmCLF.joblib.pkl")

    # ------------------------------------------------------------------
    #     DEFAULTS
    # ------------------------------------------------------------------
    DEFAULT_DATASET_PATH = DATASETS_MCI
    DEFAULT_INPUT_FILENAME = OA_BREAKFAST
    DEFAULT_OUTPUT_DIR = OUTPUT
    DEFAULT_TRAIN_FILE = YA_SINGLE_BREAKFAST_SELECT

In [2]:
import pandas as pd

def load_mci_file(dataset_path=MCI_Constants.DEFAULT_DATASET_PATH, filename=MCI_Constants.DEFAULT_TRAIN_FILE):
    """Base data-load helper: read one CSV file into a pandas DataFrame.

    Args:
        dataset_path: directory that holds the CSV file
            (defaults to ``MCI_Constants.DEFAULT_DATASET_PATH``).
        filename: name of the CSV file inside ``dataset_path``
            (defaults to ``MCI_Constants.DEFAULT_TRAIN_FILE``).

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the joined path.
    """
    return pd.read_csv(os.path.join(dataset_path, filename))

In [3]:
def write_output_file(filename, data):
    """Write a sequence of text lines to a file in the default output directory.

    Args:
        filename: name of the file to create inside
            ``MCI_Constants.DEFAULT_OUTPUT_DIR``; an existing file is overwritten.
        data: iterable of strings. Falsy entries (``None``, empty strings)
            are skipped; the remaining entries are joined with newlines.

    Returns:
        None. The destination path is printed once the file has been written.
    """
    output_path = os.path.join(MCI_Constants.DEFAULT_OUTPUT_DIR, filename)
    final_data = "\n".join(filter(None, data))

    # The original code referenced `f.close` without calling it, so the handle
    # was never closed explicitly. The context manager closes (and flushes)
    # the file even if the write raises.
    with open(output_path, "w") as output_file:
        output_file.write(final_data)

    print("Output written to file at:", output_path)

In [4]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def get_value_label_matrices(data_set):
    """Extract the scaled feature matrix and the label vector from a data frame.

    Args:
        data_set: DataFrame containing a ``"Label"`` column and the five
            magnitude feature columns listed below.

    Returns:
        ``(values, labels)`` where ``values`` is the feature matrix after
        standardization and ``labels`` is the raw ``"Label"`` column as an array.

    Note:
        A new ``StandardScaler`` is fit on every call, so each frame passed in
        is scaled using its own statistics.
    """
    feature_columns = [
        "Mean-Mag",
        "Variance-Mag",
        "Skewness-Mag",
        "Kurtosis-Mag",
        "RMS-Mag",
    ]

    # `.as_matrix()` was removed in pandas 1.0; `.values` yields the same
    # NumPy array and is available on old and new pandas versions alike.
    train_labels = data_set["Label"].copy().values
    train_values_raw = data_set[feature_columns].copy().values

    # scale sample values
    scaler = StandardScaler()
    train_values = scaler.fit_transform(train_values_raw.astype(np.float64))

    # the two counts printed here should match
    print(len(train_values), "values + ", len(train_labels), "labels")
    return train_values, train_labels

In [5]:
def verify_known_values(clf, train_set, train_values, print_sample_info=True):
    """Spot-check a classifier against three known training samples.

    Rows 3, 24 and 12 of the training data are looked up both in the original
    frame (``train_set``) and in the scaled feature matrix (``train_values``),
    optionally printed, and then passed one at a time to ``clf.predict``.

    Args:
        clf: fitted classifier exposing ``predict``.
        train_set: DataFrame the scaled values were derived from; column 0 is
            printed as the sample name and column 4 as the original mean.
        train_values: scaled feature matrix, row-aligned with ``train_set``.
        print_sample_info: when true, print the raw/scaled details of each
            sample before the predictions.

    Returns:
        None. Results are printed.
    """
    # `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
    # Convert once instead of once per sample.
    raw_rows = train_set.values

    # ------------------------------------------------------------------
    #     Known "Select" sample
    # ------------------------------------------------------------------
    ya8_select_coffee_orig = raw_rows[3]
    ya8_select_coffee = train_values[3]
    if print_sample_info:
        print("")
        print("Sample name (YA8 select-coffee):", ya8_select_coffee_orig[0])
        print("Orig mean (0.833351):", ya8_select_coffee_orig[4])
        print("Scaled mean (0.09575075):", ya8_select_coffee[0])
        print(ya8_select_coffee)

    # ------------------------------------------------------------------
    #     Known "Stir" sample
    # ------------------------------------------------------------------
    ya11_stir_mug2_orig = raw_rows[24]
    # Fixed: the scaled row must use the same index as the raw row (was 0).
    ya11_stir_mug2 = train_values[24]
    if print_sample_info:
        print("")
        print("Sample name (YA11 stir-mug-2):", ya11_stir_mug2_orig[0])
        print("Orig mean (0.816233):", ya11_stir_mug2_orig[4])
        print("Scaled mean (-0.13537523):", ya11_stir_mug2[0])
        print(ya11_stir_mug2)

    # ------------------------------------------------------------------
    #     Known "Scoop" sample
    # ------------------------------------------------------------------
    ya5_scoop_orig = raw_rows[12]
    # Fixed: the scaled row must use the same index as the raw row (was 3,
    # i.e. the "Select" sample again).
    ya5_scoop = train_values[12]
    if print_sample_info:
        print("")
        print("Sample name (YA5 scoop-and-spread-butter):", ya5_scoop_orig[0])
        print("Orig mean (0.890791):", ya5_scoop_orig[4])
        # NOTE(review): the expected value in this label is identical to the
        # "Select" sample's and looks copy-pasted; confirm the real expected
        # scaled mean for this sample and update the label.
        print("Scaled mean (0.09575075):", ya5_scoop[0])
        print(ya5_scoop)

    # ------------------------------------------------------------------
    #     Confirm predictions
    # ------------------------------------------------------------------
    print("")
    prediction = clf.predict([ya8_select_coffee])
    print("Prediction for YA8 select-coffee:", prediction)
    prediction = clf.predict([ya11_stir_mug2])
    print("Prediction for YA11 stir-mug-2:", prediction)
    prediction = clf.predict([ya5_scoop])
    print("Prediction for YA5 scoop-and-spread-butter:", prediction)

In [6]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

def cross_validate(clf, values, labels, is_multi=False, cv=20):
    """Cross-validate a classifier and report its confusion matrix and accuracy.

    Args:
        clf: classifier passed to ``cross_val_predict`` / ``cross_val_score``.
        values: feature matrix.
        labels: expected labels, row-aligned with ``values``.
        is_multi: when true, skip the confusion-matrix report.
        cv: cross-validation setting forwarded to scikit-learn. Defaults to
            20 (the previously hard-coded value); pass a smaller number for
            datasets too small to be split 20 ways.

    Returns:
        The labels predicted by ``cross_val_predict``.
    """
    scoring_method = "accuracy"

    predicted_labels = cross_val_predict(clf, values, labels, cv=cv)
    if not is_multi:
        actual_preds = confusion_matrix(labels, predicted_labels)
        # labels-vs-labels gives the matrix a perfect classifier would produce
        perfect_preds = confusion_matrix(labels, labels)

        print("")
        print("Actual Predictions:")
        print(actual_preds)
        print("Perfect Predictions:")
        print(perfect_preds)

    score = cross_val_score(clf, values, labels, cv=cv, scoring=scoring_method)
    print("")
    print("Classifier accuracy:", score)
    return predicted_labels

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score

# ------------------------------------------
#
# PRECISION:
# Of the samples predicted positive, what fraction are truly positive
#
# ------------------------------------------
#
# RECALL:
# Of the samples that are truly positive, what fraction are found
#
# ------------------------------------------
#
# F-SCORE:
# Harmonic mean of precision and recall
#
# ------------------------------------------

def evaluate_predictions(task_labels, label_predictions, is_multi=False):
    """Print precision, recall and F-score for predicted vs. actual labels.

    The three metrics are reported with micro, macro and weighted averaging.
    Unless ``is_multi`` is true, they are first reported with the scoring
    functions' default settings as well.

    Args:
        task_labels: the actual labels.
        label_predictions: the predicted labels, aligned with ``task_labels``.
        is_multi: when true, skip the default-settings report.

    Returns:
        None. All results are printed.
    """
    # Each entry: (value for `average`, or None for the library default,
    #              precision heading, recall heading, F-score heading)
    reports = []
    if not is_multi:
        # NOTE(review): the default averaging is believed to support binary
        # targets only -- confirm callers never reach this branch with
        # multiclass labels.
        reports.append((None, "Default Precision:", "Default Recall:", "Default F-score:"))
    reports.append(("micro", "Micro Precision:", "Micro Recall:", "Micro F-score:"))
    reports.append(("macro", "Macro Precision:", "Macro Recall:", "Macro F-score:"))
    reports.append(("weighted", "Weighted Precision:", "Weighted Recall:", "Weighted F-score:"))

    for average, precision_heading, recall_heading, fscore_heading in reports:
        options = {} if average is None else {"average": average}

        # compute all three metrics before printing any of them
        precision = precision_score(task_labels, label_predictions, **options)
        recall = recall_score(task_labels, label_predictions, **options)
        fscore = f1_score(task_labels, label_predictions, **options)

        print("")
        print(precision_heading, precision)
        print(recall_heading, recall)
        print(fscore_heading, fscore)

In [8]:
from sklearn.model_selection import train_test_split

def split_train_test_matrices(dataset):
    """Partition a dataset 80/20 into train/test and extract values and labels.

    The split uses a fixed ``random_state`` of 42. Each partition is passed
    separately through ``get_value_label_matrices``.

    Args:
        dataset: the full data set to split.

    Returns:
        ``(train_values, train_labels, test_values, test_labels)``.
    """
    train_part, test_part = train_test_split(dataset, test_size=0.2, random_state=42)
    print(len(train_part), "train +", len(test_part), "test")

    # Evaluated left to right: training partition first, then testing.
    # Concatenating the two (values, labels) pairs yields the 4-tuple.
    return get_value_label_matrices(train_part) + get_value_label_matrices(test_part)

def cross_validate_classifier_predictions(clf, values, labels):
    """Cross-validate one classifier, then report precision/recall/F-score.

    Args:
        clf: classifier to cross-validate.
        values: feature matrix.
        labels: expected labels for ``values``.

    Returns:
        None. Both steps print their results.
    """
    # cross_validate() prints the confusion matrix and accuracy and returns
    # the predicted labels, which feed straight into the metric report.
    evaluate_predictions(labels, cross_validate(clf, values, labels))

def validate_classifier_predictions(clf, values, labels):
    """Predict ``values`` with ``clf`` and report metrics against ``labels``.

    Unlike ``cross_validate_classifier_predictions`` this calls
    ``clf.predict`` directly, with no cross-validation.

    Returns:
        None. The metrics are printed by ``evaluate_predictions``.
    """
    evaluate_predictions(labels, clf.predict(values))

In [9]:
from sklearn.ensemble import RandomForestClassifier

def train_rf_classifier(values, labels, limit_label=None, estimators=100, random_state=None):
    """Fit a random-forest classifier on the training data and cross-validate it.

    Args:
        values: feature matrix.
        labels: label array, row-aligned with ``values``.
        limit_label: when given, labels are reduced to booleans via
            ``labels == limit_label`` before training
            (assumes ``labels`` supports element-wise comparison, e.g. a
            NumPy array -- TODO confirm).
        estimators: value passed as ``n_estimators``.
        random_state: seed forwarded to ``RandomForestClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an integer to
            make training reproducible.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if limit_label is not None:
        labels = (labels == limit_label)

    rfCLF = RandomForestClassifier(n_estimators=estimators, random_state=random_state)
    rfCLF = rfCLF.fit(values, labels)
    cross_validate_classifier_predictions(rfCLF, values, labels)
    return rfCLF

In [10]:
from sklearn import svm

def train_svm_classifier(values, labels, limit_label=None):
    """Fit an ``svm.SVC`` with default settings on the data and cross-validate it.

    Args:
        values: feature matrix.
        labels: label array, row-aligned with ``values``.
        limit_label: when given, training targets become the booleans
            ``labels == limit_label``.

    Returns:
        The fitted SVC instance.
    """
    targets = labels if limit_label is None else (labels == limit_label)

    model = svm.SVC()
    model.fit(values, targets)
    cross_validate_classifier_predictions(model, values, targets)
    return model

In [11]:
from sklearn.neighbors import KNeighborsClassifier

def train_knn_classifier(values, labels, limit_label=None, n_neighbors=3):
    """Fit a ``KNeighborsClassifier`` on the training data and cross-validate it.

    Args:
        values: feature matrix.
        labels: label array, row-aligned with ``values``.
        limit_label: when given, training targets become the booleans
            ``labels == limit_label``.
        n_neighbors: neighbor count handed to ``KNeighborsClassifier``.

    Returns:
        The fitted classifier (the object returned by ``fit``).
    """
    targets = labels if limit_label is None else (labels == limit_label)

    model = KNeighborsClassifier(n_neighbors).fit(values, targets)
    cross_validate_classifier_predictions(model, values, targets)
    return model

In [12]:
def get_trained_classifier_predictions(data_set, clf):
    """Run a trained classifier over a data set and format the results as CSV lines.

    Args:
        data_set: DataFrame accepted by ``get_value_label_matrices``; its
            first column is used as the "Source" identifier of each row.
        clf: fitted classifier exposing ``predict``.

    Returns:
        A list of strings: the header ``"Source,Prediction"`` followed by one
        ``"<source>,<prediction>"`` line per row of ``data_set``.
    """
    # Only the scaled values are needed here; the labels are ignored.
    values, _ = get_value_label_matrices(data_set)
    predictions = clf.predict(values)

    # Convert the frame once. The original called the (now removed)
    # DataFrame.as_matrix() on every loop iteration.
    rows = data_set.values

    results = ["Source,Prediction"]
    # Fixed off-by-one: the original loop stopped at len(data_set) - 1 and so
    # never emitted the prediction for the last row.
    for row, prediction in zip(rows, predictions):
        results.append(str(row[0]) + "," + str(prediction))
    return results

In [13]:
from sklearn.externals import joblib

def write_clf_to_file(clf, filename):
    """Persist a classifier to ``filename`` via ``joblib.dump`` with ``compress=9``.

    Returns:
        None. The value returned by ``joblib.dump`` is discarded.
    """
    # NOTE(review): `joblib` is imported from `sklearn.externals` in this
    # notebook; newer scikit-learn releases reportedly no longer ship it --
    # confirm the pinned version or switch to the standalone `joblib` package.
    joblib.dump(clf, filename, compress=9)

def load_clf_from_file(filename):
    """Load and return the object stored in ``filename`` by ``joblib.dump``.

    SECURITY: joblib files are pickle-based, so loading one can execute
    arbitrary code. Only load model files from trusted sources.
    """
    return joblib.load(filename)

In [14]:
# print("# ---------------------------------------------------")
# print("    K-NEAREST WITH MULTI-CLASS LABELS    ")
# print("# ---------------------------------------------------")

# primary_task_labels = (train_labels == 1)
# secondary_task_labels = (train_labels == 2)
# tertiary_task_labels = (train_labels == 3)
# train_labels_multi = np.c_[primary_task_labels, secondary_task_labels, tertiary_task_labels]

# knnCLF = knnCLF.fit(train_values, train_labels_multi)
# verify_known_values(knnCLF, train_set, train_values, False)

# predicted_labels = cross_validate(knnCLF, train_values, train_labels_multi, True)
# evaluate_predictions(train_labels_multi, predicted_labels, True)