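This notebook classifies the trips in the kinematics dataframe by agent (`agent_name`), comparing a decision tree against a weighted random-guess baseline and a uniform random-guess baseline.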

Imports

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

Read in data

In [None]:
# Load the reduced kinematics table from a Feather file (relative path).
# Later cells use its 'agent_name' column as the class label and drop the
# 'Agent_ID', 'Start_time', 'End_time' and 'modality' columns before modelling.
kinematics = pd.read_feather('generated_data/reduced_kinematics.feather')

The following performs classification using a DecisionTreeClassifier

In [None]:
# Class labels ordered from most to least trips; this fixes the row/column
# order of the confusion matrices plotted below.
class_labels = np.array(kinematics['agent_name'].value_counts().index)

# Drop the non-feature columns and move the label ('agent_name') to the last
# column, so that after conversion X is df[:, :-1] and y is df[:, -1].
df = kinematics.drop(columns=['Agent_ID', 'Start_time', 'End_time', 'modality'])
df = df[[c for c in df if c not in ['agent_name']] + ['agent_name']]

df = df.to_numpy()

# Explicit fold count and seed: the stats table is sized from the same constant
# that configures the splitter, and a fresh kernel reproduces the same numbers.
N_SPLITS = 5
SEED = 0

# One row per fold; columns: accuracy, ROC_AUC, F1 score
stats = np.zeros((N_SPLITS, 3))

# Cycle the folds
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for k, (train_index, test_index) in enumerate(kf.split(df[:, :-1], df[:, -1])):
    print('Fold number %d' % k)

    # Divide data (an index array selects all rows of the fold at once)
    train = df[train_index]
    train_X = train[:, :-1]
    train_Y = train[:, -1]
    test = df[test_index]
    test_X = test[:, :-1]
    test_Y = test[:, -1]

    # Create Decision Tree classifier
    clf = DecisionTreeClassifier(random_state=SEED)
    clf.fit(train_X, train_Y)

    acc = clf.score(test_X, test_Y)
    print('Accuracy: %.3f' % acc)
    stats[k][0] = acc

    # predict_proba columns follow clf.classes_ (sorted); pass that order on
    # explicitly so the scores cannot be silently matched to the wrong class.
    y_prob = clf.predict_proba(test_X)
    auc = roc_auc_score(test_Y, y_prob, multi_class='ovr', labels=clf.classes_)
    print('AUC: %.3f' % auc)
    stats[k][1] = auc

    y_pred = clf.predict(test_X)
    f1 = f1_score(test_Y, y_pred, average='macro')
    print('F1 Score: %.3f' % f1)
    stats[k][2] = f1

    fig, ax = plt.subplots(figsize=(10, 10))

    cf = confusion_matrix(test_Y, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=class_labels)
    disp.plot(ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    #plt.savefig('figures/dtree{}.pdf'.format(k), format='pdf', bbox_inches='tight')
    plt.show()
    # Release the figure so one-per-fold figures do not pile up in memory
    plt.close(fig)

    print()

print('Average +- 1 std dev')
print('Accuracy: %.3f +- %.3f' % (np.mean(stats[:, 0]), np.std(stats[:, 0])))
print('AUC: %.3f +- %.3f' % (np.mean(stats[:, 1]), np.std(stats[:, 1])))
print('F1 Score: %.3f +- %.3f' % (np.mean(stats[:, 2]), np.std(stats[:, 2])))

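The following performs classification using a Weighted Random Guess: each prediction is a label drawn at random with probability proportional to how often that label occurs.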

In [None]:
# Randomly guesses a label, based on the proportion of the labels in the data
def weighted_random_guess(data):
    """Return the label of one randomly selected row of ``data``.

    A row index is drawn uniformly with ``random.randint``, so a given label
    is returned with probability equal to its share of the rows.

    Parameters
    ----------
    data : sequence of rows
        Must be non-empty; the last element of each row is treated as its label.

    Returns
    -------
    The last element of the selected row.

    Raises
    ------
    ValueError
        If ``data`` is empty (raised by ``random.randint`` on an empty range).
    """
    last_position = len(data) - 1
    chosen_row = data[random.randint(0, last_position)]
    return chosen_row[-1]


# Class labels ordered from most to least trips; this fixes the row/column
# order of the confusion matrices plotted below.
class_labels = np.array(kinematics['agent_name'].value_counts().index)

# Drop the non-feature columns and move the label ('agent_name') to the last
# column, so that after conversion the label of each row is row[-1].
df = kinematics.drop(columns=['Agent_ID', 'Start_time', 'End_time', 'modality'])
df = df[[c for c in df if c not in ['agent_name']] + ['agent_name']]

df = df.to_numpy()

# Explicit fold count and seed so a fresh kernel reproduces the same numbers.
N_SPLITS = 5
SEED = 0
# weighted_random_guess draws from the global `random` generator
random.seed(SEED)

# One row per fold; columns: accuracy, ROC_AUC, F1 score
stats = np.zeros((N_SPLITS, 3))

# roc_auc_score reads the score columns in sorted label order, which is not the
# frequency order of class_labels, so the prior columns are built in this order.
sorted_labels = np.sort(class_labels)

# Cycle the folds
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for k, (train_index, test_index) in enumerate(kf.split(df[:, :-1], df[:, -1])):
    print('Fold number %d' % k)

    # Divide data (the feature slices are unused by this baseline but are kept
    # so the cell leaves the same variables behind as the classifier cell)
    train = df[train_index]
    train_X = train[:, :-1]
    train_Y = train[:, -1]
    test = df[test_index]
    test_X = test[:, :-1]
    test_Y = test[:, -1]

    # Create weighted random guesses. Labels are sampled from the training fold
    # only, so the baseline never sees the labels it is being scored on.
    y_pred = np.array(
        [weighted_random_guess(train) for _ in range(len(test_Y))], dtype=object
    )

    # Calculate accuracy (fraction of guesses equal to the true label)
    acc = np.mean(y_pred == test_Y)
    print('Accuracy: %.3f' % acc)
    stats[k][0] = acc

    # This is essentially a waste of time: ROC_AUC is always 0.5 for random guessing.
    # Every test row gets the same score vector: the training-fold class priors.
    y_pred_probs = np.zeros(shape=(len(test_Y), len(sorted_labels)))
    for i, label in enumerate(sorted_labels):
        y_pred_probs[:, i] = np.count_nonzero(train_Y == label) / len(train_Y)
    auc = roc_auc_score(test_Y, y_pred_probs, multi_class='ovr', labels=sorted_labels)
    print('AUC: %.3f' % auc)
    stats[k][1] = auc

    f1 = f1_score(test_Y, y_pred, average='macro')
    print('F1 Score: %.3f' % f1)
    stats[k][2] = f1

    fig, ax = plt.subplots(figsize=(10, 10))

    cf = confusion_matrix(test_Y, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=class_labels)
    disp.plot(ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    #plt.savefig('figures/weighted{}.pdf'.format(k), format='pdf', bbox_inches='tight')
    plt.show()
    # Release the figure so one-per-fold figures do not pile up in memory
    plt.close(fig)

    print()

print('Average +- 1 std dev')
print('Accuracy: %.3f +- %.3f' % (np.mean(stats[:, 0]), np.std(stats[:, 0])))
print('AUC: %.3f +- %.3f' % (np.mean(stats[:, 1]), np.std(stats[:, 1])))
print('F1 Score: %.3f +- %.3f' % (np.mean(stats[:, 2]), np.std(stats[:, 2])))

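The following performs classification using a True Random Guess: each prediction is drawn uniformly at random from the set of class labels, ignoring how often each label occurs.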

In [None]:
# Class labels ordered from most to least trips; this fixes the row/column
# order of the confusion matrices plotted below.
class_labels = np.array(kinematics['agent_name'].value_counts().index)

# Drop the non-feature columns and move the label ('agent_name') to the last
# column, so that after conversion the label of each row is row[-1].
df = kinematics.drop(columns=['Agent_ID', 'Start_time', 'End_time', 'modality'])
df = df[[c for c in df if c not in ['agent_name']] + ['agent_name']]

df = df.to_numpy()

# Explicit fold count and seed so a fresh kernel reproduces the same numbers.
N_SPLITS = 5
SEED = 0
# The guesses below draw from the global `random` generator
random.seed(SEED)

# One row per fold; columns: accuracy, ROC_AUC, F1 score
stats = np.zeros((N_SPLITS, 3))

n_classes = len(class_labels)

# Cycle the folds
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for k, (train_index, test_index) in enumerate(kf.split(df[:, :-1], df[:, -1])):
    print('Fold number %d' % k)

    # Divide data (the training fold is unused by this baseline but is kept so
    # the cell leaves the same variables behind as the classifier cell)
    train = df[train_index]
    train_X = train[:, :-1]
    train_Y = train[:, -1]
    test = df[test_index]
    test_X = test[:, :-1]
    test_Y = test[:, -1]

    # Create unweighted random guesses: every class is equally likely
    y_pred = np.array(
        [class_labels[random.randint(0, n_classes - 1)] for _ in range(len(test_Y))],
        dtype=object,
    )

    # Calculate accuracy (fraction of guesses equal to the true label)
    acc = np.mean(y_pred == test_Y)
    print('Accuracy: %.3f' % acc)
    stats[k][0] = acc

    # This is essentially a waste of time: ROC_AUC is always 0.5 for random guessing.
    # The scores match the guesser: a uniform 1/K probability for every class,
    # not the class-frequency priors used by the weighted baseline.
    y_pred_probs = np.full((len(test_Y), n_classes), 1.0 / n_classes)
    auc = roc_auc_score(test_Y, y_pred_probs, multi_class='ovr')
    print('AUC: %.3f' % auc)
    stats[k][1] = auc

    f1 = f1_score(test_Y, y_pred, average='macro')
    print('F1 Score: %.3f' % f1)
    stats[k][2] = f1

    fig, ax = plt.subplots(figsize=(10, 10))

    cf = confusion_matrix(test_Y, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=class_labels)
    disp.plot(ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    # Distinct file name so enabling this does not overwrite the weighted-guess figures
    #plt.savefig('figures/random{}.pdf'.format(k), format='pdf', bbox_inches='tight')
    plt.show()
    # Release the figure so one-per-fold figures do not pile up in memory
    plt.close(fig)

    print()

print('Average +- 1 std dev')
print('Accuracy: %.3f +- %.3f' % (np.mean(stats[:, 0]), np.std(stats[:, 0])))
print('AUC: %.3f +- %.3f' % (np.mean(stats[:, 1]), np.std(stats[:, 1])))
print('F1 Score: %.3f +- %.3f' % (np.mean(stats[:, 2]), np.std(stats[:, 2])))