In [1]:
import glob
from collections import defaultdict

import pandas as pd
import numpy as np
import pygaze
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import scipy
from tqdm import tqdm
from sklearn.cluster import DBSCAN
from sklearn.metrics import roc_auc_score, roc_curve, auc

import detectors
import gazeplotter
# import local lib
import eye_metrics_utils
import data_utils
import gaze_entropy

In [2]:
import warnings
# warnings.filterwarnings(action='once')
# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides library warnings (deprecations, convergence, ...).
warnings.filterwarnings('ignore')

In [3]:
def run_all(df_data):
    """Extract eye-movement events and entropy measures from one data window.

    Parameters
    ----------
    df_data : pandas.DataFrame
        One slice of a recording. It must provide a 'Start Time (secs)'
        column; the detectors receive a copy of the frame.

    Returns
    -------
    None
        When ``data_utils.check_percentage_null`` yields a value below 0.5.
    tuple
        ``(Efix, Hs, Ht, total_time, Eblk, Esac, Emsac)``: detected fixations,
        the two values returned by ``gaze_entropy.entropy``, the span between
        the first and last start time, and the detected blinks, saccades and
        microsaccades.
    """
    window = df_data.copy()

    # Original intent: discard windows with more than 50% missing values.
    # NOTE(review): that matches `< 0.5` only if the helper returns the
    # fraction of valid samples -- confirm against data_utils.
    if data_utils.check_percentage_null(window) < 0.5:
        return None

    timestamps = np.array(df_data['Start Time (secs)'].tolist())

    fixations = eye_metrics_utils.detect_fixations(window)
    blinks = eye_metrics_utils.detect_blinks(window)
    saccades = eye_metrics_utils.detect_saccades(window)
    microsaccades = eye_metrics_utils.detect_microsaccades(window)

    # Columns 3 onward of every fixation row feed the entropy computation
    # (presumably the gaze coordinates -- TODO confirm).
    entropy_input = np.array(fixations).T[3:].T
    hs, ht = gaze_entropy.entropy(entropy_input)

    elapsed = timestamps[-1] - timestamps[0]
    return fixations, hs, ht, elapsed, blinks, saccades, microsaccades
    

In [4]:
# Collect every recording under data/. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them: the order of the extracted windows
# (and therefore the seeded train/test splits) stays the same on every machine.
csv_files = sorted(glob.glob("data/*.csv"))

In [5]:
# Split the recordings into the three trials using the trial label that is
# embedded in each file path.
csv_files_one = list(filter(lambda path: "One Gaze-Vergence" in path, csv_files))
csv_files_two = list(filter(lambda path: "Two Gaze-Vergence" in path, csv_files))
csv_files_three = list(filter(lambda path: "Three Go-Around Gaze-Vergence" in path, csv_files))

# classification

In [None]:
df_par = pd.read_csv("participant.csv")

# group[0] / group[1]: participant IDs whose 'Group' value contains "1" / "2",
# each shortened to its last three characters.
group = [
    [pid[-3:] for pid in df_par.loc[df_par['Group'].str.contains(label), 'ID'].tolist()]
    for label in ("1", "2")
]
group

In [None]:
# Extract eye-movement features for every (group, trial) combination.
# Layout: feature_groups[group_idx][trial_idx][feature_name] -> list holding
# one entry per accepted window.
feature_groups = []
for g in group:
    trials = []
    # The loop variable is deliberately NOT called `csv_files`: reusing that
    # name would overwrite the global file list and silently break a re-run of
    # the cell that splits the files by trial.
    for trial_files in [csv_files_one, csv_files_two, csv_files_three]:
        ret = defaultdict(list)
        for csv_path in trial_files:
            # NOTE(review): fixed character positions -- assumes the three
            # characters at offsets 14-16 of the path are the participant ID;
            # confirm against the actual file naming scheme.
            par_id = csv_path[14:17]
            if par_id not in g:
                continue

            df_data = pd.read_csv(csv_path)
            print(csv_path, len(df_data))

            for v in data_utils.data_slicing(df_data, stride = 1200):
                r = run_all(v)
                # run_all returns None for rejected windows.
                if r is None:
                    continue

                Efix, Hs, Ht, total_time, Eblk, Esac, Emsac = r
                ret["Eblk"].append(Eblk)
                ret["Efix"].append(Efix)
                ret["Esac"].append(Esac)
                ret["Emsac"].append(Emsac)
                ret["Hs"].append(Hs)
                ret["Ht"].append(Ht)
                ret["total_time"].append(total_time)
        trials.append(ret)
    feature_groups.append(trials)

In [None]:
# Build the per-window feature table used by the classifiers.
# Only one trial is used: index 1 selects the second entry of each group's
# trial list.
TRIAL_IDX = 1
TABLE_COLUMNS = ["fix_dur", "Hs", "Ht", "fix_rate", "blk_rate", "sac_rate", "msac_rate", "group"]

group_frames = []
for j, g in enumerate(feature_groups):
    trial = g[TRIAL_IDX]
    total_time = np.array(trial['total_time'])

    # Mean of column 2 of each window's fixation rows
    # (presumably the fixation duration -- TODO confirm against the detector).
    fix_dur = np.array([np.mean(np.array(p).T[2]) for p in trial['Efix']])

    # Number of detected events per window divided by the window's time span.
    rates = {
        key: np.array([len(events) for events in trial[key]]) / total_time
        for key in ('Efix', 'Eblk', 'Esac', 'Emsac')
    }

    print(len(fix_dur), len(rates['Efix']), len(rates['Esac']), len(rates['Eblk']), len(rates['Emsac']))

    # Class label = position of the group in feature_groups. A dedicated name is
    # used so the global `group` (participant ID lists) is not overwritten,
    # which would break a re-run of the feature-extraction cell.
    labels = np.full(len(trial['Hs']), j)
    df = pd.DataFrame(
        list(zip(fix_dur, trial['Hs'], trial['Ht'],
                 rates['Efix'], rates['Eblk'], rates['Esac'], rates['Emsac'], labels)),
        columns=TABLE_COLUMNS,
    ).astype({"group": "int"})
    group_frames.append(df)

# Concatenate once instead of growing the frame inside the loop.
df_x = pd.concat(group_frames) if group_frames else pd.DataFrame()

df_x


# classification models

In [None]:
# Row count of the assembled feature table.
len(df_x)

In [None]:
# Column dtypes of the feature table.
df_x.dtypes

In [None]:
# Drop every row in which at least one of the first seven columns equals zero,
# then show how many rows remain.
has_zero = df_x.iloc[:, :7].eq(0).any(axis=1)
df_x = df_x[~has_zero]
len(df_x)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Feature matrix: the seven per-window features, in this column order.
X = df_x[['fix_dur', 'Hs', 'Ht', "fix_rate", "blk_rate", "sac_rate", "msac_rate"]].values

# Labels as a 1-D vector of shape (n_samples,). Selecting with [['group']]
# would give an (n, 1) column vector, which scikit-learn only accepts by
# implicitly flattening it (with a DataConversionWarning).
y = df_x['group'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=5)

# Baseline: logistic regression on the raw features; the last expression is the
# mean accuracy on the held-out half.
clf = LogisticRegression(random_state=1).fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
features = ['fix_dur', 'Hs', 'Ht', "fix_rate", "blk_rate", "sac_rate", "msac_rate"]
print(clf.classes_)
# One coefficient per feature name; assumes `features` lists the columns in the
# same order that was used to fit `clf`.
for idx, feature_name in enumerate(features):
    weight = round(clf.coef_[0][idx], 3)
    print(feature_name + "      \t: " + str(weight))

In [None]:
# Per-class precision / recall / F1 of the fitted classifier on the test rows.
target_names = ['group 1', 'group 2']
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

In [None]:
# Continuous decision scores for the test rows (rather than hard class
# predictions); the ROC / AUC cells below consume these.
y_score = clf.decision_function(X_test)

In [None]:
# Area under the ROC curve computed from the test labels and decision scores.
print("AUC: ", roc_auc_score(y_test, y_score))

In [None]:
# False/true positive rates along the ROC curve (the third return value of
# roc_curve is discarded) and the area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

In [None]:
# ROC curve of the fitted classifier, with the diagonal as the chance-level
# reference line.
lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=lw,
        label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [None]:
h = .02  # step size in the mesh

# Seed shared by every estimator that draws random numbers, so repeated runs of
# the comparison report the same scores.
RANDOM_STATE = 42

# `names` and `classifiers` are parallel lists: names[k] labels classifiers[k].
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=RANDOM_STATE),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [None]:
# Compare all classifiers on one held-out split of the feature matrix.

# Estimators and scatter colours expect 1-D labels.
y_flat = np.ravel(y)

# Split first, then fit the scaler on the training rows only, so no test-set
# statistics leak into the preprocessing. `X` itself is left unmodified, which
# keeps this cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_flat, test_size=.4, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Scatter of the first two standardized feature columns. Decision surfaces are
# not drawn: the models are trained on all feature columns, so they cannot be
# evaluated on a 2-D mesh.
X_scaled = np.vstack([X_train, X_test])
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title("Input data")
# Training points (opaque) and test points (semi-transparent).
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
           edgecolors='k')
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
           edgecolors='k')
ax.set_xlim(X_scaled[:, 0].min() - .5, X_scaled[:, 0].max() + .5)
ax.set_ylim(X_scaled[:, 1].min() - .5, X_scaled[:, 1].max() + .5)
ax.set_xticks(())
ax.set_yticks(())

# Fit and score each classifier. The loop variable is `model`, not `clf`, so
# the logistic regression fitted earlier is not overwritten.
target_names = ['group 1', 'group 2']
for name, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    model_pred = model.predict(X_test)

    # Continuous score for the AUC: decision_function where the estimator has
    # one, otherwise the predicted probability of the second class.
    if hasattr(model, "decision_function"):
        model_score = model.decision_function(X_test)
    else:
        model_score = model.predict_proba(X_test)[:, 1]

    print(name)
    print("AUC: ", roc_auc_score(y_test, model_score))
    print(classification_report(y_test, model_pred, target_names=target_names))

plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): looks like a leftover scratch experiment -- nothing else in the
# visible notebook reads `a`. It fits a StandardScaler on a 2 x 12 list and
# replaces `a` with the transformed array.
a = [[1,3,4,5,6,2,1,3,5,2,3,1],[1,2,3,1,2,3,4,5,3,2,2,3]]

a = StandardScaler().fit_transform(a)