In [1]:
from mongo_connection import Mongo_connection
import numpy as np
import pandas as pd
import itertools
import pair_transition_analysis
import granger_causation_test
from matplotlib import pyplot as plt
from collections import defaultdict
import roi_config
import fixation
import hypothesis_testing
import bulk_run
import anova
import re
import utils

[INFO] Created a mongodb instance.
[INFO] Currently connected to eye_tracking_db/fixation_sequences.
[INFO] Found 50 documents.


In [2]:
mongo = Mongo_connection()
mongo.connect()

[INFO] Created a mongodb instance.
[INFO] Currently connected to eye_tracking_db/fixation_sequences.
[INFO] Found 50 documents.


In [77]:
documents = mongo.find({})
document = documents[49]
print("trial: {}, group: {}, pID: {}".format(document["trial"], document["group"], document["pID"]))

d_data = document["data"]
df_data = pd.DataFrame(d_data)
df_data = fixation.merge_consecutive_fixations_in_same_roi(df_data)
transitions, L = pair_transition_analysis.encode_transition(df_data["roi"], "fix")

trial: 2, group: 1, pID: 032


In [78]:
transitions

'XAXAXAZXZXZXZXZXAZXZXZAXZXZXZXAXSXSXSXZXSXAZXSXSXZXASXSXSXSXSXSXSXAXAX'

In [40]:
def find_pos(ch, s):
    return [m.start() for m in re.finditer(ch, s)]

def create_first_order_transition_matrix(sequence):
    d = {}
    for c in sequence:
        if c in list(d.keys()):
            continue
        d_char = defaultdict(int)
        list_char_pos = find_pos(c, sequence)
        for pos in list_char_pos:
            if pos == len(sequence) - 1: # reach the end of the string
                continue
            next_char = sequence[pos + 1]
            d_char[next_char] += 1
        d[c] = d_char

    df_count = pd.DataFrame(d).reindex(columns=["A", "S", "X", "Z"]).sort_index().fillna(0)
    
    return df_count

In [10]:
create_first_order_transition_matrix(transitions)

(     A     S     X    Z
 A  0.0   5.0   6.0  5.0
 S  3.0   0.0  26.0  0.0
 X  9.0  24.0   0.0  6.0
 Z  4.0   0.0   6.0  0.0,
         A         S         X         Z
 A  0.0000  0.172414  0.157895  0.454545
 S  0.1875  0.000000  0.684211  0.000000
 X  0.5625  0.827586  0.000000  0.545455
 Z  0.2500  0.000000  0.157895  0.000000)

In [11]:
def create_empty_matrix():
    permutations = list(itertools.product("ASXZ", repeat=3))
    good_permutations = []
    for c in permutations:
        if c[0] == c[1] or c[1] == c[2]:
            continue
        good_permutations.append(c)
        
    d = defaultdict(dict)
    for perm in good_permutations:
        first_2gram_transition = perm[0] + perm[1]
        d[first_2gram_transition][perm[2]] = 0
        
    return d

In [39]:
def create_second_order_transition_matrix(sequence):
    d = create_empty_matrix()
    processed_first_2gram_transitions = []
    for i in range(len(sequence) - 1):
        first_2gram_transition = sequence[i:i+2]
        if first_2gram_transition in processed_first_2gram_transitions: # if the 2gram is already processed, then next
            continue
        
        list_pos = find_pos(first_2gram_transition, sequence)
        for pos in list_pos:
            if pos == len(sequence) - 2: # reach the end of the string
                continue
            d[first_2gram_transition][sequence[pos + 2]] += 1
            
        processed_first_2gram_transitions.append(first_2gram_transition)
            
    df_count = pd.DataFrame(d).sort_index()
    
    return df_count

In [170]:
def make_prob_matrix(df_count):
    list_sum = df_count.sum(axis = 0)
    for i, s in enumerate(list_sum):
        if s == 0:
            list_sum[i] = 1 # avoid divide 0
            
    df_prob = df_count/list_sum
    
    return df_prob

In [25]:
def exclude_Z(df):
    df_copy = df.copy()
    cols = list(df_copy.columns)
    for c in cols:
        if "Z" in c:
            df_copy = df_copy.drop(columns = [c])
    df_copy = df_copy.drop(["Z"])
    
    return df_copy

In [37]:
exclude_Z(df_count)

Unnamed: 0,AS,AX,SA,SX,XA,XS
A,0.0,3.0,,2.0,,0.0
S,,1.0,0.0,9.0,1.0,
X,1.0,,0.0,,5.0,12.0


In [168]:
df_count= create_second_order_transition_matrix(transitions)
df_count

Unnamed: 0,AS,AX,AZ,SA,SX,SZ,XA,XS,XZ,ZA,ZS,ZX
A,0.0,3.0,0.0,,2.0,0.0,,0.0,1.0,,0.0,3.0
S,,1.0,0.0,0.0,9.0,0.0,1.0,,0.0,0.0,,2.0
X,1.0,,3.0,0.0,,0.0,5.0,12.0,10.0,1.0,0.0,
Z,0.0,1.0,,0.0,2.0,,3.0,0.0,,0.0,0.0,8.0


In [169]:
df_prob = make_prob_matrix(df_count)
df_prob

here
here
here


Unnamed: 0,AS,AX,AZ,SA,SX,SZ,XA,XS,XZ,ZA,ZS,ZX
A,0.0,0.6,0.0,,0.153846,0.0,,0.0,0.090909,,0.0,0.230769
S,,0.2,0.0,0.0,0.692308,0.0,0.111111,,0.0,0.0,,0.153846
X,1.0,,1.0,0.0,,0.0,0.555556,1.0,0.909091,1.0,0.0,
Z,0.0,0.2,,0.0,0.153846,,0.333333,0.0,,0.0,0.0,0.615385


In [32]:
x = df_prob.values.reshape(-1)
print(x)
x[~np.isnan(x)]

[0.66666667 0.22222222 0.5               nan 0.13043478        nan
        nan 0.11538462 0.5               nan        nan 0.16666667
        nan 0.77777778 0.         0.         0.73913043        nan
 0.33333333        nan 0.         0.2               nan 0.33333333
 0.33333333        nan 0.5        0.6               nan        nan
 0.66666667 0.88461538 0.5        0.4               nan        nan
 0.         0.                nan 0.4        0.13043478        nan
 0.         0.                nan 0.4               nan 0.5       ]


array([0.66666667, 0.22222222, 0.5       , 0.13043478, 0.11538462,
       0.5       , 0.16666667, 0.77777778, 0.        , 0.        ,
       0.73913043, 0.33333333, 0.        , 0.2       , 0.33333333,
       0.33333333, 0.5       , 0.6       , 0.66666667, 0.88461538,
       0.5       , 0.4       , 0.        , 0.        , 0.4       ,
       0.13043478, 0.        , 0.        , 0.4       , 0.5       ])

In [17]:
df_count/df_count.sum(axis = 0)

Unnamed: 0,AS,AX,AZ,SA,SX,SZ,XA,XS,XZ,ZA,ZS,ZX
A,0.0,0.25,0.142857,,0.428571,0.6,,0.1,0.571429,,0.0,0.416667
S,,0.375,0.285714,0.0,0.428571,0.0,0.0,,0.0,0.166667,,0.083333
X,0.5,,0.571429,1.0,,0.4,0.75,0.6,0.428571,0.5,0.5,
Z,0.5,0.375,,0.0,0.142857,,0.25,0.3,,0.333333,0.5,0.5


In [152]:
pd.DataFrame(d).sort_index().fillna(0)

Unnamed: 0,AS,AX,AZ,SA,SX,SZ,XA,XS,XZ,ZA,ZS,ZX
A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
df_count/df_count.sum(axis = 1)

Unnamed: 0,A,AS,AX,AZ,S,SA,SX,SZ,X,XA,XS,XZ,Z,ZA,ZS,ZX
A,,,,,,,,,,,,,,,,
S,,,,,,,,,,,,,,,,
X,,,,,,,,,,,,,,,,
Z,,,,,,,,,,,,,,,,


In [32]:
trans_matrix = pair_transition_analysis.create_transition_count_matrix(transitions)
trans_matrix

t,A,S,X,Z
t+1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0,1,12,12
S,2,0,10,2
X,16,8,0,12
Z,7,5,14,0


In [188]:
trans_matrix.values.reshape(-1)

array([0.        , 0.02941176, 0.1       , 0.1       , 0.16666667,
       0.        , 0.775     , 0.2       , 0.83333333, 0.82352941,
       0.        , 0.7       , 0.        , 0.14705882, 0.125     ,
       0.        ])

In [256]:
documents = mongo.find({})
X = []
Y = []

for document in documents:
    print("trial: {}, group: {}, pID: {}".format(document["trial"], document["group"], document["pID"]))
    
    d_data = document["data"]
    df_data = pd.DataFrame(d_data)
    df_data = fixation.merge_consecutive_fixations_in_same_roi(df_data)
    transitions, L = pair_transition_analysis.encode_transition(df_data["roi"], "fix")

    df_count = create_second_order_transition_matrix(transitions)
#     df_count = create_first_order_transition_matrix(transitions)
#     df_count = exclude_Z(df_count)
    df_prob = make_prob_matrix(df_count)
    x = df_prob.values.reshape(-1)
    x = x[~np.isnan(x)]
    X.append(x)
    Y.append(document["rating"])

trial: 1, group: 2, pID: 001
trial: 1, group: 2, pID: 002
trial: 1, group: 1, pID: 003
trial: 1, group: 1, pID: 004
trial: 1, group: 2, pID: 005
trial: 1, group: 2, pID: 006
trial: 1, group: 1, pID: 007
trial: 1, group: 1, pID: 008
trial: 1, group: 2, pID: 009
trial: 1, group: 1, pID: 010
trial: 1, group: 2, pID: 011
trial: 1, group: 2, pID: 014
trial: 1, group: 2, pID: 016
trial: 1, group: 2, pID: 017
trial: 1, group: 2, pID: 019
trial: 1, group: 2, pID: 020
trial: 1, group: 2, pID: 021
trial: 1, group: 2, pID: 022
trial: 1, group: 1, pID: 023
trial: 1, group: 2, pID: 025
trial: 1, group: 2, pID: 026
trial: 1, group: 1, pID: 027
trial: 1, group: 1, pID: 029
trial: 1, group: 1, pID: 032
trial: 2, group: 2, pID: 001
trial: 2, group: 2, pID: 002
trial: 2, group: 1, pID: 003
trial: 2, group: 1, pID: 004
trial: 2, group: 2, pID: 005
trial: 2, group: 2, pID: 006
trial: 2, group: 1, pID: 007
trial: 2, group: 1, pID: 008
trial: 2, group: 2, pID: 009
trial: 2, group: 1, pID: 010
trial: 2, grou

In [257]:
X

[array([0.66666667, 0.22222222, 0.5       , 0.13043478, 0.        ,
        0.11538462, 0.5       , 0.        , 0.16666667, 0.77777778,
        0.        , 0.        , 0.73913043, 0.        , 0.33333333,
        0.        , 0.2       , 0.33333333, 0.33333333, 0.5       ,
        0.6       , 0.        , 0.66666667, 0.88461538, 0.5       ,
        0.4       , 0.        , 0.        , 0.        , 0.4       ,
        0.13043478, 0.        , 0.        , 0.4       , 0.        ,
        0.5       ]),
 array([0.        , 0.57894737, 0.5       , 0.28571429, 0.        ,
        0.17391304, 0.        , 0.        , 0.5       , 0.42105263,
        0.5       , 0.25      , 0.61904762, 0.        , 0.15      ,
        0.5       , 0.        , 0.5       , 0.75      , 0.        ,
        0.75      , 1.        , 0.75      , 0.73913043, 0.5       ,
        1.        , 1.        , 0.25      , 0.        , 0.        ,
        0.0952381 , 0.1       , 0.08695652, 0.        , 0.        ,
        0.        ]),
 arr

In [258]:
np.array(X).shape

(50, 36)

# Classification

In [259]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [260]:
# X = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

clf = LogisticRegression(random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.score(X_test, y_test))
print(classification_report(y_pred, y_test))

0.8
              precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.75      0.86      0.80         7

    accuracy                           0.80        15
   macro avg       0.80      0.80      0.80        15
weighted avg       0.81      0.80      0.80        15



In [272]:
clf = AdaBoostClassifier()
scores = cross_val_score(clf, X, Y, cv=5, scoring = "accuracy")
scores

array([0.5, 0.8, 0.5, 0.5, 0.9])

In [200]:
h = .02  # step size in the mesh

names = ["Logistic Regression","Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    LogisticRegression(random_state=1),
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [275]:
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X, Y, cv=4, scoring = "accuracy")
    print(f"{name}: {scores}, mean: {np.mean(scores)}, std: {np.std(scores)}")

Logistic Regression: [0.61538462 0.46153846 0.83333333 0.91666667], mean: 0.7067307692307693, std: 0.17927958461821855
Nearest Neighbors: [0.61538462 0.53846154 0.75       0.5       ], mean: 0.6009615384615384, std: 0.09555099478053745
Linear SVM: [0.46153846 0.46153846 0.91666667 0.91666667], mean: 0.6891025641025641, std: 0.22756410256410253
RBF SVM: [0.61538462 0.53846154 0.66666667 0.58333333], mean: 0.6009615384615384, std: 0.046749846621201345
Gaussian Process: [0.53846154 0.46153846 0.58333333 0.75      ], mean: 0.5833333333333334, std: 0.10562344264925692
Decision Tree: [0.53846154 0.38461538 0.58333333 0.75      ], mean: 0.5641025641025641, std: 0.1301929231512173
Random Forest: [0.53846154 0.69230769 0.41666667 0.75      ], mean: 0.5993589743589745, std: 0.13078337027630235
Neural Net: [0.53846154 0.61538462 0.75       0.91666667], mean: 0.7051282051282051, std: 0.14369558827858905
AdaBoost: [0.46153846 0.46153846 0.58333333 0.83333333], mean: 0.5849358974358975, std: 0.15178



In [226]:
# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    
    if hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_test)
    else:
        y_score = clf.predict_proba(X_test)[:, 1]

    thresh = 0.5

    pred = []
    for v in y_score:
        if v > thresh:
            pred.append(1)
        else:
            pred.append(0)
    

    print(name)
    print("AUC: ", roc_auc_score(y_test, y_score))
    target_names = ['group 1', 'group 2']
    print(classification_report(y_test, pred, target_names=target_names))

Logistic Regression
AUC:  0.8571428571428572
              precision    recall  f1-score   support

     group 1       0.58      1.00      0.74         7
     group 2       1.00      0.38      0.55         8

    accuracy                           0.67        15
   macro avg       0.79      0.69      0.64        15
weighted avg       0.81      0.67      0.63        15

Nearest Neighbors
AUC:  0.5
              precision    recall  f1-score   support

     group 1       0.46      0.86      0.60         7
     group 2       0.50      0.12      0.20         8

    accuracy                           0.47        15
   macro avg       0.48      0.49      0.40        15
weighted avg       0.48      0.47      0.39        15

Linear SVM
AUC:  0.8571428571428572
              precision    recall  f1-score   support

     group 1       0.47      1.00      0.64         7
     group 2       0.00      0.00      0.00         8

    accuracy                           0.47        15
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Neural Net
AUC:  0.8928571428571428
              precision    recall  f1-score   support

     group 1       0.71      0.71      0.71         7
     group 2       0.75      0.75      0.75         8

    accuracy                           0.73        15
   macro avg       0.73      0.73      0.73        15
weighted avg       0.73      0.73      0.73        15

AdaBoost
AUC:  0.9464285714285714
              precision    recall  f1-score   support

     group 1       0.78      1.00      0.88         7
     group 2       1.00      0.75      0.86         8

    accuracy                           0.87        15
   macro avg       0.89      0.88      0.87        15
weighted avg       0.90      0.87      0.87        15

Naive Bayes
AUC:  0.8392857142857143
              precision    recall  f1-score   support

     group 1       0.71      0.71      0.71         7
     group 2       0.75      0.75      0.75         8

    accuracy                           0.73        15
   macro avg       0.

