In [2]:
%load_ext autoreload
%autoreload 
%reload_ext autoreload

In [3]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from features import topological_features, aggregate_features, get_vars, extract_features
import pickle
import rolx
import numpy as np
import utils
import random

def get_scores(train_pred, train_true, val_pred, val_true, test_pred, test_true):
    train_accuracy = np.mean(train_pred == train_true)
    print (train_accuracy)

    train_f1 =  precision_recall_fscore_support(train_true, train_pred)
    print (train_f1[0][1])
    print (train_f1[1][1])
    print (train_f1[2][1])
    
    val_accuracy = np.mean(val_pred == val_true)
    print (val_accuracy)
    
    val_f1 =  precision_recall_fscore_support(val_true, val_pred)
    print (val_f1[0][1])
    print (val_f1[1][1])
    print (val_f1[2][1])

    test_accuracy = np.mean(test_pred == test_true)
    print (test_accuracy)
    
    test_f1 =  precision_recall_fscore_support(test_true, test_pred)
    print (test_f1[0][1])
    print (test_f1[1][1])
    print (test_f1[2][1])


In [4]:
def get_rolx(fname, fname_extended, roles=3):
    G, dict_to_graph, graph_to_dict = rolx.load_graph_igraph(fname, fname_extended)
    H, R = rolx.extract_rolx_roles(G, roles)
    print(H.shape, R.shape)
    H.tolist()

    adj_mat = G.get_adjacency()
    _, video_dict_list, graph_to_dict, neighbors, fields = get_vars(fname, fname_extended)
    # np.save('rolx_features', H)
    # H = np.load('rolx_features.npy')
    
    return adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields

def get_features(adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields, agg_flag=False):
    X = []
    y = []
    pos_data = []
    neg_data = []
    for row in range(adj_mat.shape[0]):
        H_row = np.array(H[row]).flatten()
        for col in range(adj_mat.shape[1]):
            H_total = np.array(H[col][0]).flatten() + H_row
            # print 'pre concatenated', type(H_total), H_total

            # flag for adding into agg and topo features
            if agg_flag:
                local_features = extract_features(video_dict_list, graph_to_dict, neighbors, fields, row, col) 
                # skip if doesnt exist
                if not local_features:
                    continue

                H_total = np.concatenate([H_total, local_features]) 
                # print 'after concatenated', type(H_total), H_total

            if adj_mat[row][col] > 0:
                pos_data.append((H_total, adj_mat[row][col]))
            else:
                neg_data.append((H_total, adj_mat[row][col]))
    
    return pos_data, neg_data

In [5]:
fname = './dataset/0222/0.txt'
fname_extended = './dataset/0222/1.txt'

adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields = get_rolx(fname, fname_extended)
pos_data, neg_data = get_features(adj_mat, H, video_dict_list, graph_to_dict, neighbors, fields)

Creating Vertex Features matrix


  x_star, residuals, rank, s = lstsq(A, w)


V is a 3356 by 485 matrix.
Node-role matrix is of dimensions 3356 by 3
[[0.         0.0199036  0.00744173]
 [0.         0.02791589 0.03119776]
 [0.         0.01000588 0.01839252]
 ...
 [0.         0.00740323 0.01428153]
 [0.         0.00737681 0.01464255]
 [0.         0.00737681 0.01464255]]
[[3.97698502e+04 1.14131367e-01 2.00000000e+01 ... 1.14373638e-03
  1.14373638e-03 2.00000000e+01]
 [6.67290721e+04 1.22706739e-01 4.00000000e+00 ... 2.08546433e-04
  2.08546433e-04 4.00000000e+00]
 [0.00000000e+00 1.11362115e-01 3.00000000e+00 ... 1.85490526e-04
  1.85490526e-04 3.00000000e+00]
 ...
 [3.33333333e-01 5.58823529e-01 4.00000000e+00 ... 2.06357331e-04
  2.06357331e-04 4.00000000e+00]
 [0.00000000e+00 5.58823529e-01 4.00000000e+00 ... 1.98521122e-04
  1.98521122e-04 4.00000000e+00]
 [0.00000000e+00 5.58823529e-01 4.00000000e+00 ... 1.98521122e-04
  1.98521122e-04 4.00000000e+00]]
[[0.00408449 0.         0.01162538 0.         0.         0.
  0.         0.01162491]
 [0.00836937 0.0790076

In [6]:
fname_test = './dataset/080327/0.txt'
fname_test_extended = './dataset/080327/1.txt'

adj_mat_test, H_test, video_dict_list_test, graph_to_dict_test, neighbors_test, fields_test = get_rolx(fname_test, fname_test_extended)
pos_data_test, neg_data_test = get_features(adj_mat_test, H_test, video_dict_list_test, graph_to_dict_test, neighbors_test, fields_test)


Creating Vertex Features matrix


  x_star, residuals, rank, s = lstsq(A, w)


V is a 4330 by 408 matrix.




Node-role matrix is of dimensions 4330 by 3
[[0.19925109 0.16108741 0.00058692]
 [0.07929716 0.13149653 0.05354079]
 [0.19925109 0.16108741 0.00058692]
 ...
 [0.09832276 0.13102633 0.05499859]
 [0.1229468  0.13840692 0.06407499]
 [0.08100141 0.12920092 0.08111056]]
[[1.27952413e+04 8.77933545e-02 2.00000000e+01 ... 3.99064480e-04
  3.99064480e-04 2.00000000e+01]
 [7.14285714e-01 8.07364935e-02 1.00000000e+01 ... 2.10219785e-04
  2.10219785e-04 1.00000000e+01]
 [1.27952413e+04 8.77933545e-02 2.00000000e+01 ... 3.99064480e-04
  3.99064480e-04 2.00000000e+01]
 ...
 [1.06668301e+01 5.27777778e-01 1.10000000e+01 ... 2.52185559e-04
  2.52185559e-04 1.10000000e+01]
 [1.25640720e+00 5.50724638e-01 1.20000000e+01 ... 2.52082882e-04
  2.52082882e-04 1.20000000e+01]
 [0.00000000e+00 5.06666667e-01 9.00000000e+00 ... 1.95368248e-04
  1.95368248e-04 9.00000000e+00]]
[[0.         0.         0.02484668 0.         0.         0.
  0.         0.02484638]
 [0.03682221 0.12328052 0.12870904 0.15966718 0.1

In [7]:
def split_data_balanced(pos_data, neg_data):
    # creates positive and negative dataset for more uniform distribution of data
    X = [pos_data[i][0] for i in range(len(pos_data))]
    Y = [pos_data[i][1] for i in range(len(pos_data))]

    random_indices = sorted(random.sample(range(len(neg_data)), len(X)))
    X_neg = [neg_data[i][0] for i in random_indices]
    Y_neg = [neg_data[i][1] for i in random_indices]

    X.extend(X_neg)
    Y.extend(Y_neg)

    X_array = np.array(X)
    Y_array = np.array(Y)
    
    print (X_array.shape, Y_array.shape)
    from sklearn.preprocessing import normalize
    # change this line to change the number of features
    X_array = X_array[:, np.r_[:3]]
    print (X_array.shape)

    # runs training by splitting train/test sets
    return train_test_split(X_array, Y_array, test_size=0.2, random_state=42)

In [8]:
def split_data(pos_data, neg_data):
    # runs training by splitting train/test sets
    X = [pos_data[i][0] for i in range(len(pos_data))]
    Y = [pos_data[i][1] for i in range(len(pos_data))]

    X_neg = [neg_data[i][0] for i in range(len(neg_data))]
    Y_neg = [neg_data[i][1] for i in range(len(neg_data))]

    X.extend(X_neg)
    Y.extend(Y_neg)

    X_array = np.array(X)
    Y_array = np.array(Y)
#     X_array = X_array[:, np.r_[:3]]

    X_train, X_test, y_train, y_test = train_test_split(X_array, Y_array, test_size=0.01, random_state=42)

    test_zero_vals = np.argwhere(y_test == 0)
    test_one_vals = np.argwhere(y_test == 1)
    print ('test zero vals', len(test_zero_vals), 'test one vals', len(test_one_vals))

    zero_vals = np.argwhere(y_train == 0)
    one_vals = np.argwhere(y_train == 1)
    random_indices = zero_vals[sorted(random.sample(range(len(zero_vals)), len(one_vals)))]
    random_indices = np.concatenate([random_indices, one_vals]).reshape(-1)

    X_train = X_train[random_indices]
    y_train = y_train[random_indices]
    print (X_train.shape, y_train.shape)

    train_zero_vals = np.argwhere(y_train == 0)
    train_one_vals = np.argwhere(y_train == 1)
    print ('train zero vals', len(train_zero_vals), 'train one vals', len(train_one_vals))
    return X_train, X_test, y_train, y_test

In [9]:
X_train, X_val, y_train, y_val = split_data(pos_data, neg_data)
_, X_test, _, y_test = split_data(pos_data_test, neg_data_test)

test zero vals 112185 test one vals 443
(83300, 3) (83300,)
train zero vals 41650 train one vals 41650
test zero vals 187235 test one vals 254
(57420, 3) (57420,)
train zero vals 28710 train one vals 28710


In [10]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                              random_state=0)
clf.fit(X_train, y_train)

print ('random forest')
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)
np.savetxt('dataset/results.txt', test_predictions)

random forest
0.6023409363745498
0.713970182219768
0.3414885954381753
0.4620032157996459
0.8628848954078915
0.010379945162553859
0.35891647855530473
0.020176384747160717
0.44181258633839854
0.0017657726448410804
0.7283464566929134
0.0035230042656916507


In [11]:
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)
print ('logistic regression')
# makes predictions
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)
np.savetxt('dataset/results.txt', test_predictions)

logistic regression
0.5843337334933973
0.6176204667983792
0.4428331332533013
0.5158222980437123
0.7323667294100934
0.006841126461211477
0.4650112866817156
0.013483881525118637
0.6828293926577026
0.0024368928775503344
0.5708661417322834
0.004853069147867996


In [12]:
from sklearn import svm

svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

print ('svm_rbf')
# makes predictions
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

train_preds = [pred > 0.5 for pred in train_predictions]
test_preds = [pred > 0.5 for pred in test_predictions]

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)
# np.savetxt('dataset/results.txt', test_predictions)

svm_rbf
0.5843337334933973
0.6176204667983792
0.4428331332533013
0.5158222980437123
0.7323667294100934
0.006841126461211477
0.4650112866817156
0.013483881525118637
0.6828293926577026
0.0024368928775503344
0.5708661417322834
0.004853069147867996


In [13]:
from sklearn import svm

svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

print ('svm linear')
# makes predictions
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

train_preds = [pred > 0.5 for pred in train_predictions]
test_preds = [pred > 0.5 for pred in test_predictions]

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)
# np.savetxt('dataset/results.txt', test_predictions)

svm linear
0.5843337334933973
0.6176204667983792
0.4428331332533013
0.5158222980437123
0.7323667294100934
0.006841126461211477
0.4650112866817156
0.013483881525118637
0.6828293926577026
0.0024368928775503344
0.5708661417322834
0.004853069147867996


In [14]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

print ('knn')
# makes predictions
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

train_preds = [pred > 0.5 for pred in train_predictions]
test_preds = [pred > 0.5 for pred in test_predictions]

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)

knn
0.9587755102040816
0.9246788460683647
0.9989195678271309
0.9603665574073219
0.855044926661221
0.024830969903667805
0.9367945823927766
0.048379575658661696
0.8346943020657211
0.0018151761693300055
0.2204724409448819
0.0036007072817874942


In [15]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)

print ('naive bayes')
# makes predictions
train_predictions = clf.predict(X_train)
val_predictions = clf.predict(X_val)
test_predictions = clf.predict(X_test)

train_preds = [pred > 0.5 for pred in train_predictions]
test_preds = [pred > 0.5 for pred in test_predictions]

get_scores(train_predictions, y_train, val_predictions, y_val, test_predictions, y_test)

naive bayes
0.5272388955582233
0.5202629087856543
0.69937575030012
0.5966673153145772
0.3606829562808538
0.004074844074844075
0.6636568848758465
0.008099954541071453
0.2321042834512958
0.0017403966162806823
0.9881889763779528
0.0034746736437005965
