In [573]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import csr_matrix
from seqlearn.hmm import MultinomialHMM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [552]:
# Collect the file names from both annotation directories and derive the
# matching feature-file names by swapping 'sent' for 'dat'.
fully_annotated_files = os.listdir('fullyAnnotatedSequences')
partially_annotated_files = os.listdir('partiallyAnnotatedSequences')
# Fixed: the original concatenated the undefined name `full_annotated_files`,
# which raises NameError when the notebook is run on a fresh kernel.
feature_files = fully_annotated_files + partially_annotated_files
files = [filename.replace('sent','dat') for filename in feature_files]

# NOTE(review): this drops whatever entry happens to sit at position 50.
# The listing above is not sorted, so the removed file may differ between
# machines -- TODO confirm which file is meant and remove it by name.
del files[50]
len(files)

100

In [532]:
# Snigdha Chaturvedi's Features
# Read every per-sequence feature file, tag its rows with the originating
# '.sent' filename and a 1-based sequence number, and stack the pieces.
frames = []
lengths = []  # number of rows read from each file, in `files` order
for i, file in enumerate(files, start=1):
    small_df = pd.read_csv('sequences_features/' + file)
    lengths.append(len(small_df))
    small_df['filename'] = file.replace('.dat','.sent')
    small_df['sequence'] = i
    frames.append(small_df)

# A single concat replaces DataFrame.append inside the loop: append copied
# the whole accumulated frame on every iteration and no longer exists in
# pandas 2.x. Row indices are kept as read, exactly as append kept them.
df = pd.concat(frames)
df['unique_ID'] = df["filename"].map(str) + df["1:sentId"]

# 'unique_ID' is kept in this list only so it survives the column selection
# below and can be used as the key when merging with the other feature set.
snigdha_feature_vars = ['29:posFramesFired',
 '30:negFramesFired',
 '31:otherFramesFired',
 '32:posFramewrtCharFired',
 '33:negFramewrtCharFired',
 '34:otherFrameswrtCharFired','unique_ID']
snigdha_features = df[snigdha_feature_vars]
len(snigdha_features)


792

In [533]:
# Lookup table: sequence number -> filename, parsed from a ':::'-separated
# text file that has no header row.
mapping = pd.read_csv(
    'sequence_dictionary.txt',
    sep=':::',
    header=None,
    engine='python',
)
mapping.columns = ['sequence', 'filename']

# Pair the two columns row by row; a later row with a repeated sequence
# number overwrites the earlier entry.
map_dict = dict(zip(mapping['sequence'], mapping['filename']))


In [534]:
# Very Fun Team's features
vft_features = pd.read_csv('fully_features.csv')

# Resolve each row's sequence number to its filename (KeyError if a sequence
# number is missing from map_dict), then build the same filename + sentence-id
# key that the other feature table uses.
sequences = vft_features['sequence'].tolist()
seq_filenames = [map_dict[seq_id] for seq_id in sequences]
vft_features['filename'] = seq_filenames
vft_features['unique_ID'] = vft_features['filename'].map(str) + vft_features['sid']

# Columns from this table that are later used as model features.
vft_feature_vars = [
    'pos_acts_toeachother',
    'neg_acts_toeachother',
    'pos_acts_together',
    'neg_acts_together',
    'pos_char1_acts',
    'neg_char1_acts',
    'pos_char2_acts',
    'neg_char2_acts',
    'pos',
    'neg',
]
len(vft_features)

792

In [535]:
# Join the two feature tables on the shared 'unique_ID' key and keep a CSV
# copy of the result on disk.
merged_features = pd.merge(vft_features, snigdha_features, on='unique_ID')
merged_features.to_csv("merged_features.csv")
len(merged_features)

792

In [536]:
# get target labels
annotations = pd.read_csv('our_annotations.txt', sep=':::', engine='python')

# Rebuild the filename + sentence-id key so the labels can be joined onto
# the merged feature table.
sequences = annotations['sequence'].tolist()
seq_filenames = [map_dict[seq_id] for seq_id in sequences]
annotations['filename'] = seq_filenames
annotations['unique_ID'] = annotations['filename'].map(str) + annotations['sid']
annotations = annotations[['unique_ID','manualLabel']]

# Attach the manual label to every feature row and save a CSV copy.
labelled_features = pd.merge(merged_features, annotations, on='unique_ID')
labelled_features.to_csv("labelled_features.csv")
len(labelled_features)

792

In [578]:
# Keep only the rows whose manual label is 'p' or 'n'; every other label
# value is dropped. The filtered table is also written to disk.
has_pn_label = labelled_features['manualLabel'].isin(['p','n'])
small_features = labelled_features[has_pn_label]
small_features.to_csv('filtered_labelled_features.csv')
len(small_features)


603

In [538]:
# Model input columns: both feature lists combined, minus the 'unique_ID'
# join key. The key is removed by name instead of `del ...[6]`, so the result
# stays correct if the feature lists are ever reordered or extended.
all_feature_vars = [
    var for var in snigdha_feature_vars + vft_feature_vars
    if var != 'unique_ID'
]

def split_stuff(df, feature_vars=None):
    """Turn a labelled feature frame into (X, y, lengths) model inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'manualLabel', 'sequence' and every feature column.
        Rows of one sequence are expected to be contiguous.
    feature_vars : list of str, optional
        Feature columns to use. Defaults to the notebook-level
        ``all_feature_vars``.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        One row per row of ``df``, one column per feature.
    y : numpy.ndarray
        Labels as fixed-width unicode ('<U5'); longer labels are truncated
        to five characters by the cast.
    lengths : numpy.ndarray
        Row count of each sequence, in the order the sequences first appear
        in ``df``, so that it lines up with the rows of ``X``.
    """
    if feature_vars is None:
        feature_vars = all_feature_vars

    y = np.asarray(df['manualLabel']).astype('<U5')

    # sort=False keeps first-appearance order; the default sorts the group
    # keys, which misaligns `lengths` with the rows of X whenever `df` is not
    # already sorted by sequence. size() counts every row, so the lengths
    # always sum to len(df) even if a label is missing.
    lengths = np.asarray(df.groupby('sequence', sort=False).size())

    X = csr_matrix(np.asarray(df[feature_vars]))
    return X, y, lengths

In [539]:
# create training and testing sets
# note that we split the data by sequence id so as to not break up sequences between train and test

# Seed the global RNG so the same 20 sequences are held out on every run.
np.random.seed(42)

all_indices = list(range(1,101))
test_indices = np.random.choice(all_indices, 20, replace=False)
held_out = set(test_indices.tolist())
train_indices = [index for index in all_indices if index not in held_out]

# Build the masks from small_features itself. The original built them from
# labelled_features and relied on pandas aligning a longer boolean Series
# onto the filtered frame's index.
test = small_features.loc[small_features['sequence'].isin(test_indices)]
train = small_features.loc[small_features['sequence'].isin(train_indices)]
print('test: ', test.shape)
print('train: ', train.shape)

test:  (123, 22)
train:  (480, 22)


In [540]:
# Convert the train split, then the test split, into (X, y, lengths) triples.
(X_train, y_train, lengths_train), (X_test, y_test, lengths_test) = (
    split_stuff(subset) for subset in (train, test)
)

In [541]:
# Sanity check: shapes of the training inputs.
for tag, part in [('X', X_train), ('y ', y_train), ('lengths ', lengths_train)]:
    print(tag, part.shape)

X (480, 16)
y  (480,)
lengths  (80,)


In [542]:
# Sanity check: shapes of the test inputs.
for tag, part in [('X', X_test), ('y ', y_test), ('lengths ', lengths_test)]:
    print(tag, part.shape)

X (123, 16)
y  (123,)
lengths  (20,)


In [None]:
def namestr(obj, namespace):
    """Return every key in `namespace` whose value is the very object `obj`.

    Matching is by identity (`is`), not equality, and the keys come back in
    the namespace's own iteration order.
    """
    matches = []
    for key in namespace:
        if namespace[key] is obj:
            matches.append(key)
    return matches

In [564]:
%%time
# Repeated random-split evaluation: hold out 20 sequences, fit a
# MultinomialHMM on the rest, and record accuracy plus per-class
# precision / recall / F1 for each split.
N_SPLITS = 10000
# Pin the class order so index 0 is always 'n' and index 1 is always 'p'.
# Without `labels=`, sklearn returns one score per class actually present,
# so a split containing a single class made p[1] / r[1] / f1[1] raise
# IndexError.
CLASS_LABELS = ['n', 'p']

accuracies = []
precision_0 = []
precision_1 = []
recall_0 = []
recall_1 = []
f1_0 = []
f1_1 = []
f1s = []

# Seed once so the whole sequence of random splits is reproducible.
np.random.seed(42)

for split_number in range(N_SPLITS):

    test_indices = np.random.choice(all_indices, 20, replace=False)
    held_out = set(test_indices.tolist())
    train_indices = [index for index in all_indices if index not in held_out]

    # Masks are built from small_features itself rather than relying on index
    # alignment with the unfiltered labelled_features frame.
    test = small_features.loc[small_features['sequence'].isin(test_indices)]
    train = small_features.loc[small_features['sequence'].isin(train_indices)]

    X_train, y_train, lengths_train = split_stuff(train)
    X_test, y_test, lengths_test = split_stuff(test)

    clf = MultinomialHMM()
    clf.fit(X_train, y_train, lengths_train)

    y_pred = clf.predict(X_test, lengths_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    p = precision_score(y_test, y_pred, labels=CLASS_LABELS, average=None)
    precision_0.append(p[0])
    precision_1.append(p[1])

    f1 = f1_score(y_test, y_pred, labels=CLASS_LABELS, average=None)
    f1_0.append(f1[0])
    f1_1.append(f1[1])
    # Unweighted mean of the two per-class F1 scores.
    f1s.append(np.mean(f1))

    r = recall_score(y_test, y_pred, labels=CLASS_LABELS, average=None)
    recall_0.append(r[0])
    recall_1.append(r[1])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


CPU times: user 2min 19s, sys: 1.1 s, total: 2min 20s
Wall time: 2min 24s


In [571]:
# Print mean and standard deviation of every metric collected across splits.
# Names are paired with their lists explicitly. The original looked the name
# up with namestr(metric, globals())[1], which depends on the insertion order
# of the kernel's globals; on a fresh kernel index 1 is 'metric' itself.
named_metrics = [
    ('accuracies', accuracies),
    ('precision_0', precision_0),
    ('precision_1', precision_1),
    ('recall_0', recall_0),
    ('recall_1', recall_1),
    ('f1_0', f1_0),
    ('f1_1', f1_1),
    ('f1s', f1s),
]
for name, metric in named_metrics:
    print(name)
    print('Mean: ', np.mean(metric))
    print('Std Dev: ',np.std(metric))

accuracies
Mean:  0.7823664506184783
Std Dev:  0.07174002850740326
precision_0
Mean:  0.0
Std Dev:  0.0
precision_1
Mean:  0.7823678234807778
Std Dev:  0.0717421333785864
recall_0
Mean:  0.0
Std Dev:  0.0
recall_1
Mean:  0.9999983050847459
Std Dev:  0.00016948305063558216
f1_0
Mean:  0.0
Std Dev:  0.0
f1_1
Mean:  0.8760505584645453
Std Dev:  0.04593093091383474
f1s
Mean:  0.43802527923227264
Std Dev:  0.02296546545691737


In [588]:
# Class balance of the filtered data set. The original iterated over `y`,
# which no cell shown here defines at notebook scope, so it only worked with
# leftover kernel state. The labels are read from small_features instead.
labels = small_features['manualLabel']
negs = [label for label in labels if label == 'n']
pos = [label for label in labels if label == 'p']

In [589]:
# Number of labels equal to 'n'.
len(negs)

131

In [590]:
# Number of labels equal to 'p'.
len(pos)

472

In [591]:
# Total number of 'n' and 'p' labels, computed from the lists rather than
# from numbers copied out of earlier outputs.
len(negs) + len(pos)

603

In [592]:
# Fraction of 'n' labels among all 'n' and 'p' labels, computed from the
# lists so it stays correct if the data changes.
len(negs) / (len(negs) + len(pos))

0.21724709784411278

In [593]:
# Rows removed by the 'p'/'n' filter: all labelled rows minus the kept rows,
# computed from the frames rather than from hard-coded counts.
len(labelled_features) - len(small_features)

189