In [2]:
from __future__ import unicode_literals
from __future__ import print_function
import glob, os, sys,re
import numpy as np
import spacy
from sklearn.utils import shuffle

reload(sys)
sys.setdefaultencoding('utf8')

nlp = spacy.load('en')

import sys,json,gzip,random,re,math
from collections import Counter
import numpy as np
from sklearn.cross_validation import KFold
from sklearn import linear_model
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV

from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.contrib.rnn import BasicLSTMCell
from tensorflow.python.ops.rnn import dynamic_rnn

<h2> Load in files from the Liu et al. 2017 dataset and read vocab </h2>

In [3]:
def get_liu_transcript_paths():
    pathname="/data/corpora/ted/transcripts_clean/"
    files = [pathname + f for f in os.listdir(pathname)]
    talk_names_path = "talk_names.txt"
    talk_names = open(talk_names_path).read().replace('.txt','.html').split('\n')
    talk_names = ['isabel_allende_how_to_live_passionately_no_matter_your_age.html' if t == 'isabelle_allende_how_to_live_passionately_no_matter_your_age.html' else t for t in talk_names]
    talk_names = [t for t in talk_names if t not in ['', 'test.html', '\r']]
    transcripts = [pathname + t for t in talk_names]
    return(transcripts)

transcripts = get_liu_transcript_paths()
len(transcripts)

904

In [4]:
def readLIWC(filename):
    liwc_vocab={}
    regex_liwc={}
    liwc={}
    
    # Liu excludes "affective or emotional processes"
    invalid={}
    invalid["31"]=1
    invalid["32"]=1
    invalid["33"]=1
    invalid["34"]=1
    invalid["35"]=1

    file=open(filename)
    for i in range(75):
        line=file.readline()
        cols=re.split("\s+", line.rstrip())
        if line.rstrip() != "%":
            idd="%s" % cols[0]
            label=cols[1]
            liwc_vocab[idd]=label
    for line in file:
        cols=line.rstrip().split("\t")
        term=cols[0]
        valid=[]
        for x in cols[1:]:
            if x not in invalid:
                valid.append(x)
        cats=["LIWC_%s" % liwc_vocab[x] for x in valid]

        if term.endswith("*"):
            pref=term[0:2]
            if pref not in regex_liwc:
                regex_liwc[pref]={}
            regex_liwc[pref][term]=cats
        else:
            liwc[term]=cats
            
    return (liwc_vocab, regex_liwc, liwc)

# get all flat LIWC categories for a word
def getLIWC(word):
    vals=[]
    if word in liwc:
        vals.extend(liwc[word])
    if len(word) > 1:
        pref=word[0:2]
        if pref in regex_liwc:
            cands=regex_liwc[pref]
            for cand in cands:
                if re.match(cand, word) != None:
                    vals.extend(regex_liwc[pref][cand])
    return vals

def get_words(transcript_path):
    words = []
    text = open(transcript_path).read()
    lines = text.split("\n")
    lines = [l for l in lines if l != '']
    paragraphs = [line.split("\t")[1] for line in lines]
    paragraphs = [p.replace("(Laughter)","").replace("(Applause)","") for p in paragraphs]
    for p in paragraphs:
        words += p.split(' ')
    return words
    

def get_vocab(transcript_path_list):
    vocab = {}
    
    words = []
    for t in transcript_path_list:
        words += get_words(t)
        #words += t.get_words()
        
    fid = 1
    
    for key in liwc_vocab:
        feat="LIWC_%s" % (liwc_vocab[key])
        vocab[feat]=fid
        fid+=1
        
    counts=Counter()
    
    for word in words:
        counts[word.lower()]+=1
    
    for word in counts:
        count=counts[word]
        #if count >= 500:
        if count >= 500000:
            vocab[word.lower()]=fid
            fid+=1
            
    return vocab

In [5]:
liwc_path = '/data/corpora/LIWC/LIWC.txt'
liwc_vocab, regex_liwc, liwc = readLIWC(liwc_path)
vocab = get_vocab(transcripts)

<h2> Create "Transcript" object to compute features from text files </h2>

In [9]:
class Transcript:
    def __init__(self, filename):
        self.filename = filename
        self.sequences = self.get_sequences()
        self.sentences = self.get_sentence_list()
        
    def get_text(self):
        return open(self.filename).read()
    
    def get_split_lines(self):
        text = self.get_text()
        lines = text.split("\n")
        lines = [l for l in lines if l != '']
        return [line.split("\t") for line in lines]
    
    def get_paragraphs(self):
        split_lines = self.get_split_lines()
        return [l[1] for l in split_lines]
        
    def get_timestamps(self):
        split_lines = self.get_split_lines()
        return [l[0] for l in split_lines]
        
    def get_sentences(self, paragraph):
        #par = unicode(paragraph).encode("utf-8")
        paragraph = paragraph.replace('(Laughter)','')
        sentences = []
        doc = nlp(paragraph)
        for sentence in doc.sents:
            words=[]
            for word in sentence:
                if re.search("\S", word.string) != None:
                    words.append(word.string)
            text=' '.join(words)
            if re.match("^\(.*?\)$", text) != None or re.search("\w", text) == None:
                continue
            sentences.append(text)
        return sentences
    
    def get_words(self):
        words = []
        paragraphs = [p.replace("(Laughter)","").replace("(Applause)","") for p in self.get_paragraphs()]
        for p in paragraphs:
            words += p.split(' ')
        return words
    
    def get_sentences_with_applause_interspersed(self, paragraph):
        paragraph = paragraph.replace('(Laughter)','')
        lines = paragraph.split("(Applause)")
        last_line = lines[-1]; lines = lines[0:-1]  # Don't add applause after last line
        sentences = []
        for line in lines:
            sentences += self.get_sentences(line)
            sentences.append("(Applause)")
        sentences += self.get_sentences(last_line)
        return sentences
      
    def get_sequences(self):
        sequences = []
        split_lines = self.get_split_lines()
        for line in split_lines:
            if "(Applause)" in line[1] and line[1] != "(Applause)":
                timestamp = line[0]; paragraph = line[1]
                sentences = self.get_sentences_with_applause_interspersed(paragraph)
                sequences.append((timestamp, paragraph, sentences))
                #special_cases.append(line[1])
            elif line[1] == "(Applause)":
                timestamp = line[0]; paragraph = line[1]; sentences = ["(Applause)"]
                sequences.append((timestamp, paragraph, sentences))
            else:
                timestamp = line[0]; paragraph = line[1]; sentences = self.get_sentences(paragraph)
                sequences.append((timestamp, paragraph, sentences))
        return sequences
    
    def get_sentence_list(self):
        all_sentences = []
        seqs = self.sequences[0:-1] #exclude last paragraph, which may be final applause
        for sequence_index, sequence in enumerate(seqs): 
            sentences = sequence[2]
            for sentence_index, s in enumerate(sentences):
                if s != "(Applause)":
                    features = np.concatenate((self.get_dense_liwc_features(s),self.get_word_vec_features(s)))
                    # If this is not the last sentence in the seq, check if next sentence in this sequence is applause
                    if sentence_index < len(sentences)-1 and sentences[sentence_index+1] == "(Applause)":
                        applause_follows = 1
                    # If this is the last sentence of this sequence, check first sentence of next sequence
                    elif sentence_index == len(sentences) -1 and sequence_index < len(seqs) - 1:
                        next_sentences = seqs[sequence_index+1][2]
                        if len(next_sentences) > 0 and next_sentences[0] == "(Applause)":
                            applause_follows = 1
                        else:
                            applause_follows = 0    
                    else:
                        applause_follows = 0
                    all_sentences.append((s,features,applause_follows))
        return all_sentences
    
    def count_applause_instances(self):
        c = 0
        sentence_list = self.sentences
        for s in sentence_list:#[0:-1]:
            if s[2] == 1:
                c += 1
        return c
    
    def get_applause_yes_sentences(self):
        return [s for s in self.sentences if s[2] == 1]
    
    def get_applause_no_sentences(self):
        no_sentences = [s for s in self.sentences if s[2] == 0]
        shuffle(no_sentences)
        return no_sentences[0:len(self.get_applause_yes_sentences())]
    
    def get_sparse_liwc_features(self, sentence):
        counts = {}
        text = sentence.lower().split(' ')
        for word in text:
            cats=getLIWC(word)
            for cat in cats:
                if vocab[cat] in counts:
                    counts[vocab[cat]] += 1  # = 1
                else:
                    counts[vocab[cat]] = 1.
            if word in vocab:
                if vocab[word] in counts:
                    counts[vocab[word]] += 1  # = 1
                else:
                    counts[vocab[word]] = 1
        return counts
    
    def get_dense_liwc_features(self,sentence):
        a = np.zeros(len(vocab))
        sparse_feats = self.get_sparse_liwc_features(sentence)
        for k in sparse_feats.keys():
            a[k-1] = float(sparse_feats[k]) / len(sentence)
        return a
        
    def get_word_vec_features(self,sentence):
        return nlp(sentence).vector

In [10]:
# Store objects so we don't have to recompute features 

import pickle

def save_transcripts(filename="transripts.pkl"):
    with open(filename, "wb") as f:
        pickle.dump(all_transcripts, f)

def load_transcripts(filename="transcripts.pkl"):
    with open(filename, 'rb') as f:
        return pickle.load(f)

transcripts_path = 'transcripts.pkl'

if os.path.exists(transcripts_path):
    all_transcripts = load_transcripts(transcripts_path)
    
else:
    all_transcripts = [Transcript(f) for f in transcripts]
    save_transcripts(transcripts_path)
    
all_transcripts = shuffle(all_transcripts)

<h2> Run Logistic Regression Models </h2>

In [23]:
from sklearn.cross_validation import KFold
from sklearn import linear_model
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support

def get_balanced_applause_features(all_transcripts):
    applause_yes = []
    applause_no = []
    for t in all_transcripts:
        for a in t.get_applause_yes_sentences():
            applause_yes.append(a[1])
        for a in t.get_applause_no_sentences():
            applause_no.append(a[1])
    applause_yes = np.array(applause_yes)
    applause_no = np.array(applause_no)
    return applause_yes, applause_no

def get_unbalanced_applause_features(all_transcripts):
    applause_yes = []
    applause_no = []
    for t in all_transcripts:
        for a in t.get_applause_yes_sentences():
            applause_yes.append(a[1])
        for a in t.sentences:
            if a[2] == 0:
                applause_no.append(a[1])
    applause_yes = np.array(applause_yes)
    applause_no = np.array(applause_no)
    return applause_yes, applause_no

def get_balanced_train_data_and_labels(all_transcripts):
    applause_yes, applause_no = get_balanced_applause_features(all_transcripts)
    X = np.vstack([applause_yes,applause_no])
    Y = np.concatenate([np.ones(len(applause_yes)),np.zeros(len(applause_no))])
    return X, Y

def get_unbalanced_train_data_and_labels(all_transcripts):
    applause_yes, applause_no = get_unbalanced_applause_features(all_transcripts)
    X = np.vstack([applause_yes,applause_no])
    Y = np.concatenate([np.ones(len(applause_yes)),np.zeros(len(applause_no))])
    return X, Y

<h3> Logistic Regression on balanced data following Liu et al. 2017</h3>

In [19]:
total_true = []
total_pred = []

X,Y = get_balanced_train_data_and_labels(all_transcripts)
X,Y = shuffle(X,Y)
kf = KFold(len(Y), n_folds=10)

for train_index, test_index in kf:
    X_train, X_test = X[train_index][:,73:], X[test_index][:,73:]
    y_train, y_test = Y[train_index], Y[test_index]
    
    logreg=linear_model.LogisticRegression(penalty='l1',C=4)
    clf = logreg
    #clf = GridSearchCV(logreg, {'C':(0.001, .01, .1, 1, 3, 4, 5)}, cv=3)
    #clf = GridSearchCV(logreg, {'C':(3.5,4,4.5)}, cv=3)
    clf.fit(X_train, y_train)
    
    #print(clf.best_params_, len(y_train))
    y_true, y_pred = y_test, clf.predict(X_test)
    for i in range(len(y_true)):
        
        total_true.append(y_true[i])
        total_pred.append(y_pred[i])

total_correct = np.sum(np.array(total_true) == np.array(total_pred))
acc=float(total_correct) / len(total_true)
std=math.sqrt( (acc * (1-acc)) / len(total_true) )
str("Accuracy: %.3f +/- %.3f (%s/%s)" % (acc, 1.96*std, total_correct, len(total_true)))

'Accuracy: 0.722 +/- 0.014 (2826/3912)'

In [20]:
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(total_true, total_pred)]
str("Precision: %.3f | Recall: %.3f | F1: %.3f" % (precision, recall, f1))

'Precision: 0.747 | Recall: 0.673 | F1: 0.708'

In [None]:
clf.best_params_

<h3> Run cross validation again with clean split on full talks rather than sentences </h3>

In [25]:
total_true = []
total_pred = []

kf = KFold(len(all_transcripts), n_folds=10)

for train_index, test_index in kf:
    t_train, t_test = np.array(all_transcripts)[train_index], np.array(all_transcripts)[test_index]
    X_train, y_train = get_balanced_train_data_and_labels(t_train)
    X_test, y_test = get_balanced_train_data_and_labels(t_test)

    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)

    X_train, X_test = X_train[:,73:], X_test[:,73:]
    
    logreg=linear_model.LogisticRegression(penalty='l1',C=3)
    clf = logreg

    #clf = GridSearchCV(logreg, {'C':(3,3.5,4,4.5)}, cv=3)
    clf.fit(X_train, y_train)
    
    y_true, y_pred = y_test, clf.predict(X_test)
    for i in range(len(y_true)):
        
        total_true.append(y_true[i])
        total_pred.append(y_pred[i])

total_correct = np.sum(np.array(total_true) == np.array(total_pred))
acc=float(total_correct) / len(total_true)
std=math.sqrt( (acc * (1-acc)) / len(total_true) )
str("Accuracy: %.3f +/- %.3f (%s/%s)" % (acc, 1.96*std, total_correct, len(total_true)))

'Accuracy: 0.710 +/- 0.014 (2778/3912)'

In [26]:
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(total_true, total_pred)]
str("Precision: %.3f | Recall: %.3f | F1: %.3f" % (precision, recall, f1))

'Precision: 0.732 | Recall: 0.664 | F1: 0.696'

In [28]:
#clf.best_params_

<h3> Train on balanced subset, test on full talks </h3>

In [596]:
total_true = []
total_pred = []

for train_index, test_index in kf:
    t_train, t_test = np.array(all_transcripts)[train_index], np.array(all_transcripts)[test_index]
    X_train, y_train = get_balanced_train_data_and_labels(t_train)
    X_test, y_test = get_unbalanced_train_data_and_labels(t_test)

    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)

    X_train, X_test = X_train[:,73:], X_test[:,73:]
    
    logreg=linear_model.LogisticRegression(penalty='l1')
    clf = logreg

    clf = GridSearchCV(logreg, {'C':(3,3.5,4,4.5)}, cv=3)
    clf.fit(X_train, y_train)
    
    y_true, y_pred = y_test, clf.predict(X_test)
    for i in range(len(y_true)):
        
        total_true.append(y_true[i])
        total_pred.append(y_pred[i])

total_correct = np.sum(np.array(total_true) == np.array(total_pred))
acc=float(total_correct) / len(total_true)
std=math.sqrt( (acc * (1-acc)) / len(total_true) )
str("Accuracy: %.3f +/- %.3f (%s/%s)" % (acc, 1.96*std, total_correct, len(total_true)))

'Accuracy: 0.553 +/- 0.003 (71471/129189)'

In [597]:
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(total_true, total_pred)]
str("Precision: %.3f | Recall: %.3f | F1: %.3f" % (precision, recall, f1))

'Precision: 0.022 | Recall: 0.663 | F1: 0.043'

In [598]:
clf.best_params_

{u'C': 4.5}

<h3> Train on full talks, test on full talks </h3>
This takes a while

In [599]:
total_true = []
total_pred = []

for train_index, test_index in kf:
    t_train, t_test = np.array(all_transcripts)[train_index], np.array(all_transcripts)[test_index]
    X_train, y_train = get_unbalanced_train_data_and_labels(t_train)
    X_test, y_test = get_unbalanced_train_data_and_labels(t_test)

    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)

    X_train, X_test = X_train[:,73:], X_test[:,73:]
    
    logreg=linear_model.LogisticRegression(penalty='l1')
    clf = logreg
    clf = GridSearchCV(logreg, {'C':(0.001, .01, .1, 1, 3, 5)}, cv=3)
    #clf = GridSearchCV(logreg, {'C':(3,3.5,4,4.5)}, cv=3)
    clf.fit(X_train, y_train)
    
    y_true, y_pred = y_test, clf.predict(X_test)
    for i in range(len(y_true)):      
        total_true.append(y_true[i])
        total_pred.append(y_pred[i])

total_correct = np.sum(np.array(total_true) == np.array(total_pred))
acc=float(total_correct) / len(total_true)
std=math.sqrt( (acc * (1-acc)) / len(total_true) )
str("Accuracy: %.3f +/- %.3f (%s/%s)" % (acc, 1.96*std, total_correct, len(total_true)))

'Accuracy: 0.985 +/- 0.001 (127233/129189)'

In [617]:
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(total_true, total_pred)]
str("Precision: %.3f | Recall: %.3f | F1: %.3f" % (precision, recall, f1))

'Precision: 0.000 | Recall: 0.000 | F1: 0.000'

In [601]:
clf.best_params_

{u'C': 0.001}

<h1> Train LSTM over series of 5 Sentences </h1>

In [11]:
def get_applause_indices(transcript):
    a = [s[2] for s in transcript.sentences]
    return list(np.nonzero(a)[0])

def get_non_applause_indices(transcript):
    return [index for index, s in enumerate(transcript.sentences) if s[2] == 0]

def extract_features_from_sentence_list(l):
    return np.array([s[1] for s in l])

def extract_applause_sequences(transcript,num_sentences):
    applause_indices = [i for i in get_applause_indices(transcript) if i > num_sentences-1]
    sentence_lists = [transcript.sentences[i-num_sentences+1:i+1] for i in applause_indices]
    feature_lists = [extract_features_from_sentence_list(s) for s in sentence_lists]
    return np.array(feature_lists)

def extract_non_applause_sequences(transcript,num_sentences,n):
    non_applause_indices = [i for i in get_non_applause_indices(transcript) if i > num_sentences-1]
    non_applause_indices = shuffle(non_applause_indices)[0:n]
    sentence_lists = [transcript.sentences[i-num_sentences+1:i+1] for i in non_applause_indices]
    return np.array([extract_features_from_sentence_list(s) for s in sentence_lists])

In [12]:
n_features = 373
n_categories = 1
n_hidden=200
num_sentences = 5
number_of_layers=1
learning_rate = 0.1
batch_size = 10

<h3> Extract applause with a context of 4 previous sentences and train LSTM on balanced data </h3>

In [13]:
def get_balanced_lstm_applause_features(all_transcripts, num_sentences):
    applause_yes = None
    applause_no = None
    for t in all_transcripts:
        positives = extract_applause_sequences(t,num_sentences)
        negatives = extract_non_applause_sequences(t,num_sentences,len(positives))
        if len(positives) != 0:
            if applause_yes is None:
                applause_yes = positives
                applause_no = negatives
            else:
                applause_yes = np.vstack([applause_yes, positives])
                applause_no = np.vstack([applause_no, negatives])
    return applause_yes, applause_no

def format_label(l):
    return np.array(list(np.zeros(num_sentences-1)) + [l]).reshape((num_sentences,1))

def train_tf(train_data, train_labels):
    for i in range(len(train_data)):
        sess.run(optimizer,feed_dict={x:train_data[i],y:train_labels[i].reshape((5,1)),seq_length:num_sentences})
        
def evaluate_on_data(test_data, test_labels):
    num_correct = 0
    for i in range(len(test_data)):
        num_correct += sess.run(accuracy,feed_dict={x:test_data[i],y:test_labels[i].reshape((5,1)),seq_length:num_sentences})
    return(num_correct / len(test_data))

def predict_on_data(test_data):
    return [np.round(sess.run(prediction,feed_dict={x:test_data[i],seq_length:num_sentences}))[0][0] for i in range(len(test_data))]

def get_balanced_lstm_features_and_labels(all_transcripts, num_sentences):
    applause_yes, applause_no = get_balanced_lstm_applause_features(all_transcripts, num_sentences)
    X = np.vstack([applause_yes,applause_no])
    Y = np.concatenate([np.ones(len(applause_yes)),np.zeros(len(applause_no))])
    Y = np.array([format_label(l) for l in Y]).reshape((len(Y),-1))
    return X, Y

In [14]:
# Set up tensorflow graph

tf.reset_default_graph()
x = tf.placeholder(tf.float32, [None, n_features])
y = tf.placeholder(tf.float32, [None, n_categories])
seq_length = tf.placeholder(tf.int32)

weights = {'out': tf.Variable(tf.random_normal([n_hidden, n_categories])),}
biases = {'out': tf.Variable(tf.random_normal([n_categories]))}

fc1 = tf.reshape(x, [1, -1, n_features])
lstm_fw_cell = BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_fw_multicell = tf.contrib.rnn.MultiRNNCell([lstm_fw_cell] * number_of_layers, state_is_tuple=True)

output_fw, state_fw = tf.nn.dynamic_rnn(lstm_fw_multicell,fc1,dtype='float32',sequence_length=tf.reshape(seq_length, [1]))
outputs_fw = tf.reshape(output_fw, [seq_length, n_hidden])
out = tf.matmul(outputs_fw, weights['out']) + biases['out']
pred = tf.gather(out,[num_sentences-1])
end_label = tf.gather(y,[num_sentences-1])
#end_label = tf.reshape(tf.gather(tf.reshape(y,[-1]),[num_sentences-1]),[1,1])

cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=end_label)
cost2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=out, labels=y))
optimizer = tf.train.AdamOptimizer().minimize(cost)#(learning_rate=0.1, l2_regularization_strength=0.1).minimize(cost)
#optimizer = tf.train.FtrlOptimizer(learning_rate=learning_rate, l2_regularization_strength=1.0).minimize(cost)
prediction = tf.sigmoid(pred)
correct_pred = tf.equal(tf.greater_equal(prediction,0.5), tf.equal(end_label,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [15]:
#kf = KFold(len(all_transcripts), n_folds=10)

In [17]:
total_true = []
total_pred = []

for train_index, test_index in kf:
    t_train, t_test = np.array(all_transcripts)[train_index], np.array(all_transcripts)[test_index]
    
    X_train, y_train = get_balanced_lstm_features_and_labels(t_train, num_sentences=5)
    X_test, y_test = get_balanced_lstm_features_and_labels(t_test, num_sentences=5)
    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)
    #X_train, X_test = X_train[:,:,73:], X_test[:,:,73:]
    
    best_acc = 0
    best_weights = np.zeros(n_hidden)
    best_bias = 0.
    val_index = int(len(X_train)*0.9)
    val_data, val_labels = (X_train[val_index:], y_train[val_index:])
    train_data, train_labels = (X_train[0:val_index], y_train[0:val_index])
    
    max_epochs = 10
    sess.run(init)
    for i in range(max_epochs):
        train_data, train_labels = shuffle(train_data, train_labels)
        train_tf(train_data, train_labels)
        acc = evaluate_on_data(val_data, val_labels)
        if acc > best_acc: 
            best_acc = acc
            best_weights = sess.run(weights['out'])
            best_bias = sess.run(biases['out'])
        
    assign_weights_op = tf.assign(weights['out'], best_weights)
    assign_bias_op = tf.assign(biases['out'], best_bias)
    sess.run(assign_weights_op)
    y_true, y_pred = y_test[:,num_sentences-1], predict_on_data(X_test)
    
    for i in range(len(y_true)):      
        total_true.append(y_true[i])
        total_pred.append(y_pred[i])

total_correct = np.sum(np.array(total_true) == np.array(total_pred))
acc=float(total_correct) / len(total_true)
std=math.sqrt( (acc * (1-acc)) / len(total_true) )
str("Accuracy: %.3f +/- %.3f (%s/%s)" % (acc, 1.96*std, total_correct, len(total_true)))

KeyboardInterrupt: 

In [None]:
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(total_true, total_pred)]
str("Precision: %.3f | Recall: %.3f | F1: %.3f" % (precision, recall, f1))

<h2> Implement LSTM again in Keras to compare </h2>

In [18]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution2D, MaxPooling2D, Flatten, Dropout
import keras.optimizers
from keras.models import load_model
import keras.regularizers
from keras.regularizers import l2, l1

In [19]:
from keras.layers import LSTM
from keras.layers import regularizers
from keras.layers import Bidirectional

def initialize_lstm_model():
    model = Sequential()
    model.add(LSTM(n_hidden,input_shape=(None,373),kernel_regularizer=regularizers.l1(0.),return_sequences=False))
    #model.add(Bidirectional(LSTM(100,  kernel_regularizer=regularizers.l2(0.0001), return_sequences=False),input_shape=(None, 300)))
    #model.add(Dropout(0.5))
    model.add(Activation("relu"))
    #model.add(LSTM(50,return_sequences=False))
    #model.add(Dropout(0.5))
    #model.add(Activation("relu"))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    optimizer = keras.optimizers.Adam()
    model.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return model

In [20]:
#train_data_2 = np.array([t [3:,:] for t in train_data])
#test_data_2 = np.array([t [3:,:] for t in test_data])
#train_data_2.shape

In [21]:
#Keras model

#tf.reset_default_graph()

total_true = []
total_pred = []

for train_index, test_index in kf:  
    t_train, t_test = np.array(all_transcripts)[train_index], np.array(all_transcripts)[test_index]
    
    X_train, y_train = get_balanced_lstm_features_and_labels(t_train, num_sentences=5)
    X_test, y_test = get_balanced_lstm_features_and_labels(t_test, num_sentences=5)
    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)
    
    # Don't include labels for the whole sequence as we're only predicting at the end of the sequence
    y_train = np.array([l[-1] for l in y_train])
    y_test = np.array([l[-1] for l in y_test])
    
    best_acc = 0
    m = initialize_lstm_model()
    val_index = int(len(X_train)*0.9)
    val_data, val_labels = (X_train[val_index:], y_train[val_index:])
    train_data, train_labels = (X_train[0:val_index], y_train[0:val_index])
    
    max_epochs = 30; e = 0
    while e < max_epochs:                            
        m.fit(X_train[0:val_index], y_train[0:val_index], nb_epoch=1,batch_size=1,shuffle=True)
        acc = m.evaluate(val_data, val_labels)[1]
        if acc > best_acc: 
            best_acc = acc
            m.save('best_model.h5')
        e+=1
        
    m = load_model('best_model.h5')
    
    y_true, y_pred = y_test, np.round(m.predict(X_test))
    for i in range(len(y_true)):
        total_true.append(y_true[i])
        total_pred.append(y_pred[i][0])
    
total_correct = np.sum(np.array(total_true) == np.array(total_pred))
acc=float(total_correct) / len(total_true)
std=math.sqrt( (acc * (1-acc)) / len(total_true) )
str("Accuracy: %.3f +/- %.3f (%s/%s)" % (acc, 1.96*std, total_correct, len(total_true)))

'Accuracy: 0.632 +/- 0.015 (2418/3828)'

In [24]:
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(total_true, total_pred)]
str("Precision: %.3f | Recall: %.3f | F1: %.3f" % (precision, recall, f1))

'Precision: 0.662 | Recall: 0.537 | F1: 0.593'