In [1]:
import pandas as pd
import random
import numpy as np
import networkx as nx
import nltk
import tensorflow as tf
import collections
import math
import os.path
from six.moves import xrange
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import Birch
import  glob
from pprint import pprint
import pickle
import os
import time
import datetime

In [2]:
Doc = collections.namedtuple('Doc',['investigator', 'amount', 'abstract', 'p_num', 'p_name'])
def load2(fn, i):
    
    doc = nltk.data.load(fn)
    wrds = nltk.tokenize.wordpunct_tokenize(doc)
    try:
        investigator = " ".join(wrds[wrds.index("Investigator")+2:
                          min(wrds.index('@' if '@' in wrds else "Abstract", 
                                          wrds.index("Investigator"))-1, 
                              wrds.index('(', wrds.index("Investigator")))])
    except ValueError:
        return load2(filenames[i+1], i+1)
    amount = int(wrds[wrds.index("Amt") + 4: wrds.index('(', wrds.index("Amt"))][0])
    pg = wrds[wrds.index('Program', wrds.index('Sponsor'))+2: wrds.index('Fld')]
    abstract = " ".join(wrds[wrds.index('Abstract')+2:])
    try:
        return Doc(investigator, amount, abstract, pg[0], " ".join(pg[1:]))
    except ValueError:
        pg = wrds[wrds.index(':', wrds.index('Program'))+1: wrds.index('Fld', wrds.index('Program'))]
        return Doc(investigator, amount, abstract, pg[0], " ".join(pg[1:]))


In [218]:
class TextCNN(object):
    
    def __init__(self, sequence_length, num_classes, vocab_size,
                embeddings_size, filter_sizes, num_filters, embeds,
                l2_reg_lambda):
        
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        
        l2_loss = tf.constant(0.0)
        
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(tf.constant(embeds), name='W')
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        
        
        pooled_outputs = []
        
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                filter_shape = [filter_size, embeddings_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=.1), name="W")
                b = tf.Variable(tf.constant(.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded, W, strides=[1,1,1,1], 
                    padding="VALID", name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                pooled = tf.nn.max_pool(
                    h, ksize=[1,sequence_length - filter_size + 1, 1, 1],
                    strides=[1,1,1,1], padding="VALID", name="pool")
                pooled_outputs.append(pooled)
        
        num_filters_total = num_filters * len(filter_sizes)
        self.pooled_outputs = pooled_outputs
        self.h_pool = tf.concat(3, pooled_outputs),
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
        
        with tf.name_scope("output"):
            W = tf.Variable(
                tf.truncated_normal([num_filters_total, num_classes], stddev=.1), name="W")
            b = tf.Variable(tf.constant(.1, shape=[num_classes]), name="b")
            
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses)
        
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
    
    

In [244]:
def train(embeddings, dictionary, data, batch_size, seq_len, dropout_keep_prob):
    with tf.Graph().as_default():
        
        session = tf.Session()
        with session.as_default():
            cnn = TextCNN(
            sequence_length = seq_len,
            num_classes=len(unique),
            vocab_size = embeddings.shape[0],
            embeddings_size = embeddings.shape[1],
            filter_sizes=[3,4,5],
            num_filters=128, embeds=embeddings,
             l2_reg_lambda=.2)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(.0001)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.merge_summary(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.scalar_summary("loss", cnn.loss)
            acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.all_variables())
            
            #session.run(tf.initialize_all_variables())
            saver.restore(session, './runs/1481476533/checkpoints/model-100')

            def train_step(x_batch, y_batch, dropout_keep_prob):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: dropout_keep_prob
                }
               
                _, step, summaries, loss, accuracy = session.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            
            def dev_step(x_batch, y_batch, writer=None):
                feed_dict = {
                  cnn.input_x: x_batch,
                  cnn.input_y: y_batch,
                  cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = session.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            for i in range(2):
                batches = batcher(data, dictionary, batch_size, seq_len)

                for batch in batches:
                    x_batch, y_batch = batch['x'], batch['y']
                    train_step(x_batch, y_batch, dropout_keep_prob)
                    current_step = tf.train.global_step(session, global_step)
                    
                    if current_step % 50 == 0:
                        print("\nEvaluation:")
                        dev_set = data.sample(frac=.1)
                        dev_batch = dev_batcher(dev_set, dictionary, seq_len)
                        x_dev, y_dev = dev_batch['x'], dev_batch['y']
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)
                        print("")
                    if current_step == 100:
                        path = saver.save(session, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))

In [None]:
def complete(network, dictionary, data):
    data_c = list([data[0]])
    for index, clust in enumerate(data[:-1]):
            i = 0
            ed = KB.get_edge_data(clust, data[index+1])
            while ed and i < 5:
                data_c.append(ed['number'])
                ed = KB.get_edge_data(ed['number'], data[index+1])
                i += 1
            data_c.append(data[index+1])
    return data_c

In [225]:
unique = pd.unique(joint.p_num)
label_dict = {unique[i]:i for i in range(len(unique))}

In [234]:
def dev_batcher(data, dictionary, seq_len):
    def to_dict(x, seq_len):
        bt = []
        for ab in x.values:
            bt += [[dictionary[wrd]  if wrd in dictionary else dictionary['UNK']
                          for wrd in ab.split(" ")][:seq_len]]
        return np.array(bt)
                    
    matrix = np.zeros((len(data), len(unique)))
    for index, p_num in enumerate(data.p_num.values):
        matrix[index][label_dict[p_num]] = 1
    return {'x': to_dict(data.abstract, seq_len), 'y': matrix}

In [129]:
def batcher(data, dictionary, batch_size, seq_len):
    def to_dict(x, seq_len):
        bt = []
        for ab in x.values:
            bt += [[dictionary[wrd]  if wrd in dictionary else dictionary['UNK']
                          for wrd in ab.split(" ")][:seq_len]]
        return np.array(bt)
                    
    i = 0
    for i in range(0, len(data), batch_size):
        c = data.iloc[i:i+batch_size]
        matrix = np.zeros((batch_size, len(unique)))
        for index, p_num in enumerate(c.p_num.values):
            
            matrix[index][label_dict[p_num]] = 1
        yield {'x': to_dict(c.abstract, seq_len), 'y': matrix}

In [7]:
tree = pickle.load(open('rev_dic.pkl', 'rb'))
dictionary = pickle.load(open('dic.pkl', 'rb'))

In [8]:
embeddings = np.load('./fembd.npy')
network = nx.read_gpickle('./network.gpickle')

In [157]:
filenames = [fn for fn in glob.iglob('./text/Part*/*/*/*.txt', recursive=False)]
submission_tups = [load2(filenames[i], i) for i in range(100000)]

In [224]:
submissions = pd.DataFrame(submission_tups, columns = Doc._fields, )
submissions.drop_duplicates(inplace=True)
train_groups = submissions.groupby("p_name")

joint = train_groups.filter(lambda x: len(x) > 1000)
joint = joint.loc[joint.apply(lambda x: len(x['abstract'].split(" ")) > 120, axis=1)]

In [228]:
train_data = joint.iloc[:12000]
test_data = joint.iloc[25000:]

In [231]:
train_data = train_data.sample(frac=1)

In [245]:
test = dev_batcher(train_data.iloc[:100], dictionary, 120)

In [236]:
test = train(embeddings[:31849], dictionary, train_data, 50, 120, .6)

Writing to /home/lenny/Documents/Language/runs/1481477599

2016-12-11T12:33:20.265272: step 101, loss 2.57123, acc 0.2
2016-12-11T12:33:20.555859: step 102, loss 2.44694, acc 0.2
2016-12-11T12:33:20.837377: step 103, loss 2.47892, acc 0.14
2016-12-11T12:33:21.121308: step 104, loss 2.6272, acc 0.16
2016-12-11T12:33:21.407936: step 105, loss 2.67007, acc 0.1
2016-12-11T12:33:21.692658: step 106, loss 2.64082, acc 0.22
2016-12-11T12:33:21.986696: step 107, loss 2.55216, acc 0.22
2016-12-11T12:33:22.280616: step 108, loss 2.47494, acc 0.2
2016-12-11T12:33:22.565416: step 109, loss 2.45269, acc 0.18
2016-12-11T12:33:22.857920: step 110, loss 2.60065, acc 0.14
2016-12-11T12:33:23.207120: step 111, loss 2.38928, acc 0.24
2016-12-11T12:33:23.480412: step 112, loss 2.67184, acc 0.2
2016-12-11T12:33:23.752982: step 113, loss 2.55468, acc 0.2
2016-12-11T12:33:24.030355: step 114, loss 2.72852, acc 0.12
2016-12-11T12:33:24.313245: step 115, loss 2.49087, acc 0.22
2016-12-11T12:33:24.593323: step 

ValueError: invalid literal for int() with base 10: 'y'

In [172]:
elen(list(filter(lambda key: type(key) == str, list(dictionary.keys()))))

31849