In [2]:
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import * 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter, namedtuple
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit


## extra imports to set GPU options
import tensorflow as tf
 
# TensorFlow wizardry
config = tf.ConfigProto()
 
# Don't pre-allocate memory; allocate as-needed
config.gpu_options.allow_growth = True
 
# Only allow a total of half the GPU memory to be allocated
config.gpu_options.per_process_gpu_memory_fraction = 0.4
 
# Create a session with the above options specified.
session = tf.Session(config=config)
tf.keras.backend.set_session(session)

In [3]:
#constants
maxlen = 300
bonus_info_max=10000
max_sentences = 10
MAX_SEQUENCE_LENGTH = 30
vocab_size=40000


In [4]:
#helper functions for loading stuff
from enum import Enum
from os import listdir
import json as j
from collections import namedtuple

#non-error prone training?
def get_embedding_weights(input_dim, word_index, path="/home/hellrich/keras-test/embeddings/bio_nlp_win2_", suffix="d.vec",
               embedding_dim=200):
    def load_embedding_text(dim):
        w2e = {}
        for line in open(path+str(dim)+suffix):
            line = line.strip().split(" ")
            if len(line) > 2:
                w = line[0]
                e = np.asarray(line[1:], dtype="float32")
                w2e[w] = e
        return w2e
    
    index2word = {v : k for k, v in word_index.items()}
    weights = np.zeros((input_dim, embedding_dim))
    loaded = load_embedding_text(embedding_dim)
    entries = 0
    for i in range(1,input_dim):
        w = index2word[i]
        if w in loaded:
            weights[i] = np.array(loaded[w])
            entries += 1
    return weights

def make_embeddings(input_dim,input_length,weights,embedding_dim=200,trainable=False):
    _weights = [np.copy(x) for x in weights]
    return Embedding(input_dim, embedding_dim, weights=[weights],
                      input_length=input_length, trainable=trainable) #input_dim + 1 ?

BonusInfo = namedtuple('BonusInfo', ["meshTags","genes", "organisms"])

def parse_json(path="json-output"):
    pmid2info = {}
    for json_file in [x for x in listdir(path) if x.endswith(".json")]:
        with open(path+"/"+json_file, "r", encoding="utf8") as open_json_file:
            json = j.load(open_json_file)
            if "meshTags" in json:
                meshTags = " ".join(json["meshTags"])
            else:
                meshTags = " "
            if "genes" in json:
                genes = " ".join(json["genes"])
            else:
                genes = " "
            if "organisms" in json:
                organisms = " ".join(json["organisms"])
            else:
                organisms = " "
            #abstract = json["abstract"]
            #title = json["title"]
            pmid = json["pubmedId"]
            pmid2info[pmid] = BonusInfo(meshTags, genes, organisms)
    return pmid2info


def read_data(data_file = "20180622processedGoldStandardTopics.tsv",  
              maxlen=None, vocab_size=None, sentence_wise=None, bonus_info_max=False):
    texts = []
    labels = []
    pmids = []
    with open(data_file) as data_file:
        line_number = 0
        for line in data_file:
            if line_number > 0:
                number, trec_topic_number, trec_doc_id, pm_rel_desc, disease_desc, \
                gene1_annotation_desc, gene1_name, gene2_annotation_desc, \
                gene2_name, gene3_annotation_desc, gene3_name, demographics_desc, \
                other_desc, relevance_score, title, abstract, major_mesh, \
                minor_mesh, trec_topic_disease, trec_topic_age, trec_topic_sex, \
                trec_topic_other1, trec_topic_other2, trec_topic_other3 = line.split("\t")
                
                text = title.replace("[","").replace("]","") + " " + abstract
                text = text.lower().replace(". ", " . ").replace(", ", " , ").\
                        replace("? ", " ? ").replace("! ", " ! ")
                texts.append(text)
                
                if pm_rel_desc == 'Human PM' or pm_rel_desc == 'Animal PM':
                    labels.append(1.)
                else:
                    labels.append(0.)
                pmids.append(trec_doc_id)    
            line_number += 1
                   
    #fold_repeats:
    text2indices = {}
    for i, text in enumerate(texts):
        if text in text2indices:
            text2indices[text].append(i)
        else:
            text2indices[text] = [i]
    for text, indices in text2indices.items():
        if len(indices) > 1:
            is_pm = False
            for i in indices:
                if labels[i] == 1.:
                    is_pm = True
            #remove all but first
            for i in indices[1:]:
                labels[i] = None
                texts[i] = None
                pmids[i] = None
            if is_pm:
                labels[indices[0]] = 1.
    texts = [t for t in texts if t is not None]
    labels = np.array([x for x in labels if x is not None])
    pmids = [x for x in pmids if x is not None]     
    
    #see https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
    if not vocab_size:
        vocab_size = len({y for x in texts for y in x.split()})
    t = Tokenizer(num_words=vocab_size)
    t.fit_on_texts(texts)
    
    if sentence_wise:
        max_sentences = sentence_wise[0]
        max_sentence_length = sentence_wise[1]
        list_of_list_of_sentences = [text.split(" . ") for text in texts]
        #trimming number of sentences
        trimmed_list_of_list_of_sentences = []
        for x in list_of_list_of_sentences:
            if len(x) > max_sentences:
                trimmed_list_of_list_of_sentences.append(x[:max_sentences])
            else:
                if len(x) < max_sentences:
                    to_pad = max_sentences - len(x)
                    for i in range(to_pad):
                        x.append([" "])
                trimmed_list_of_list_of_sentences.append(x)
        list_of_sentence_matrices = [t.texts_to_sequences(x) for x in trimmed_list_of_list_of_sentences]
        #trimming sentences for length
        trimmed_list_of_sentence_matrices = [keras.preprocessing.sequence.pad_sequences(
                            x, maxlen = max_sentence_length) for x in list_of_sentence_matrices]
        texts = np.asarray(trimmed_list_of_sentence_matrices)
    else:
        texts = t.texts_to_sequences(texts)
        if maxlen:
            texts = keras.preprocessing.sequence.pad_sequences(texts, maxlen=maxlen)
            
    #bonus_info as bow
    meshTags = []
    genes = []
    organisms = []
    pmid2info = parse_json()
    max_size = max(int(vocab_size/10), bonus_info_max)
    for pmid in pmids:
        info = pmid2info[pmid]
        meshTags.append(info.meshTags)
        genes.append(info.genes)
        organisms.append(info.organisms)
    num_meshTags = min(len({y for x in meshTags for y in x.split()}),max_size)
    print(num_meshTags)
    mesh_t = Tokenizer(num_words=num_meshTags)
    mesh_t.fit_on_texts(meshTags)
    meshTags = mesh_t.texts_to_matrix(meshTags, mode="binary")
    num_genes =  min(len({y for x in genes for y in x.split()}),max_size)
    print(num_genes)
    gene_t = Tokenizer(num_words=num_genes)
    gene_t.fit_on_texts(genes)
    genes = gene_t.texts_to_matrix(genes, mode="binary")
    num_orga =  min(len({y for x in organisms for y in x.split()}),max_size)
    print(num_orga)
    orga_t = Tokenizer(num_words=num_orga)
    orga_t.fit_on_texts(organisms)
    organisms = orga_t.texts_to_matrix(organisms, mode="binary")
    
    return texts, labels, t.word_index, meshTags, genes, organisms

In [25]:
SmartTuner = namedtuple("SmartTuner",["start_tuning_timestep","timestep2mod"])

def experiment(model_provider, texts, labels, nfold=1, stratify=True, max_epochs=9, 
               optimizer="adam", fine_tuner=None):
    if stratify:
        splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    else:
        splitter = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        
    fold=0
    accs=[]
    for train_index, test_index in splitter.split(np.zeros(len(labels)), labels): #could be used for 10 fold
        if fold >= nfold:
            break
        fold += 1
        #get split
        if not type(texts) == list:
            texts = [texts]
        train_x = []
        test_x = []
        for x in texts:
            train_x.append(x[train_index])
            test_x.append(x[test_index])
        train_y = labels[train_index]
        test_y = labels[test_index]
         
        model=model_provider()
        model.compile(loss='binary_crossentropy',
                    optimizer=optimizer,
                    metrics=['acc'])
        
        for i in range(1,max_epochs+1):
            if fine_tuner:
                if i in fine_tuner.timestep2mod:
                    lr = float(keras.backend.get_value(model.optimizer.lr)) * fine_tuner.timestep2mod[i]
                    keras.backend.set_value(model.optimizer.lr, lr)
                    if i == fine_tuner.start_tuning_timestep:
                        model.layers[1].trainable = True
                        model.compile(loss='binary_crossentropy',
                        optimizer=optimizer,
                        metrics=['acc'])  
            model.fit(train_x, train_y, epochs=1, batch_size=50, verbose=0)
            re = model.evaluate(train_x, train_y, verbose=0)
            re_acc, re_loss = re[1],re[0]
            te = model.evaluate(test_x, test_y, verbose=0)
            test_acc, test_loss = te[1], te[0]

            line = "Epoch: "+"{:2}".format(fold)+"-"+str(i)+"\tRe-Acc: "+"{:.3f}".format(re_acc)+"\tTest-Acc: "+"{:.3f}".format(test_acc)+"\tRe-Loss: "+"{:.3f}".format(re_loss)+"\tTest-Loss: "+"{:.3f}".format(test_loss)
            print(line)
        accs.append(test_acc) #no block scope has benefits...
    acc = sum(accs) / nfold
    print("X-Validation Accuracy:","{:.3f}".format(acc))
    return acc
    


In [None]:
#simple net (whole document as input)
def simple_embedding_net(pre_trained=False):
    embedding_in = Input(shape=(maxlen,), dtype='int32')
    if not pre_trained:
        embedding_layer = Embedding(input_dim=40000, output_dim=20, input_length=maxlen)(embedding_in)
    else
        raise Exception("Not implemented")
    embedding_layer = Flatten()(embedding_layer)
    embedding_layer = Dropout(0.5)(embedding_layer)

    inputs = [Input(shape=(meshTags.shape[1],)), 
              Input(shape=(genes.shape[1],)), Input(shape=(organisms.shape[1],))]

    concatenated = Dropout(0.2)(keras.layers.concatenate([embedding_layer] + inputs))
    output = Dense(1, activation='sigmoid')(concatenated)

    model = keras.Model([embedding_in] + inputs, output)
    return model

In [29]:
#bonus info + sentence and document level lstms, see https://github.com/keras-team/keras/issues/5516#issuecomment-295016548
def two_level_lstm(embedding_weights, vocab_size, maxlen, use_bonus_info=True):
    def builder():
        e = make_embeddings(input_dim=vocab_size, input_length=MAX_SEQUENCE_LENGTH, weights=embedding_weights)
    
        # Encode each timestep
        bonus_info = [Input(shape=(meshTags.shape[1],)), 
                  Input(shape=(genes.shape[1],)), Input(shape=(organisms.shape[1],))] if use_bonus_info else None

        in_sentence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
        embedded_sentence = e(in_sentence)
        lstm_sentence = LSTM(64)(embedded_sentence)
        encoded_model = keras.Model(in_sentence, lstm_sentence)

        sequence_input = Input(shape=(max_sentences, MAX_SEQUENCE_LENGTH), dtype='int64')
        seq_encoded = TimeDistributed(encoded_model)(sequence_input)
        seq_encoded = Dropout(0.2)(seq_encoded)

        # Encode entire sentence
        seq_encoded =  Dropout(0.2)(LSTM(64)(seq_encoded))

        if bonus_info:
            seq_encoded = Dropout(0.5)(keras.layers.concatenate([seq_encoded] + bonus_info))

        # Prediction
        out_layer = Dense(1, activation='sigmoid')(seq_encoded)
        model = keras.Model([sequence_input] + bonus_info, out_layer)
        return model
    return builder


In [8]:
#loading stuff for cell below
texts, labels, word_index, meshTags, genes, organisms = read_data(maxlen=maxlen, vocab_size=vocab_size, 
                     sentence_wise=(max_sentences,MAX_SEQUENCE_LENGTH), bonus_info_max=bonus_info_max)
embedding_weights = get_embedding_weights(input_dim=vocab_size, word_index=word_index)

7537
10000
335


In [None]:
#erreicht Epoch: 8	Re-Acc: 0.875	Test-Acc: 0.784	Re-Loss: 0.300	Test-Loss: 0.465
#mit SmartTuner(4, {4: 0.5, 5: 4, 6: .7, 7: .7, 8: .7, 9: .7, 10: .7}) #higher boost&decay, gut ab 7i

tuners =[SmartTuner(4, {4: 0.5, 5: 4, 6: .7, 7: .7, 8: .7, 9: .7, 10: .7})]
model_provider = two_level_lstm(embedding_weights, vocab_size, maxlen, use_bonus_info=True)
for tuner in tuners:
    x = experiment(model_provider, [texts, meshTags, genes, organisms], labels, 
                   fine_tuner=tuner, max_epochs=8, nfold=10)
    print(x)


Epoch:  1-1	Re-Acc: 0.739	Test-Acc: 0.718	Re-Loss: 0.513	Test-Loss: 0.528
Epoch:  1-2	Re-Acc: 0.774	Test-Acc: 0.743	Re-Loss: 0.471	Test-Loss: 0.495
Epoch:  1-3	Re-Acc: 0.789	Test-Acc: 0.754	Re-Loss: 0.447	Test-Loss: 0.475
Epoch:  1-4	Re-Acc: 0.797	Test-Acc: 0.761	Re-Loss: 0.426	Test-Loss: 0.464
Epoch:  1-5	Re-Acc: 0.818	Test-Acc: 0.758	Re-Loss: 0.411	Test-Loss: 0.459
Epoch:  1-6	Re-Acc: 0.832	Test-Acc: 0.762	Re-Loss: 0.379	Test-Loss: 0.455
Epoch:  1-7	Re-Acc: 0.858	Test-Acc: 0.767	Re-Loss: 0.367	Test-Loss: 0.448
Epoch:  1-8	Re-Acc: 0.859	Test-Acc: 0.769	Re-Loss: 0.339	Test-Loss: 0.444
Epoch:  2-1	Re-Acc: 0.739	Test-Acc: 0.711	Re-Loss: 0.518	Test-Loss: 0.538
Epoch:  2-2	Re-Acc: 0.769	Test-Acc: 0.744	Re-Loss: 0.464	Test-Loss: 0.500
Epoch:  2-3	Re-Acc: 0.800	Test-Acc: 0.765	Re-Loss: 0.447	Test-Loss: 0.485
Epoch:  2-4	Re-Acc: 0.800	Test-Acc: 0.765	Re-Loss: 0.427	Test-Loss: 0.479
Epoch:  2-5	Re-Acc: 0.820	Test-Acc: 0.751	Re-Loss: 0.401	Test-Loss: 0.481


In [None]:
from collections import namedtuple
SmartTuner = namedtuple("SmartTuner",["start_tuning_timestep","timestep2mod"])

tuners =[ SmartTuner(11,{}), #no finetuning
        SmartTuner(3, {3: 0.5, 4: 3, 5: .8, 6: .8, 7: .8, 8: .8, 9: .8, 10: .8}), #original
    #SmartTuner(3, {3: 0.5, 4: 4, 5: .7, 6: .7, 7: .7, 8: .7, 9: .7, 10: .7}) #higher boost&decay, gut ab 7i
        ]

embedding_weights = get_embedding_weights(input_dim=vocab_size, word_index=word_index)
for tuner in tuners:
    print(tuner)
    e = make_embeddings(input_dim=vocab_size, input_length=MAX_SEQUENCE_LENGTH, weights=embedding_weights)  
    bonus_info = [Input(shape=(meshTags.shape[1],)), 
              Input(shape=(genes.shape[1],)), Input(shape=(organisms.shape[1],))]

    in_sentence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
    embedded_sentence = e(in_sentence)
    #conv_sentence = embedded_sentence
    #conv_sentence =  Dropout(0.2)(embedded_sentence)
    #conv_sentence =  Conv1D(256, 3, activation='relu')(conv_sentence)
    #conv_sentence = GlobalMaxPooling1D()(conv_sentence)
    #encoded_model = keras.Model(in_sentence, conv_sentence)


    lstm_sentence = LSTM(64)(embedded_sentence)
    encoded_model = keras.Model(in_sentence, lstm_sentence)

    sequence_input = Input(shape=(max_sentences, MAX_SEQUENCE_LENGTH), dtype='int64')
    seq_encoded = TimeDistributed(encoded_model)(sequence_input)
    seq_encoded = Dropout(0.2)(seq_encoded)

    # Encode entire sentence
    seq_encoded =  Dropout(0.2)(LSTM(64)(seq_encoded))
    concat = Dropout(0.5)(keras.layers.concatenate([seq_encoded] + bonus_info))

    # Prediction
    out_layer = Dense(1, activation='sigmoid')(concat)
    model = keras.Model([sequence_input] + bonus_info, out_layer)
    r = 

In [None]:
#non-error prone training with bonus info as strings
#loading stuff for cell below
max_sentences = 10
MAX_SEQUENCE_LENGTH = 30
vocab_size=40000

#bionlp
path = "/home/hellrich/keras-test/embeddings/bio_nlp_win2_"
embedding_dim = 200
suffix = "d.vec"

texts, labels, word_index, meshTags, genes, organisms, dim_mesh, len_mesh, dim_genes, len_genes, dim_organisms, len_organisms = read_data(maxlen=maxlen, vocab_size=vocab_size, 
                                      return_word_index=True, sentence_wise=(max_sentences,MAX_SEQUENCE_LENGTH), 
                                                                  bonus_info_strings=True)

def get_embedding_weights(input_dim, word_index, path="/home/hellrich/keras-test/embeddings/glove.6B.", suffix="d.txt",
               embedding_dim=200):
    def load_embedding_text(dim):
        w2e = {}
        for line in open(path+str(dim)+suffix):
            line = line.strip().split(" ")
            if len(line) > 2:
                w = line[0]
                e = np.asarray(line[1:], dtype="float32")
                w2e[w] = e
        return w2e
    
    index2word = {v : k for k, v in word_index.items()}
    weights = np.zeros((input_dim, embedding_dim))
    loaded = load_embedding_text(embedding_dim)
    entries = 0
    for i in range(1,input_dim):
        w = index2word[i]
        if w in loaded:
            weights[i] = np.array(loaded[w])
            entries += 1
    return weights

def make_embeddings(input_dim,input_length,weights,embedding_dim=200,trainable=False):
    _weights = [np.copy(x) for x in weights]
    return Embedding(input_dim, embedding_dim, weights=[weights],
                      input_length=input_length, trainable=trainable) #input_dim + 1 ?



from collections import namedtuple
SmartTuner = namedtuple("SmartTuner",["start_tuning_timestep","timestep2mod"])

tuners =[ #SmartTuner(11,{}), #no finetuning
        #SmartTuner(3, {3: 0.5, 4: 3, 5: .8, 6: .8, 7: .8, 8: .8, 9: .8, 10: .8}), #original
    SmartTuner(3, {3: 0.5, 4: 4, 5: .7, 6: .7, 7: .7, 8: .7, 9: .7, 10: .7}), #higher boost&decay, gut ab 7i
    SmartTuner(5, {5: 0.5, 6: 4, 7: .7, 8: .7, 9: .7, 10: .7}) #late and aggressive
        ]


for tuner in tuners:
    print(tuner)
    e = make_embeddings(input_dim=vocab_size, input_length=maxlen,weights=embedding_weights) 

    bonus_info = [
       Input(shape=(len_mesh,)),
       Input(shape=(len_genes,)),
       Input(shape=(len_organisms,))
    ]
    
    embedded_bonus_info = [
        Flatten()(Embedding(dim_mesh, 20, input_length=len_mesh)(bonus_info[0])),
        Flatten()(Embedding(dim_genes, 20, input_length=len_genes)(bonus_info[1])),
        Flatten()(Embedding(dim_organisms, 20, input_length=len_organisms)(bonus_info[2]))
    ]
        
    
    in_sentence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
    embedded_sentence = e(in_sentence)
    #conv_sentence = embedded_sentence
    #conv_sentence =  Dropout(0.2)(embedded_sentence)
    #conv_sentence =  Conv1D(256, 3, activation='relu')(conv_sentence)
    #conv_sentence = GlobalMaxPooling1D()(conv_sentence)
    #encoded_model = keras.Model(in_sentence, conv_sentence)


    lstm_sentence = LSTM(64)(embedded_sentence)
    encoded_model = keras.Model(in_sentence, lstm_sentence)

    sequence_input = Input(shape=(max_sentences, MAX_SEQUENCE_LENGTH), dtype='int64')
    seq_encoded = TimeDistributed(encoded_model)(sequence_input)
    seq_encoded = Dropout(0.2)(seq_encoded)

    # Encode entire sentence
    seq_encoded =  Dropout(0.2)(LSTM(64)(seq_encoded))
    concat = Dropout(0.5)(keras.layers.concatenate([seq_encoded] + embedded_bonus_info))

    # Prediction
    out_layer = Dense(1, activation='sigmoid')(concat)
    model = keras.Model([sequence_input] + bonus_info, out_layer)
    r = stratified_experiment(model, [texts, meshTags, genes, organisms], labels, smart_fine_tune=tuner, max_epochs=10)