In [1]:

import sys
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import os
from os.path import isfile,join
import xml.etree.cElementTree as ET
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import time
import re
import pickle
import gensim
import nltk
import os
import math
from PyRouge.pyrouge import Rouge

nltk.download('stopwords')
nltk.download('punkt')

import FileProcess as fp


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


('Found embeddings: ', 31105, '/', 42514)
('word_index len = ', 41522)
('time-taken for FP: ', 1.8994669914245605)


In [2]:
## Parameters

embedding_size = 50
learning_rate = 0.1
context_window = 5
cnn_filter_size = 100
threshold = 0.5
batch_size = 10
vocabulary_size=400000
#embedding_filename = "glove_6B_200d.txt"
embedding_filename = "word2vec.model"   ##Trying with pre-trained word2vec model and training it on legal dataset
training_epochs=1
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
k = 20

In [3]:
dirpath = "corpus/fulltext/"
files = [f for f in os.listdir(dirpath) if isfile(join(dirpath, f))]

files_2006=[];
files_2007=[];
files_2008=[];
files_2009=[];

for file in files:
    if file.split('_')[0]=='06':
        files_2006.append(file);
    if file.split('_')[0]=='07':
        files_2007.append(file);
    if file.split('_')[0]=='08':
        files_2008.append(file);
    if file.split('_')[0]=='09':
        files_2009.append(file);

#print(fp.get_sentences(files[0]))

In [4]:
stop_words_list=set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
##########loading dev2vec=============
d2v_model = gensim.models.doc2vec.Doc2Vec.load('doc2vec_models_50len/doc2vec.model')
docvec = d2v_model.docvecs[files[0]]


In [5]:
def read_data(file_name):
    with open(file_name, 'r') as f:
        word_vocab = set()  # not using list to avoid duplicate entry
        word_count = {};
        word_index={};

        word2vector = {}
        count=0;
        for line in f:
            line_ = line.strip()  # Remove white space
            words_Vec = line_.split()
            word_vocab.add(words_Vec[0])
            word_index[words_Vec[0]]=count;
            count=count+1;

            word2vector[words_Vec[0]] = np.array(words_Vec[1:], dtype=float)
    #print("Total Words in DataSet:", len(word_vocab))
    return word_vocab, word2vector, word_index

In [6]:
def load_embeddings(filename):
    dvocab, w2v,word_index  = read_data(filename);
    return w2v,word_index

In [7]:
embedding_array = fp.embedding_array
word_index = fp.word_index

reverseWordIndex = {}
for key in word_index:
    reverseWordIndex[word_index[key]] = key

In [8]:
#### TO avoid downloading tf hub modules everytime
##  export TFHUB_CACHE_DIR=/usr/local/bin

#sen2vec embeddings
def sen2vec1(sentences):
    embed = hub.Module(module_url)
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        sentenceEmbeddings = session.run(embed(sentences))    
        return sentenceEmbeddings

    
def sent2vector(sent):
    words = word_tokenize(sent.lower())
    emb=[]
    for w in words:
        if w in embedding_array.keys():
            emb.append(embedding_array.get(w));
        else:
            emb.append(embedding_array.get('unk'));
    return np.mean(np.array(emb),axis=0)

def get_sent_embeddings(sent_list):
    out_vec=np.zeros((len(sent_list)-1,200))
    for i  in range(len(sent_list)-1):
        out_vec[i]=sent2vector(sent_list[i]);
    return out_vec
    
# sentence_embeddings = sen2Vec(['UWA was ordered to pay the costs of Dr Gra','Sirtex succeeded in its cross-claim against Dr'])

In [9]:
def get_statements(file):
    #with open(dirpath + file, 'r',encoding="utf-8", errors='replace') as f:
    with open(dirpath + file, 'r') as f:
        data=str(f.read());
        data=data.lower()
        data = data.replace("\"id=", "id=\"");
        data=data.replace("\n","")
        data=data.replace('".*?=.*?"', "",)
        data=data.replace("&","");
        xml = ET.fromstring(str(data))
        name=None;
        rows_list=[];
        catchphrases=[];
        sentences=[];
        for child in xml:
            if child.tag=="catchphrases":
                for catchphrase in child:
                    id=catchphrase.attrib.get("id")
                    #print(catchphrase.text)
                    catchphrases.append({"file_id":file,"Name":name,"Id":id,"text":catchphrase.text})
                    #catchphrases+=tokenizer.tokenize(catchphrase.text)
            if child.tag=="sentences":
                for sentence in child:
                    id = sentence.attrib.get("id")
                    sentences.append({"file_id":file,"Name":name,"Id":id,"text":sentence.text})
                    #sentences+=tokenizer.tokenize(sentence.text)
    return sentences,catchphrases

In [10]:
sentences,catchphrases = get_statements(files[0])
start=time.time()
sentences, catch_words = fp.get_dataframe(pd.DataFrame(sentences),pd.DataFrame(catchphrases))
print(sentences,catch_words)
print("time-taken:",time.time()-start)

(        Id     file_id  is_catchword                                words
0       s0  07_209.xml             0      [9205, 3424, 27288, 9330, 9272]
1       s0  07_209.xml             1     [3424, 27288, 9330, 9272, 13945]
2       s0  07_209.xml             0    [27288, 9330, 9272, 13945, 24684]
3       s0  07_209.xml             0     [9330, 9272, 13945, 24684, 1109]
4       s0  07_209.xml             0    [9272, 13945, 24684, 1109, 27288]
5       s0  07_209.xml             0   [13945, 24684, 1109, 27288, 34485]
6       s0  07_209.xml             0   [24684, 1109, 27288, 34485, 39252]
7       s0  07_209.xml             0    [1109, 27288, 34485, 39252, 1109]
8       s0  07_209.xml             0   [27288, 34485, 39252, 1109, 40642]
9       s0  07_209.xml             0   [34485, 39252, 1109, 40642, 42512]
10      s1  07_209.xml             0      [812, 9467, 2100, 42512, 30971]
11      s1  07_209.xml             0    [9467, 2100, 42512, 30971, 23350]
12      s1  07_209.xml             0 

In [11]:
def prepare_index_matrices(sentence_indexes):
    length=len(sentence_indexes);
    
    sentence_indexes=[word_index['unk'],word_index['unk']]+sentence_indexes+[word_index['unk'],word_index['unk']];    
    out_indices=[]
    for i in range(2,length):
        out_indices.append(sentence_indexes[i-2:i+3]);
        
    return out_indices;

def prepare_label_matrices(labels):
    length=len(labels);
    labels=[0,0]+labels+[0,0];
    out_labels=[];
    
    for i in range(2,length):
        out_labels.append(labels[i]);
        
    return out_labels;
    

In [12]:
def lookup_indexes_labels(sentences,catchphrases):
    
    sentence_indexes=[];
    labels=[];
    
    for word in sentences:
        if word in word_index:
            sentence_indexes.append(word_index[word]);
        else:
            #print("unknown word",word)
            sentence_indexes.append(word_index['unk']);
            
    for word in sentences:
        if word in catchphrases:
            labels.append(1.0);
        else:
            #print("unknown word",word)
            labels.append(0.0);
            
    return sentence_indexes,labels;

In [13]:
def get_file_sentences(file_name):
    temp_data=full_data[(full_data.file_id==file_name) & full_data['Id'].str.startswith(('s'))]
    return list(temp_data.text.values)[:-1]


def get_phrases( phrase_indices_2, output):
    catch_phrases_temp = []
    for i, out in enumerate(output):
        if out == 1:
            phrase = ''
            for ind in range(5):
                phrase += reverseWordIndex[phrase_indices_2[i][ind]]+" ";
                catch_phrases_temp.append(phrase[:-1])
    return catch_phrases_temp


In [14]:
rouge = Rouge()
def get_phrases_scores(true_list, predicted_list):
    from collections import defaultdict
    scores_map = defaultdict(float)
    for phrase in predicted_list:
        max_prec=0;
        for true_phrase in true_list:
            [precision, recall, f_score] = rouge.rouge_l(phrase, true_phrase);
            if precision>max_prec:
                max_prec=precision;
        scores_map[phrase]=max_prec;
    return scores_map

In [15]:

def MLP( stacked_tensor, weights_input, weights_input_2, bias1, bias2):
    h1 = (tf.matmul(weights_input,stacked_tensor,transpose_a=True,transpose_b=True))
    #h1 = tf.add(h1, bias1)
    h1 = tf.nn.tanh(h1)
    h2 = tf.matmul(weights_input_2,h1,transpose_a=True,transpose_b=False)
    #h2 = tf.add(h2, bias2)
    h2 = tf.nn.sigmoid(h2)
    #h2 = tf.round(tf.transpose(h2))
    return tf.transpose(h2)
    #return tf.cast(tf.to_float(h2>0.5),dtype=tf.float32)

    
    
## Building Graph
phrases_indices = tf.placeholder(tf.int32, shape=[None, 5])
phrase_labels = tf.placeholder(tf.float32, shape=[None, 1])

test_phrases_indices = tf.placeholder(tf.int32, shape=[None, 5])

W = tf.Variable(embedding_array, dtype=tf.float32,name="W")
#W = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
phrase_tensor = tf.nn.embedding_lookup(W, phrases_indices)

#sentence_indices = tf.placeholder(tf.int32, shape=[None,1])
#sentence_embeddings = tf.placeholder(tf.float32, shape=[None,200])
#sentence_tensor = tf.nn.embedding_lookup(sentence_embeddings, sentence_indices)
#sentence_tensor=tf.reshape(sentence_tensor, [tf.shape(sentence_tensor)[0], 200])

#doc_embedding = tf.placeholder(tf.float32, shape=[1,embedding_size])
#document_tensor = tf.tile(doc_embedding, [tf.shape(phrases_indices)[0], 1])

phrase_tensor = tf.reshape(phrase_tensor, [tf.shape(phrase_tensor)[0], embedding_size*5])
conv_filter = tf.Variable(tf.truncated_normal((embedding_size*5, embedding_size), stddev=0.1))
word_tensor = tf.nn.relu(tf.matmul(phrase_tensor, conv_filter))

#stacked_tensor = tf.concat(values=[word_tensor, sentence_tensor, document_tensor], axis=1)
#stacked_tensor = tf.concat(values=[word_tensor, document_tensor], axis=1)
stacked_tensor = tf.concat(values=[word_tensor], axis=1)
weights_input = tf.Variable(tf.random_normal((embedding_size*1, cnn_filter_size), stddev=0.2))
weights_input_2 = tf.Variable(tf.random_normal((100, 1), stddev=0.2))

bias1 = tf.Variable(tf.random_normal((1, cnn_filter_size), stddev=0.2))
bias2 = tf.Variable(tf.random_normal((1, 1), stddev=0.2))
print(bias2)

sentence_values=MLP(stacked_tensor, weights_input, weights_input_2, bias1, bias2)

#prediction_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=phrase_labels,logits=sentence_values)
loss = tf.reduce_mean(tf.keras.metrics.mean_squared_error(phrase_labels, sentence_values))
#loss = tf.convert_to_tensor(tf.reduce_mean(sentence_values));

#using the gradient descent optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads = optimizer.compute_gradients(loss)
#clipped_grads = [(tf.clip_by_norm(grad, 5), var) for grad, var in grads]
app = optimizer.apply_gradients(grads)

#For test data
test_embed = tf.nn.embedding_lookup(W, test_phrases_indices)
test_embed = tf.reshape(test_embed, [tf.shape(test_embed)[0], embedding_size*5])
test_conv_filter = tf.Variable(tf.truncated_normal((embedding_size*5, embedding_size), stddev=0.1));

test_word_tensor = tf.nn.relu(tf.matmul(test_embed, test_conv_filter));
test_pred = MLP(test_word_tensor, weights_input, weights_input_2, bias1, bias2)

init = tf.global_variables_initializer()




def train( sess, num_steps):
    init.run()
    #print("Initailized")
    start=time.time()
    total_loss=0.0;
    train_files = files_2006 + files_2007 + files_2008
    for file in train_files:
        start=time.time()
        file_data_frame = None; catch_words = None
        try:
            sentences, catchphrases=get_statements(file);
            file_data_frame, catch_words = fp.get_dataframe(pd.DataFrame(sentences),pd.DataFrame(catchphrases))
        except:
            print('************************BAD DATA IN FILE, file=', file)
            continue
        #needs sentences list
        #sentences_list = fp.get_sentences(file) #indexes/order preserved
        #sentence_embeddings = sen2vec(sentences_list)
        #sentence_embeddings = get_sent_embeddings(sentences_list)
        #sentence_ids_to_indices = np.array([int(x[1:]) for x in list(file_data_frame['Id'].values)])

        #phrase inputs and labels
        phrases_indices_1 = np.array(list(file_data_frame['words'].values))
        phrases_label_matrix = np.array(list(file_data_frame['is_catchword'].values))

        #doc embedding
        #doc_embedding = d2v_model.docvecs[file]
        #doc_embedding = np.reshape(doc_embedding, (1, embedding_size))
        #sentence_ids_to_indices = np.reshape(sentence_ids_to_indices, (sentence_ids_to_indices.shape[0], 1))
        phrases_label_matrix = np.reshape(phrases_label_matrix, (phrases_label_matrix.shape[0], 1))

        feed_dict = {#self.doc_embedding: doc_embedding, 
                     phrases_indices: phrases_indices_1, 
                     phrase_labels: phrases_label_matrix}
        
        #loss_val = sess.run([self.loss], feed_dict=feed_dict)  
        _,loss_val,outputs = sess.run([app,loss,sentence_values], feed_dict=feed_dict)

        if num_steps%20 == 0:
            print('Train Finished for file ', file, ' loss_val=', loss_val)

            


def test( sess, file):
    try:
        sentences, catchphrases = get_statements(file);
        file_data_frame, catch_words = fp.get_dataframe(pd.DataFrame(sentences),pd.DataFrame(catchphrases))
    except:
        print('************************BAD DATA IN FILE, file=', file)
        return {}

    test_phrases_indices_1 = np.array(list(file_data_frame['words'].values))
    phrases_label_matrix = np.array(list(file_data_frame['is_catchword'].values))
    phrases_label_matrix = np.reshape(phrases_label_matrix, (phrases_label_matrix.shape[0], 1))

    feed_dict = {test_phrases_indices: test_phrases_indices_1}
    outputs = sess.run([test_pred], feed_dict=feed_dict)
    
    outputs = [1 if x[0]>=threshold else 0 for x in outputs[0].tolist()] 

    predicted_catch_phrases = get_phrases(test_phrases_indices_1, outputs)
    print("Reched end of testing")
    return get_phrases_scores(catchphrases, predicted_catch_phrases)




def load_embeddings(sess,embedding_array):
    sess.run(W, feed_dict={W: np.asarray(embedding_array)})
        


<tf.Variable 'Variable_4:0' shape=(1, 1) dtype=float32_ref>


In [16]:
def get_features(inputs):
    out=np.zeros((len(inputs),embedding_size));
    for i in range(len(inputs)):
        try:
            c = w2v[inputs[i]]
        except KeyError:
            c = np.zeros((1,embedding_size));
        out[i]=c;
    return np.transpose(out);


def get_precision(result):
    mean_Res=0.0
    count=1;
    for v in result:
        if not  math.isnan(v):
            mean_Res += v
            count += 1
        else:
            print(v);
    print(mean_Res/count)

    
def get_key_phrases(phrases):
    res_map = {}
    for phrase in phrases.keys():
        if 'unk' not in phrase:
            res_map[phrase] = phrases[phrase];
    return res_map

In [17]:
### Training module

with tf.Session().as_default() as sess:
    load_embeddings(sess,embedding_array);
    start = time.time()
    for i in range(1):
        train(sess, i)
    t=time.time()-start
    print ('**************************Train_Time = epoch', t, '  mins: ',divmod(divmod(t, 3600)[1], 60)[0])

    
### Saving the model

saver = tf.train.Saver()
save_path = saver.save(sess, "./MyModels/model.ckpt")
print("Model saved in path: %s" % save_path)

('Train Finished for file ', '06_159.xml', ' loss_val=', 0.22676311)
('**************************Train_Time = epoch', 0.19909286499023438, '  mins: ', 0.0)
Model saved in path: ./MyModels/model.ckpt


In [18]:
### Loading from saved model

files_phrase_score_map = {}

with tf.Session() as new_sess:
    load_embeddings(new_sess,embedding_array);
    saver.restore(new_sess, "./MyModels/model.ckpt")
    print("Saved model restored")

    
    test_files = files_2009
    
    
    ### TO test more files, increasing number in below test_files's slice 
    for file in test_files[:1]:
        phrase_score_map = test(new_sess, file)
        files_phrase_score_map[file] = phrase_score_map
        
        predictions = get_key_phrases(phrase_score_map)        
        top_k =  sorted(predictions.values(), reverse = True)[:20]
        res = set([])
        for k in top_k:
            res.add(predictions.keys()[predictions.values().index(k)])

        print ('Testing done for file: ', file, ' MeanPrecision: ', get_precision(top_k))
        print ('Output Phrases: ', list(res))
        print ('\n\n')


INFO:tensorflow:Restoring parameters from ./MyModels/model.ckpt
Saved model restored
Reched end of testing
0.952380952381
('Testing done for file: ', '09_245.xml', ' MeanPrecision: ', None)
('Output Phrases: ', ['detailed'])



