In [52]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pyspark
import numpy as np
import os
from pyspark.sql.types import *
import pickle
import tensorflow as tf
from src.util import *
from sklearn.feature_extraction.text import CountVectorizer
import pyspark.sql.functions as F
import s3fs

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re

import collections
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Data

In [53]:
df = spark.read.json(
 's3://aws-logs-816063959671-us-east-1/data/tldr-training-data.jsonl')


In [54]:
subset = df.sample(withReplacement=False, fraction=0.1)
subset.cache()
subset.count()

308249

In [55]:
subset.printSchema()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



In [56]:
from src.util import *

In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import pyspark
import numpy as np
import string
import re
from pyspark.sql.functions import isnan, when, count, col
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql import Row
import pyspark.sql.functions as F
import pickle
PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))
addl_punctuation = set(['...', '`', '¿','⸮', '``', "''"])
PUNCTUATION = PUNCTUATION.union(addl_punctuation)

CONTRACTIONS = {
"ain't": "am not", "aren't": "are not", "can't": "cannot","can't've": "cannot have","'cause": "because",
"could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not",
"doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have",
"he'll": "he will","he's": "he is","how'd": "how did","how'll": "how will",
"how's": "how is","i'd": "i would","i'll": "i will","i'm": "i am","i've": "i have","isn't": "is not",
"it'd": "it would","it'll": "it will","it's": "it is","let's": "let us", "ma'am": "madam", "mayn't": "may not",
"might've": "might have","mightn't": "might not","must've": "must have","mustn't": "must not",
"needn't": "need not","oughtn't": "ought not","shan't": "shall not","sha'n't": "shall not","she'd": "she would",
"she'll": "she will","she's": "she is","should've": "should have","shouldn't": "should not","that'd": "that would",
"that's": "that is","there'd": "there had","there's": "there is","they'd": "they would","they'll": "they will",
"they're": "they are","they've": "they have","wasn't": "was not","we'd": "we would","we'll": "we will",
"we're": "we are","we've": "we have","weren't": "were not","what'll": "what will","what're": "what are",
"what's": "what is","what've": "what have","where'd": "where did","where's": "where is","who'll": "who will",
"who's": "who is","won't": "will not","wouldn't": "would not","you'd": "you would","you'll": "you will",
"you're": "you are"
}

def clean_text(text, remove_stopwords=True):
    text = text.lower()

    if True:
        text = text.split()
        new_text = [CONTRACTIONS[w] if w in CONTRACTIONS else w for w in text]

        text = " ".join(new_text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in PUNCTUATION]
    if remove_stopwords==True:
        tokens = [w for w in tokens if w not in STOPWORDS]
    return ' '.join(tokens)

def word_length(string):
    tokens = word_tokenize(string)
    tokens = [w for w in tokens if w not in PUNCTUATION]
    return len(tokens)

def clean_data(df, n_words_summary=50, remove_stopwords=True):
    # Get rid of all rows where subreddit is null (these are spam)
    df = df.filter(df.subreddit.isNotNull())
    # Lowercase columns:
    for col in ['body','content','normalizedBody','subreddit','summary','title']:
        df = df.withColumn(col, F.lower(F.col(col)))
    # Converts 'null' strings in the title column back to null values
    df = df.withColumn('title', when(df.title == 'null', F.lit(None)).otherwise(df.title))
    

    # Creat edit(bool) and edit_len columns, while removing 'edit:%' from summary column
    split_col = F.split(df['summary'], '(edit:|[^a-z]edit)')
    df = df.withColumn('edit', split_col.getItem(1))
    df = df.withColumn('summary', split_col.getItem(0))
    function = udf(word_length, LongType())
    df = df.withColumn('summary_len', function(df.summary))
        # Creates edit_len column, number of words from 'edit'
    df = df.withColumn('edit', df.edit).na.fill('')
    df = df.withColumn('edit_len', function(df.edit))
        # Converts -1 in edit_len column to null
    df = df.withColumn('edit_len',
        when(df.edit_len == -1, F.lit(0)).otherwise(df.edit_len))
    df = df.withColumn('edit', when(df.edit.isNull(), F.lit(0)).otherwise(1))
    # Remove all rows where summary contains less than 5 words
    df = df.filter(df.summary_len >= 5)
    # Remove all rows where summary contains greater than n_words_summary words
    df = df.filter((df.summary_len <= n_words_summary))
    # Remove all rows where the summary length is not less than 50% of the content length
    df = df.filter(df.summary_len <= df.content_len*0.5)
    # Clean Content column
    cleantext_udf = udf(clean_text, StringType())
    df = df.withColumn('content', cleantext_udf(df.content, F.lit(remove_stopwords)))
    df = df.withColumn('summary', cleantext_udf(df.summary, F.lit(False)))
    return df

In [58]:
newdf = clean_data(subset)

In [59]:
newdf.count()

267475

In [60]:
pdf = newdf.toPandas()
pdf.drop(['body', 'normalizedBody', 'author', 'id', 'subreddit', 
          'subreddit_id', 'title'], axis=1, inplace=True)
pdf = pdf.replace('', np.NaN)
pdf = pdf.replace(float('nan'), np.NaN)
pdf.dropna(inplace=True)
pdf.head()

Unnamed: 0,content,content_len,summary,summary_len,edit,edit_len
0,think fixed either utc standard utc+1 year aro...,178,shifting seasonal time is no longer worth it,8,1,0
1,back still listened skrillex songs almost alwa...,182,do not listen to dubstep right before you go t...,11,1,0
2,definitive line comes hours anything someone c...,134,there is not a definitive skill level it is al...,17,1,0
3,good audio/video quality including mics banter...,101,good job but more firefall talk,6,1,0
4,narrative hl1 lead believe black mesa 's first...,104,the scientists never needed a crystal to visit...,17,1,0


# Subset a pickle

In [61]:
len(pdf)

267474

In [62]:
pdf.to_pickle('df_subset.pkl')

## Create subset for testing

In [39]:
# pdf = pdf.sample(frac=0.1)
# len(pdf)

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
X = pdf.content
y = pdf.summary

X_train, X_test, y_train, y_test = train_test_split(X,y)

# Util.py

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')
import pandas as pd
import re
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import csv     
import timeit  
import random  



default_path = "./data/"

train_article_path = default_path + "sumdata/train/train.article.txt"
train_title_path   = default_path + "sumdata/train/train.title.txt"
valid_article_path = default_path + "sumdata/train/valid.article.filter.txt"
valid_title_path   = default_path + "sumdata/train/valid.title.filter.txt"

#valid_article_path = default_path + "sumdata/DUC2003/input.txt"
#valid_title_path   = default_path + "sumdata/DUC2003/task1_ref0.txt"

train_article_path = X_train
train_title_path   = X_test
valid_article_path = y_train
valid_title_path   = y_test


def clean_str(sentence):
    sentence = re.sub("[#.]+", "#", sentence)
    return sentence

def get_text_list(series, toy):
    if not toy:
        return series.tolist()[:200000]
    else:
        return series.tolist()[:50]

def build_dict(step, toy=False):
    if step == "train":
        train_article_list = get_text_list(train_article_path, toy)
        train_title_list = get_text_list(train_title_path, toy)

        words = list()
        for sentence in train_article_list + train_title_list:
            for word in word_tokenize(sentence):
                words.append(word)

        word_counter = collections.Counter(words).most_common()
        word_dict = dict()
        word_dict["<padding>"] = 0
        word_dict["<unk>"] = 1
        word_dict["<s>"] = 2
        word_dict["</s>"] = 3
        for word, _ in word_counter:
            word_dict[word] = len(word_dict)

        with open(default_path + "word_dict.pickle", "wb") as f:
            pickle.dump(word_dict, f)

    elif step == "valid":
        with open(default_path + "word_dict.pickle", "rb") as f:
            word_dict = pickle.load(f)

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))

    article_max_len = 50
    summary_max_len = 15

    return word_dict, reversed_dict, article_max_len, summary_max_len


def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = [word_tokenize(d) for d in article_list]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
    
    if step == "valid":
        return x
    else:        
        y = [word_tokenize(d) for d in title_list]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        return x, y


def batch_iter(inputs, outputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    outputs = np.array(outputs)

    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]


def get_init_embedding(word_dict , reversed_dict, embedding_size):
    print("Loading Lists...")
    train_article_list = get_text_list(train_article_path, False)
    train_title_list = get_text_list(train_title_path, False)

    print("Loading TF-IDF...")
    tf_idf_list = tf_idf_generate(train_article_list+train_title_list)
    
    print("Loading Pos Tags...")
    pos_list , postags_for_named_entity = get_pos_tags_dict(word_dict.keys())

    #print("Loading Named Entity...")
    #named_entity_recs = named_entity(postags_for_named_entity) 
    
    print("Loading Glove vectors...")

    s3 = s3fs.S3FileSystem(anon=False)
    with s3.open(
        's3://aws-logs-816063959671-us-east-1/data/model_glove_300.pkl', 
        'rb') as handle:
        word_vectors = pickle.load(handle)
     
    used_words = 0
    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
            if word in tf_idf_list:
                v= tf_idf_list[word]
                rich_feature_array = np.array([v,v,v,v,v,v,v,v,v,v])
                word_vec = np.append(word_vec, rich_feature_array)
            else:
                v=0
                rich_feature_array = np.array([v,v,v,v,v,v,v,v,v,v])
                word_vec = np.append(word_vec, rich_feature_array)

            if word in pos_list:
                v=pos_list[word]
                rich_feature_array_2 = np.array([v,v,v,v,v,v,v,v,v,v])
                word_vec = np.append(word_vec, rich_feature_array_2)
            else:
                v=0
                rich_feature_array_2 = np.array([v,v,v,v,v,v,v,v,v,v])
                word_vec = np.append(word_vec, rich_feature_array_2) 

            #if word in named_entity_recs:
            #  v=named_entity_recs[word]
            #  rich_feature_array_3 = np.array([v,v,v,v,v,v,v,v,v,v])
            #  word_vec = np.append(word_vec, rich_feature_array_3)
            #else:
            #  v=0
            #  rich_feature_array_3 = np.array([v,v,v,v,v,v,v,v,v,v])
            #  word_vec = np.append(word_vec, rich_feature_array_3)  
          
            used_words += 1
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32) #to generate for <padding> and <unk>
        
        
        word_vec_list.append(np.array(word_vec))

    print("words found in glove percentage = " + str((used_words/len(word_vec_list))*100) )
          
    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list)

def tf_idf_generate(sentences):
    #https://stackoverflow.com/questions/30976120/find-the-tf-idf-score-of-specific-words-in-documents-using-sklearn

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    # our corpus
    data = sentences
    cv = CountVectorizer()
    # convert text data into term-frequency matrix
    data = cv.fit_transform(data)
    tfidf_transformer = TfidfTransformer()
    # convert term-frequency matrix into tf-idf
    tfidf_matrix = tfidf_transformer.fit_transform(data)
    # create dictionary to find a tfidf word each word
    word2tfidf = dict(zip(cv.get_feature_names(), tfidf_transformer.idf_))
    return word2tfidf

def get_pos_tags_dict(words):
    #sent = nltk.word_tokenize(sent)
    #print(sent)
    post_tags_for_words = nltk.pos_tag(words)
    pos_list ={}
    for word,pos in post_tags_for_words:
        pos_list[word] = pos

    df = pd.DataFrame(list(pos_list.items()))
    df.columns = ['word', 'pos']
    df.pos = pd.Categorical(df.pos)
    df['code'] = df.pos.cat.codes
    
    pos_list ={}
    for index, row in df.iterrows():
        pos_list[row['word']] = row['code']
    return pos_list , post_tags_for_words

def named_entity(post_tags_for_words):
    names = ne_chunk(post_tags_for_words)
    names_dict = {}
    for n in names:
        if (len(n) == 1):
            named_entity = str(n).split(' ')[0][1:]
            word = str(n).split(' ')[1].split('/')[0]
            names_dict[word] = named_entity

    df = pd.DataFrame(list(names_dict.items()))
    df.columns = ['word', 'pos']
    df.pos = pd.Categorical(df.pos)
    df['code'] = df.pos.cat.codes

    names_dict ={}
    for index, row in df.iterrows():
        names_dict[row['word']] = row['code']
    print(names_dict)
    return names_dict



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Implement

In [14]:
print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", False)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, False)

Building dictionary...
Loading training dataset...


In [15]:
print("Loading Lists...")
train_article_list = get_text_list(train_article_path, False)
train_title_list = get_text_list(train_title_path, False)

print("Loading TF-IDF...")
tf_idf_list = tf_idf_generate(train_article_list+train_title_list)
tf_idf_list["apple"]

Loading Lists...
Loading TF-IDF...


6.08174166850836

In [16]:
print(word_dict["apple"])
print(word_dict["cat"])
print(word_dict["dog"])

1529
1109
529


In [17]:
print(reversed_dict[9])
print(reversed_dict[7])
print(reversed_dict[42])

people
one
say


In [18]:
print("article_max_len : " + str(article_max_len))
print("summary_max_len : " + str(summary_max_len))

article_max_len : 50
summary_max_len : 15


In [19]:
print(train_x[0])
for num in train_x[0] :
    print(reversed_dict[num])

[365, 332, 705, 52, 37, 54, 23, 128, 527, 1184, 332, 86, 78, 265, 445, 581, 9873, 112, 65, 1142, 626, 13340, 154, 284, 2143, 363, 74, 1219, 34108, 20172, 581, 306, 3724, 3947, 478, 120, 284, 527, 53, 8, 84, 265, 5, 45, 100, 22, 19, 581, 135, 34]
dating
boyfriend
nearly
2
years
pretty
good
together
talked
marriage
boyfriend
great
guy
seem
interested
perfect
upsets
everything
anything
asks
constantly
obsess
making
happy
ca
n't
help
natural
pleaser
molded
perfect
girlfriend
hobbies
sacrifice
mine
keep
happy
talked
every
time
try
seem
like
new
end
going
back
perfect
girl
feel


In [20]:
print(train_y[0])
for num in train_y[0] :
    print(reversed_dict[num])

[2162, 7, 1291, 97, 896, 504, 1782, 105, 478, 407, 766, 340, 21, 12530]
owned
one
six
months
truly
amazing
animals
however
mine
turned
trouble
worth
got
breeder


In [21]:
test_embedding = get_init_embedding(word_dict , reversed_dict, 320)


Loading Lists...
Loading TF-IDF...
Loading Pos Tags...
Loading Glove vectors...
words found in glove percentage = 28.77891899751092


In [22]:
len(test_embedding[30])

320

In [23]:
test_embedding.shape

(460813, 320)

In [24]:
pos_list = get_pos_tags_dict(word_dict.keys())

# Model

In [25]:
import tensorflow as tf
from tensorflow.contrib import rnn
#from utils import get_init_embedding


class Model(object):
    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if not forward_only and args.glove: #training
                #init_embeddings = tf.constant(get_init_embedding(word_dict ,reversed_dict, self.embedding_size), dtype=tf.float32)
                init_embeddings = tf.constant(test_embedding, dtype=tf.float32)
            else: #testing
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]
            
            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only: #trainig
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
                
            else: #testing
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                   attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only: #training
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)
                

# Train

In [29]:
import time
start = time.perf_counter()
import tensorflow as tf
import argparse
import pickle
import os
#from model import Model
#from utils import build_dict, build_dataset, batch_iter

# Uncomment next 2 lines to suppress error and Tensorflow info verbosity. Or change logging levels
# tf.logging.set_verbosity(tf.logging.FATAL)
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#def add_arguments(parser):
#    parser.add_argument("--num_hidden", type=int, default=150, help="Network size.")
#    parser.add_argument("--num_layers", type=int, default=2, help="Network depth.")
#    parser.add_argument("--beam_width", type=int, default=10, help="Beam width for beam search decoder.")
#    parser.add_argument("--glove", action="store_true", help="Use glove as initial word embedding.")
#    parser.add_argument("--embedding_size", type=int, default=300, help="Word embedding size.")
#
#    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate.")
#    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
#    parser.add_argument("--num_epochs", type=int, default=10, help="Number of epochs.")
#    parser.add_argument("--keep_prob", type=float, default=0.8, help="Dropout keep prob.")
#
#    parser.add_argument("--toy", action="store_true", help="Use only 50K samples of data")
#
#    parser.add_argument("--with_model", action="store_true", help="Continue from previously saved model")

class args:
    pass

args.num_hidden=150
args.num_layers=2
args.beam_width=10
args.glove="store_true"
args.embedding_size=320

args.learning_rate=1e-3
args.batch_size=64
args.num_epochs=10
args.keep_prob = 0.8

args.toy=True #"store_true"

args.with_model="store_true"


#parser = argparse.ArgumentParser()
#add_arguments(parser)
#args = parser.parse_args()
#with open("args.pickle", "wb") as f:
#    pickle.dump(args, f)

if not os.path.exists(default_path + "saved_model_2"):
    os.mkdir(default_path + "saved_model_2")
else:
    #if args.with_model:
        old_model_checkpoint_path = open(default_path + 'saved_model_2/checkpoint', 'r')
        old_model_checkpoint_path = "".join([default_path + "saved_model_2/",old_model_checkpoint_path.read().splitlines()[0].split('"')[1] ])


print("Building dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", args.toy)
print("Loading training dataset...")
train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, args.toy)

tf.reset_default_graph()

with tf.Session() as sess:
    model = Model(reversed_dict, article_max_len, summary_max_len, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    if 'old_model_checkpoint_path' in globals():
        print("Continuing from previous trained model:" , old_model_checkpoint_path , "...")
        saver.restore(sess, old_model_checkpoint_path )

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    for batch_x, batch_y in batches:
        batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))
        batch_decoder_input = list(map(lambda x: [word_dict["<s>"]] + list(x), batch_y))
        batch_decoder_len = list(map(lambda x: len([y for y in x if y != 0]), batch_decoder_input))
        batch_decoder_output = list(map(lambda x: list(x) + [word_dict["</s>"]], batch_y))

        batch_decoder_input = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_input))
        batch_decoder_output = list(
            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_output))

        train_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
            model.decoder_input: batch_decoder_input,
            model.decoder_len: batch_decoder_len,
            model.decoder_target: batch_decoder_output
        }

        _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 1000 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        if step % num_batches_per_epoch == 0:
            hours, rem = divmod(time.perf_counter() - start, 3600)
            minutes, seconds = divmod(rem, 60)
            saver.save(sess, default_path + "saved_model_2/model.ckpt", global_step=step)
            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
            "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")

FileNotFoundError: [Errno 2] No such file or directory: './data/saved_model_2/checkpoint'

# Test

In [27]:
tf.reset_default_graph()

class args:
    pass
  
args.num_hidden=150
args.num_layers=2
args.beam_width=10
args.glove="store_true"
args.embedding_size=320

args.learning_rate=1e-3
args.batch_size=64
args.num_epochs=10
args.keep_prob = 0.8

args.toy=True

args.with_model="store_true"



#print("Loading dictionary...")
#word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", args.toy)
#print("Loading validation dataset...")
#valid_x = build_dataset("valid", word_dict, article_max_len, summary_max_len, args.toy)
#valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]
#print("Loading article and reference...")
#article = get_text_list(valid_article_path, args.toy)
#reference = get_text_list(valid_title_path, args.toy)

with tf.Session() as sess:
    print("Loading saved model...")
    model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(default_path + "saved_model_2/")
    saver.restore(sess, ckpt.model_checkpoint_path)

    batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)

    print("Writing summaries to 'result2.txt'...")
    
    for batch_x, _ in batches:
        batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]

        valid_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
        }

        prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
        prediction_output = [[reversed_dict[y] for y in x] for x in prediction[:, 0, :]]
        summary_array = []
        with open(default_path + "model_2_files/result2.txt", "wb") as f:
            for line in prediction_output:
                summary = list()
                for word in line:
                    if word == "</s>":
                        break
                    if word not in summary:
                        summary.append(word)
                summary_array.append(" ".join(summary))
                #print(" ".join(summary), file=f)

    print('Summaries have been generated')

Loading saved model...
Instructions for updating:
Use tf.cast instead.


AttributeError: 'NoneType' object has no attribute 'model_checkpoint_path'

In [28]:
summary_array = []
with open(default_path + "model_2_files/result2.txt", "wb") as f:
    for line in prediction_output:
        summary = list()
        for word in line:
            if word == "</s>":
                break
            if word not in summary:
                summary.append(word)
        summary_array.append(" ".join(summary))

FileNotFoundError: [Errno 2] No such file or directory: './data/model_2_files/result2.txt'

In [None]:
summary_array[:2]
