In [52]:
import torch
import pandas as pd
import os
import string
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np 

In [47]:
# The import cell only brings in `word_tokenize`, never the `nltk` package
# itself, so without this import the calls below raise NameError on a fresh
# kernel (Restart & Run All).
import nltk

# Fetch the NLTK resources this notebook asks for. `punkt_tab` is presumably
# the tokenizer data behind `word_tokenize`; `wordnet` / `omw-1.4` are not used
# by any cell visible here -- TODO confirm they are still needed.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/hyarrava/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hyarrava/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hyarrava/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [48]:
# Project root. NOTE(review): absolute, machine-specific path -- adjust (or make
# configurable) before running this notebook anywhere else.
curr_dir = "/home/hyarrava/Text_generation_using_lstm/"
data_dir = os.path.join(curr_dir, "data/")

all_headlines = []

# Only the first file whose name contains 'Articles' is loaded (note the
# `break`). os.listdir returns entries in arbitrary order, so the listing is
# sorted to make the chosen file the same on every run and machine.
for filename in sorted(os.listdir(data_dir)):
    if 'Articles' in filename:
        articles_df = pd.read_csv(os.path.join(data_dir, filename))
        all_headlines.extend(list(articles_df.headline.values))
        break

# Drop placeholder headlines. The filter used to compare against "Unknow",
# which looks like a typo for "Unknown" and would let such rows through; both
# spellings are removed so the previous behavior is still covered.
# TODO confirm the placeholder spelling against the data.
all_headlines = [h for h in all_headlines if h not in ("Unknown", "Unknow")]

In [49]:
def clean_text(txt):
    """Normalize one headline for tokenization.

    Steps, in this order:
      1. remove every character listed in ``string.punctuation``;
      2. lowercase the result;
      3. discard every remaining non-ASCII character.

    Args:
        txt: The raw headline string.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    punctuation_free = txt.translate(str.maketrans('', '', string.punctuation))
    lowered = punctuation_free.lower()
    # Round-trip through bytes: UTF-8 encodes every non-ASCII character as
    # bytes >= 0x80, which the ASCII decoder then drops because of "ignore".
    ascii_only = lowered.encode("utf-8").decode("ascii", "ignore")
    return ascii_only

# Cleaned version of every headline, in the original order.
corpus = list(map(clean_text, all_headlines))

In [50]:
def build_word_index(corpus):
    """Build the vocabulary: each distinct token gets a 1-based integer id.

    Ids are handed out in order of first appearance while scanning the corpus
    line by line. Id 0 is never assigned to a token.

    Args:
        corpus: Iterable of text lines; each line is lowercased and split
            with ``word_tokenize``.

    Returns:
        A ``defaultdict(int)`` mapping token -> id. Because it is a
        defaultdict, subscripting it with an unknown token yields 0 (and
        inserts that token).
    """
    vocabulary = defaultdict(int)
    for sentence in corpus:
        for word in word_tokenize(sentence.lower()):
            # Ids are dense and start at 1, so the next free id is always the
            # current vocabulary size plus one; setdefault leaves known words
            # untouched.
            vocabulary.setdefault(word, len(vocabulary) + 1)
    return vocabulary


def get_sequence_of_tokens(corpus, word_index):
    """Turn each line into its growing n-gram prefixes of token ids.

    A line with tokens ``t1 t2 ... tn`` contributes the id sequences
    ``[t1, t2]``, ``[t1, t2, t3]``, ..., ``[t1, ..., tn]``. Lines with fewer
    than two tokens contribute nothing.

    Args:
        corpus: Iterable of text lines; each line is lowercased and split
            with ``word_tokenize``.
        word_index: Mapping token -> integer id, e.g. the result of
            ``build_word_index``.

    Returns:
        A list of lists of ints, one inner list per n-gram prefix, in corpus
        order.
    """
    input_sequences = []
    for line in corpus:
        tokens = word_tokenize(line.lower())
        # Use .get instead of subscripting: `word_index[token]` on the
        # defaultdict built by build_word_index would silently INSERT every
        # unknown token with id 0, growing the vocabulary as a side effect.
        # Unknown tokens still map to 0, but the mapping is left untouched.
        token_list = [word_index.get(token, 0) for token in tokens]
        # Prefixes of length 2..len(token_list).
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    return input_sequences

In [51]:
# Vocabulary first, then every cleaned headline expanded into its n-gram
# prefixes of token ids.
word_index = build_word_index(corpus=corpus)
input_sequences = get_sequence_of_tokens(corpus=corpus, word_index=word_index)

### Padding

In [55]:
def pad_sequences(sequences, maxlen=None, padding='post', truncating='post', value=0):
    """Pad and/or truncate variable-length sequences to one common length.

    Args:
        sequences: Iterable of sequences (e.g. lists of ints).
        maxlen: Target length of every row. When None, the length of the
            longest input sequence is used (0 if there are no sequences).
        padding: 'post' appends ``value`` after short sequences; any other
            setting prepends it.
        truncating: 'pre' drops items from the start of over-long sequences;
            any other setting drops them from the end.
        value: Fill value used for padding.

    Returns:
        A numpy array with one row of exactly ``maxlen`` items per input
        sequence. For an empty ``sequences`` the result is an empty array.
    """
    # Materialize once so generators work and every item supports slicing
    # and list concatenation.
    sequences = [list(seq) for seq in sequences]
    if maxlen is None:
        maxlen = max((len(seq) for seq in sequences), default=0)

    padded_sequences = []
    for seq in sequences:
        if len(seq) > maxlen:
            # Previously the truncated result was computed but never used, so
            # over-long sequences were emitted at full length (ragged output).
            if truncating == 'pre':
                # Not seq[-maxlen:], which returns the WHOLE list for maxlen == 0.
                seq = seq[len(seq) - maxlen:]
            else:
                seq = seq[:maxlen]

        filler = [value] * (maxlen - len(seq))  # empty when already long enough
        if padding == 'post':
            padded_sequences.append(seq + filler)
        else:
            padded_sequences.append(filler + seq)

    return np.array(padded_sequences)
            
            

In [56]:
# Right-pad every n-gram sequence with zeros up to the longest sequence length.
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=None,
    padding='post',
    truncating='post',
    value=0,
)

In [58]:
# Display the first padded row as a quick sanity check.
padded_sequences[0]

array([1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])