<h4>Importing Libraries

In [128]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [129]:
class Tokenizer():
    def fit_on_texts(self,list_data):
        word_list = " ".join(list_data).split()
        self.word_counts = list(set(word_list))
        self.word_dict = {w: i for i, w in enumerate(self.word_counts)}
        self.number_dict = {i: w for i, w in enumerate(self.word_counts)}
        
    def texts_to_sequences(self,data):
        encoded_sequence = list()
        for item in data:
            encoded_sequence.append([self.word_dict[word] for word in item.split()])
        return encoded_sequence
    
def pad_sequences(data,padding='pre',padding_value=0):
    sequence = None
    if isinstance(data,list):
        maxlen = max(len(item) for item in data)
        
    if padding == 'pre':
        for idx in range(len(data)):
            data[idx] = [padding_value]*(maxlen-len(data[idx])) + data[idx]
    else:
        for idx in range(len(data)):
            data[idx] = data[idx]+ [padding_value]*(maxlen-len(data[idx]))
                                                    
    return data
    

In [134]:
class Preprocessing():
    
    def __init__(self,input_file):
        self.input_data_file = input_file
        self.data = None
        self.vocab_size = None
        self.encoded_data = None
        self.max_length = None
        self.sequences = None
        self.x = None
        self.y = None
        self.tokenizer = None
    
    def load_data(self):
        fp = open(self.input_data_file,'r')
        self.data = fp.read().splitlines()        
        fp.close()
        
    def encode_data(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.data)
        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)
        print(self.encoded_data)
        self.vocab_size = len(self.tokenizer.word_counts)+1
        
    def generate_sequence(self):
        seq_list = list()
        for item in self.encoded_data:
            l = len(item)
            for id in range(1,l):
                seq_list.append(item[:id+1])
        #print(seq_list[0])
        print(seq_list)
        self.sequences = pad_sequences(seq_list,padding='pre', padding_value=0)
        print(self.sequences)
        self.sequences = array(self.sequences)
            
    def get_data(self):
        self.x = self.sequences[:,:-1]
        self.y = self.sequences[:,-1]
        print(self.y)
        #self.y = to_categorical(self.y,num_classes=self.vocab_size)
        self.y = pd.get_dummies(self.y)
        print(self.y)

In [135]:
pr = Preprocessing('data.txt')
pr.load_data()
pr.encode_data()
pr.generate_sequence()
pr.get_data()

[[2, 12, 16, 4, 3, 14, 9], [17, 0, 18, 6, 19, 21], [2, 10, 8, 12, 11, 7, 15], [1, 16, 5, 20, 13]]
[[2, 12], [2, 12, 16], [2, 12, 16, 4], [2, 12, 16, 4, 3], [2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [17, 0], [17, 0, 18], [17, 0, 18, 6], [17, 0, 18, 6, 19], [17, 0, 18, 6, 19, 21], [2, 10], [2, 10, 8], [2, 10, 8, 12], [2, 10, 8, 12, 11], [2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [1, 16], [1, 16, 5], [1, 16, 5, 20], [1, 16, 5, 20, 13]]
[[0, 0, 0, 0, 0, 2, 12], [0, 0, 0, 0, 2, 12, 16], [0, 0, 0, 2, 12, 16, 4], [0, 0, 2, 12, 16, 4, 3], [0, 2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [0, 0, 0, 0, 0, 17, 0], [0, 0, 0, 0, 17, 0, 18], [0, 0, 0, 17, 0, 18, 6], [0, 0, 17, 0, 18, 6, 19], [0, 17, 0, 18, 6, 19, 21], [0, 0, 0, 0, 0, 2, 10], [0, 0, 0, 0, 2, 10, 8], [0, 0, 0, 2, 10, 8, 12], [0, 0, 2, 10, 8, 12, 11], [0, 2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [0, 0, 0, 0, 0, 1, 16], [0, 0, 0, 0, 1, 16, 5], [0, 0, 0, 1, 16, 5, 20], [0, 0, 1, 16, 5, 20, 13]]
[12 16  4  3 14  9 