## Connecting to S3 Bucket 

In [1]:
import boto3
import io
bucket = 'text-generation-bucket'
key = 'text_data/reviews.csv'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=key)

In [2]:
# Importing Nessesary Packages:

import pandas as pd
import os
import re
import numpy as np
import torch

## Reading The Data

In [3]:
# Reading the Data:

df = pd.read_csv(io.BytesIO(obj['Body'].read()),  header= 0, encoding= 'unicode_escape')
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text
0,842613455,False,finalized,3,12-05-2015 17:48,3.0,0.64,yes,0.64,wsj_398217788,8/14/91,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...
1,842613456,False,finalized,3,12-05-2015 16:54,,,no,1.0,wsj_399019502,8/21/07,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal Online</br></br>The Mo...
2,842613457,False,finalized,3,12-05-2015 01:59,,,no,1.0,wsj_398284048,11/14/91,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...
3,842613458,False,finalized,3,12-05-2015 02:19,,0.0,no,0.675,wsj_397959018,6/16/86,Manager's Journal: Sniffing Out Drug Abusers I...,,,The statistics on the enormous costs of employ...
4,842613459,False,finalized,3,12-05-2015 17:48,3.0,0.3257,yes,0.64,wsj_398838054,10-04-2002,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollar's ton...


In [4]:
data = df[['text']]

In [None]:
# Making all the words to lower case:

data["text"] = [re.sub("[^a-z' ]", "", i.lower()) for i in data["text"]]


In [7]:
# Printing a sample:

data["text"][0]

"new york  yields on most certificates of deposit offered by major banks dropped more than a tenth of a percentage point in the latest week reflecting the overall decline in shortterm interest ratesbrbron smalldenomination or consumer cds sold directly by banks the average yield on sixmonth deposits fell to  from  in the week ended yesterday according to an bank survey by banxquote money markets a wilmington del information servicebrbron threemonth consumer deposits the average yield sank to  from  the week before according to banxquote two banks in the banxquote survey citibank in new york and corestates in pennsylvania are paying less than  on threemonth smalldenomination cdsbrbrdeclines were somewhat smaller on fiveyear consumer cds which eased to  from  banxquote saidbrbryields on threemonth and sixmonth treasury bills sold at monday's auction plummeted more than a fifth of a percentage point from the previous week to  and  respectively"

## Creating the Sequence

In [8]:
# Function to create a sequence of length 10 Tokens:
def create_seq(text, seq_len = 10):
    
    sequences = []
    
    #if the number of tokens in text is greater than 5
    if len(text.split()) > seq_len:
        for i in range(seq_len, len(text.split())):
            # Select sequence of tokens
            seq = text.split()[i-seq_len:i+1]
            #add to the list
            sequences.append(" ".join(seq))
        return sequences
    else:
        return[text]
        

In [9]:
sentence ="i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meatand it smells better my labrador is finicky and she appreciates this product better than most."

In [10]:
create_seq(sentence)

['i have bought several of the vitality canned dog food products',
 'have bought several of the vitality canned dog food products and',
 'bought several of the vitality canned dog food products and have',
 'several of the vitality canned dog food products and have found',
 'of the vitality canned dog food products and have found them',
 'the vitality canned dog food products and have found them all',
 'vitality canned dog food products and have found them all to',
 'canned dog food products and have found them all to be',
 'dog food products and have found them all to be of',
 'food products and have found them all to be of good',
 'products and have found them all to be of good quality',
 'and have found them all to be of good quality the',
 'have found them all to be of good quality the product',
 'found them all to be of good quality the product looks',
 'them all to be of good quality the product looks more',
 'all to be of good quality the product looks more like',
 'to be of good

In [11]:
# Creating a list of text:

seq = []
text = data["text"].values
for i in range(1000):
    seqi = create_seq(text[i])
    seq.extend([s for s in seqi if len(s.split(" ")) == 11])
    

In [12]:
len(seq)

196367

In [13]:
for i in range(196357,196367):
    print(seq[i])

ecb more elbow room to cut interest rates says cary leahey
more elbow room to cut interest rates says cary leahey senior
elbow room to cut interest rates says cary leahey senior economist
room to cut interest rates says cary leahey senior economist at
to cut interest rates says cary leahey senior economist at deutsche
cut interest rates says cary leahey senior economist at deutsche bank
interest rates says cary leahey senior economist at deutsche bank securities
rates says cary leahey senior economist at deutsche bank securities in
says cary leahey senior economist at deutsche bank securities in new
cary leahey senior economist at deutsche bank securities in new york


In [14]:
# create inputs and targets (x and y)
x = []
y = []

for s in seq:
      if len(s.split()) == 11:
        x.append(" ".join(s.split()[:-1]))
        y.append(" ".join(s.split()[1:]))

In [15]:
# Printing Last 5 Texts of  x:

for i in range(196357,196367):
    print(x[i])

ecb more elbow room to cut interest rates says cary
more elbow room to cut interest rates says cary leahey
elbow room to cut interest rates says cary leahey senior
room to cut interest rates says cary leahey senior economist
to cut interest rates says cary leahey senior economist at
cut interest rates says cary leahey senior economist at deutsche
interest rates says cary leahey senior economist at deutsche bank
rates says cary leahey senior economist at deutsche bank securities
says cary leahey senior economist at deutsche bank securities in
cary leahey senior economist at deutsche bank securities in new


In [16]:
#Printing Last 5 Texts of y:

for i in range(196357,196367):
    print(y[i])

more elbow room to cut interest rates says cary leahey
elbow room to cut interest rates says cary leahey senior
room to cut interest rates says cary leahey senior economist
to cut interest rates says cary leahey senior economist at
cut interest rates says cary leahey senior economist at deutsche
interest rates says cary leahey senior economist at deutsche bank
rates says cary leahey senior economist at deutsche bank securities
says cary leahey senior economist at deutsche bank securities in
cary leahey senior economist at deutsche bank securities in new
leahey senior economist at deutsche bank securities in new york


In [17]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(seq).split()):
    int2token[cnt] = w
    cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

In [20]:
#Creating 2 dictionary that maps token

print(token2int["the"]) # Token-to-Integer

print(int2token[7171])  # Integer-to-Token

3678
manufactured


## Saving the Dictionary as Json File to s3

In [22]:
import json 
dict1 = token2int
dict2 = int2token
s3 = boto3.resource('s3') 
obj1 = s3.Object('text-generation-bucket','inputs/token2int.json')
obj = s3.Object('text-generation-bucket','inputs/int2token.json') 
obj1.put(Body=json.dumps(dict1))
obj.put(Body=json.dumps(dict2))

{'ResponseMetadata': {'RequestId': 'DDG9DSF9D8ME9Y7S',
  'HostId': 'SaJCt5wP+s0M0AZeM5N6MUbd/JtTCYJ2QHted1och9s4Sf/6rYZeAaYpDY9F1CZg+sl/jb7JUXI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'SaJCt5wP+s0M0AZeM5N6MUbd/JtTCYJ2QHted1och9s4Sf/6rYZeAaYpDY9F1CZg+sl/jb7JUXI=',
   'x-amz-request-id': 'DDG9DSF9D8ME9Y7S',
   'date': 'Mon, 26 Aug 2024 12:22:21 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"e91eae5fdc8401792279ce8516ae858b"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"e91eae5fdc8401792279ce8516ae858b"',
 'ServerSideEncryption': 'AES256'}

In [23]:
# set vocabulary size
vocab_size = len(int2token)
vocab_size

18726

In [None]:
# define get_integer_seq:

def get_integer_seq(self, seq):
    return [token2int[w] for w in seq.split()]

In [None]:
# converting text sequences to integer sequences:

x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

In [None]:
np.array(x_int).shape

## Saving the processed Input to S3

In [None]:
upload_dir = 'inputs/'
if not os.path.exists(upload_dir): # Make sure that the folder exists
    os.makedirs(upload_dir)

np.save(os.path.join(upload_dir, 'y_int.npy'), y_int)
np.save(os.path.join(upload_dir, 'x_int.npy'), x_int)

In [None]:
# convert lists to numpy arrays
x_int = torch.tensor(np.array(x_int))
y_int = torch.tensor(np.array(y_int))

In [None]:
x_int[0]

## Defining the Model 

In [None]:
class WordLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 200)

        ## define the LSTM
        self.lstm = nn.LSTM(200, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

# Making the Model Use GPU

In [None]:
net = WordLSTM()

# push the model to GPU (avoid it if you are not using the GPU)
net.cuda()

print(net)

##  Function to Training the Model

In [None]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to GPU
    net.cuda()

    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # convert numpy arrays to PyTorch arrays
            # inputs, targets = torch.tensor(x, dtype=torch.float), torch.tensor(y, dtype=torch.float)
            inputs, targets = x, y
            
            # push tensors to GPU
            inputs, targets = inputs.cuda(), targets.cuda()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter))

In [None]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
      # print(arr_x)
      x = arr_x[prv:n]
      y = arr_y[prv:n]
      prv = n
      yield x, y

## Training the Model

In [None]:
train(net, batch_size = 100, epochs=20, print_every=512)

## Function to Make Prediction

In [None]:
# predict next token
def predict(net, tkn, h=None):
         
  # tensor inputs
  x = np.array([[token2int[tkn]]])
  inputs = torch.from_numpy(x)
  
  # push to GPU
  inputs = inputs.cuda()

  # detach hidden state from history
  h = tuple([each.data for each in h])

  # get the output of the model
  print(inputs, h)
  out, h = net(inputs)

  # get the token probabilities
  p = F.softmax(out, dim=1).data

  p = p.cpu()

  p = p.numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the three indices
  sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

  # return the encoded value of the predicted char and the hidden state
  return int2token[sampled_token_index], h


# function to generate text
def sample(net, size, prime='it is'):
        
    # push to GPU
    # net.cuda()
    
    net.eval()

    # batch size is 1
    h = net.init_hidden(1)
    
    toks = prime.split()

    # predict next token
    for t in prime.split():
      token, h = predict(net, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

## Making the Model Predict New Data

In [None]:
sample(net, 5, prime = "amazing product")