# Pre-processing

In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

In [11]:
# Load the sonnet corpus and build the symbol vocabulary used for one-hot encoding.
filepath = "poem_data/shakespeare.txt"
syllables_path = "poem_data/Syllable_dictionary.txt"

# Read through a context manager so the file handle is closed as soon as the
# text is in memory (the bare open(...).read() left it to the garbage collector).
with open(filepath, "r", encoding="utf-8") as corpus_file:
    f = corpus_file.read()

unique_chars = set(f.lower())

# Keep every lower-cased symbol except the ASCII digits.
# The list is sorted on purpose: iteration order of a set of strings changes
# between Python processes (string hashing is randomized), so an unsorted
# vocabulary would map symbols to different one-hot indices after a kernel
# restart and silently invalidate any previously saved checkpoint.
dictionary = sorted(ch for ch in unique_chars if ch not in "0123456789")

In [12]:
# Parse the syllable dictionary into one row per word.
#
# Each usable line is "<word> <count> [<count>]" (lower-cased before parsing):
#   * two tokens                   -> tone1 = second token
#   * second token starts with "e" -> endtone = the character after the "e",
#                                     tone1 = third token
#   * otherwise                    -> tone1 = second token, tone2 = third token
# Tokens beyond the third are ignored.
# NOTE(review): "endtone" presumably is the syllable count used when the word
# ends a line -- confirm against the data set's documentation.
syllab_columns = ['word', 'endtone', 'tone1', 'tone2']
syllab_records = []

with open(syllables_path) as syllable_file:
    for raw_line in syllable_file:
        entry = raw_line.strip().lower()
        # Skip blank / one-character lines.
        if len(entry) <= 1:
            continue
        # split() (rather than split(' ')) tolerates repeated spaces and tabs.
        line = entry.split()
        if len(line) < 2:
            raise ValueError("malformed syllable entry: {!r}".format(entry))
        if len(line) == 2:
            syllab_records.append([line[0], '', line[1], ''])
        elif line[1][0] == 'e':
            syllab_records.append([line[0], line[1][1], line[2], ''])
        else:
            syllab_records.append([line[0], '', line[1], line[2]])

# Build the frame once from the collected records; assigning rows one at a time
# with .loc re-allocates the frame on every insert.
syllab_df = pd.DataFrame(syllab_records, columns=syllab_columns)

In [13]:
# helper method for encoding/decoding the strings to one-hot
def encoding(char):
    """Return the one-hot encoding of ``char`` as a list of ints.

    The list has one slot per entry of the module-level ``dictionary``; the
    slot at the symbol's position is 1 and every other slot is 0.

    Raises:
        ValueError: if ``char`` is not present in ``dictionary``.
    """
    position = dictionary.index(char)
    one_hot = [0] * len(dictionary)
    one_hot[position] = 1
    return one_hot

In [8]:
# Display the parsed syllable table as the cell's output.
syllab_df

Unnamed: 0,word,endtone,tone1,tone2
0,'gainst,,1,
1,'greeing,1,2,
2,'scaped,,1,
3,'tis,,1,
4,'twixt,,1,
...,...,...,...,...
3200,yours,,1,
3201,youth,,1,
3202,youth's,,1,
3203,youthful,,2,


In [14]:
# helper method for data cleaning
# Read the corpus bytes, decode them as UTF-8, lower-case the text and split it
# wherever two blank lines occur. The context manager closes the file handle,
# which the bare open(...).read() call left open.
with open(filepath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8').lower().split('\n\n\n')
def remove_number(string):
    """Drop everything up to and including the first newline of ``string``.

    A string without any newline is returned unchanged.
    """
    _first_line, newline, remainder = string.partition('\n')
    if not newline:
        return string
    return remainder

Initially, I tried to encode the syllables. I loaded the files into a dataframe, but did not come up with a proper way to encode it and feed into the RNN model. 

Then, I chose to collect all the symbols used in the training text and save them as my dictionary. The dictionary consists of the 26 letters plus other symbols such as '\n', and has length 38. After thinking carefully, I chose a one-hot encoding: each symbol is transformed into a tensor of dimension (38) that is all zeros except for a 1 at the index where the symbol is located in the dictionary. I split the whole text into sequences of 40 consecutive symbols. Each sequence of 40 symbols is fed into the RNN model, and the symbol that immediately follows the sequence is used as the target. Essentially, I was trying to use the previous sequence (40 symbols) to predict the next symbol. This gave me the corresponding training inputs and targets. 

I also tried to encode the target in the one-hot format, but when I trained with the cross entropy loss, I got an error saying that the cross entropy loss should only be applied to scalar-valued targets rather than one-hot encoded ones. Then, I tried to feed the index of the target symbol, which is a scalar value, into the RNN, but the model did not perform well. Then, I recalled that cross entropy loss = log_softmax + nll_loss, so I added a log_softmax layer to the predicted output inside my forward function and used nll_loss as my criterion. This turned out to give reasonably good performance. 

## Customize Shakespeare Dataset

In [15]:
# customize dataset
class ShakespeareDataSet(Dataset):
    """Sliding-window character dataset built from a text corpus.

    The file is lower-cased, split on two consecutive blank lines, and the
    first line of every chunk is removed with ``remove_number``. Every window
    of ``seq_length`` consecutive symbols in a chunk becomes one input sample,
    and the symbol right after the window becomes its target. Both are one-hot
    encoded with ``encoding``.

    Attributes:
        seq_length: number of symbols per input window.
        inp: float tensor holding the one-hot input windows.
        out: float tensor holding the one-hot target symbols.
    """

    def __init__(self, filepath, seq_length=40):
        """Read ``filepath`` and materialise all (window, next-symbol) pairs.

        Args:
            filepath: path of the UTF-8 encoded corpus.
            seq_length: window length in symbols (defaults to the 40 used
                throughout the notebook).
        """
        # Context manager so the file handle is closed once the text is read.
        with open(filepath, 'rb') as corpus_file:
            text = corpus_file.read().decode(encoding='utf-8').lower().split('\n\n\n')
        chapters = list(map(remove_number, text))

        self.seq_length = seq_length
        windows = []
        targets = []

        for chapter in chapters:
            n_windows = len(chapter) - seq_length
            if n_windows <= 0:
                # Too short to yield a window plus a target symbol.
                continue
            # Encode each symbol once and slice the result, instead of
            # re-encoding the same symbol for every window that contains it.
            encoded = [encoding(symbol) for symbol in chapter]
            for start in range(n_windows):
                windows.append(encoded[start:start + seq_length])
                targets.append(encoded[start + seq_length])

        self.inp = torch.FloatTensor(windows)
        self.out = torch.FloatTensor(targets)

    def __getitem__(self, idx):
        """Return the ``(input_window, target)`` pair stored at ``idx``."""
        return self.inp[idx], self.out[idx]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.inp)

In [16]:
# Build the windowed dataset once, then serve it in shuffled mini-batches of 64.
shakespearedataset = ShakespeareDataSet(filepath=filepath)
train_dataloader = DataLoader(
    dataset=shakespearedataset,
    batch_size=64,
    shuffle=True,
)

# Problem 2

## LSTM Model 1

In [525]:
#LSTM neural networks
EMBEDDING_SIZE = 64
INP_DIMENSION = 38


class ShakespeareLSTM(nn.Module):
    """Character-level model built around a single ``nn.LSTMCell``.

    Each time step of the one-hot input is projected by a linear layer and a
    ReLU, then fed to the LSTM cell. After the last step the final hidden
    state is projected back to ``INP_DIMENSION`` scores and passed through
    log-softmax, so the output pairs with ``nn.NLLLoss``.

    ``n_layers``, ``drop_prob`` and ``lr`` are stored on the instance, and a
    dropout module is constructed, but none of them is used by ``forward``.
    """

    def __init__(self, n_hidden=128, n_layers=2,
                 drop_prob=0.2, lr=0.001, embedding_size=EMBEDDING_SIZE):
        super(ShakespeareLSTM, self).__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        # input projection ("embedding") layer
        self.linear = nn.Linear(INP_DIMENSION, embedding_size)
        # lstm layer
        self.lstm = nn.LSTMCell(embedding_size, n_hidden)
        # dropout layer (constructed but not applied in forward)
        self.dropout = nn.Dropout(drop_prob)
        # output layer
        self.fc = nn.Linear(n_hidden, INP_DIMENSION)

    def forward(self, inp):
        """Return log-probabilities of the next symbol.

        Args:
            inp: tensor indexed as ``inp[:, step, :]``; the last axis must
                have ``INP_DIMENSION`` entries.

        Returns:
            Tensor of shape ``(batch, INP_DIMENSION)`` with log-softmax scores.
        """
        batch_size = inp.shape[0]
        hidden = self.init_hidden(batch_size)
        for step in range(inp.shape[1]):
            embedded = F.relu(self.linear(inp[:, step, :]))
            hidden = self.lstm(embedded, hidden)
        # Only the last step's prediction is returned, so the output layer is
        # applied once after the loop rather than at every time step.
        return F.log_softmax(self.fc(hidden[0]), dim=1)

    def init_hidden(self, batch_size):
        """Create the initial ``(h_0, c_0)`` pair for a batch.

        The tensors are created on the same device and with the same dtype as
        the model's weights, so no externally defined ``device`` variable is
        required.

        NOTE(review): ``xavier_normal_`` overwrites the zeros with random
        values, so the starting state is random on every forward call, also in
        eval mode. Kept as-is to preserve the behavior the model was trained
        with; confirm this is intended.
        """
        reference = self.fc.weight
        h_0 = torch.zeros(batch_size, self.n_hidden,
                          device=reference.device, dtype=reference.dtype)
        c_0 = torch.zeros(batch_size, self.n_hidden,
                          device=reference.device, dtype=reference.dtype)
        nn.init.xavier_normal_(h_0)
        nn.init.xavier_normal_(c_0)
        return (h_0, c_0)

In [526]:
def train(net, data, epochs=5, lr=0.001, clip=5, val_frac=0.1, print_every=400):
    """Train ``net`` with Adam and negative log-likelihood loss.

    Args:
        net: module whose ``net(inp)`` returns log-probabilities per class.
        data: sized iterable of ``(inp, target)`` batches; ``target`` is
            one-hot and is converted to class indices for the loss.
        epochs: number of passes over ``data``.
        lr: Adam learning rate.
        clip: maximum gradient norm applied before each optimizer step.
        val_frac: accepted for compatibility; not used.
        print_every: print the loss every this many batches.
    """
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    # Report progress against the iterable actually being trained on, not
    # against a notebook-global dataloader.
    n_batches = len(data)

    for e in range(epochs):
        for i_batch, (inp, target) in enumerate(data):
            net.zero_grad()
            output = net(inp)
            # NLLLoss expects class indices, so recover them from the one-hot rows.
            loss = criterion(output, target.argmax(dim=1))
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            if i_batch % print_every == 0:
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}/{}...".format(i_batch, n_batches),
                      "Loss: {:.4f}...".format(loss.item()),
                     )

In [527]:
# Instantiate the first model with explicit hyper-parameters and show its layers.
n_hidden = 128
n_layers = 2

shakenet = ShakespeareLSTM(n_hidden=n_hidden, n_layers=n_layers)
print(shakenet)

ShakespeareLSTM(
  (linear): Linear(in_features=38, out_features=64, bias=True)
  (lstm): LSTMCell(64, 128)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=128, out_features=38, bias=True)
)


In [529]:
n_epochs = 2

# Run the first training epochs, logging the loss every 200 batches.
train(
    shakenet,
    train_dataloader,
    epochs=n_epochs,
    lr=0.001,
    print_every=200,
)

Epoch: 1/2... Step: 0/1375... Loss: 1.4680...
Epoch: 1/2... Step: 200/1375... Loss: 1.2205...
Epoch: 1/2... Step: 400/1375... Loss: 1.2534...
Epoch: 1/2... Step: 600/1375... Loss: 1.5665...
Epoch: 1/2... Step: 800/1375... Loss: 1.3046...
Epoch: 1/2... Step: 1000/1375... Loss: 1.3801...
Epoch: 1/2... Step: 1200/1375... Loss: 1.6903...
Epoch: 2/2... Step: 0/1375... Loss: 1.1575...
Epoch: 2/2... Step: 200/1375... Loss: 1.5174...
Epoch: 2/2... Step: 400/1375... Loss: 1.6945...
Epoch: 2/2... Step: 600/1375... Loss: 1.4279...
Epoch: 2/2... Step: 800/1375... Loss: 1.2962...
Epoch: 2/2... Step: 1000/1375... Loss: 1.3395...
Epoch: 2/2... Step: 1200/1375... Loss: 1.1117...


In [555]:
# One more epoch per execution of this cell; by the author's count this run
# brought the running total to 33 epochs.
train(
    shakenet,
    train_dataloader,
    epochs=1,
    lr=0.001,
    print_every=200,
)

Epoch: 1/1... Step: 0/1375... Loss: 1.0283...
Epoch: 1/1... Step: 200/1375... Loss: 1.2158...
Epoch: 1/1... Step: 400/1375... Loss: 1.5977...
Epoch: 1/1... Step: 600/1375... Loss: 1.2830...
Epoch: 1/1... Step: 800/1375... Loss: 1.1092...
Epoch: 1/1... Step: 1000/1375... Loss: 0.8747...
Epoch: 1/1... Step: 1200/1375... Loss: 1.0663...


In [572]:
# One more epoch per execution of this cell; by the author's count this run
# brought the running total to 35 epochs.
train(
    shakenet,
    train_dataloader,
    epochs=1,
    lr=0.001,
    print_every=200,
)

Epoch: 1/1... Step: 0/1375... Loss: 1.0709...
Epoch: 1/1... Step: 200/1375... Loss: 1.0228...
Epoch: 1/1... Step: 400/1375... Loss: 1.6351...
Epoch: 1/1... Step: 600/1375... Loss: 1.3580...
Epoch: 1/1... Step: 800/1375... Loss: 1.0721...
Epoch: 1/1... Step: 1000/1375... Loss: 1.1174...
Epoch: 1/1... Step: 1200/1375... Loss: 1.1743...


## after training for 35 epochs

In [475]:
def freestyle(net, string, temperature, poem_length=13 * 40):
    """Generate text symbol by symbol with temperature sampling.

    Starting from ``string``, the model predicts the next symbol, one symbol
    is sampled from the temperature-scaled distribution, and the context
    window slides forward by one symbol.

    Args:
        net: model whose ``net(inp)`` returns per-symbol scores of shape
            ``(1, len(dictionary))``.
        string: seed text; every symbol must be in ``dictionary``.
        temperature: positive divisor applied to the scores before softmax;
            smaller values make sampling more deterministic.
        poem_length: number of symbols to generate.

    Returns:
        The seed text followed by the generated symbols.
    """
    net.eval()
    seed_text = string
    generated = []
    # Generation needs no gradients; skipping the autograd graph saves time
    # and memory.
    with torch.no_grad():
        for _ in range(poem_length):
            inp = torch.FloatTensor([list(map(encoding, string))])
            # Use the model passed in, not a notebook-global network.
            out = net(inp)
            # Apply temperature
            probabilities = F.softmax(out / temperature, dim=1).cpu().numpy()
            idx = np.random.choice(out.size(1), p=probabilities[0])
            predicted = dictionary[idx]
            # Slide the window: drop the oldest symbol, append the new one.
            string = string[1:] + predicted
            generated.append(predicted)
    return seed_text + ''.join(generated)

In [550]:
# Persist the hyper-parameters needed to rebuild the network together with
# its learned weights.
model_name = 'shakenet.net'

checkpoint = {
    'n_hidden': shakenet.n_hidden,
    'n_layers': shakenet.n_layers,
    'state_dict': shakenet.state_dict(),
}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

I implemented a Char-LSTM model together with linear layers. First I embedded the input tensor into a larger tensor, passed it to an LSTM model, and then applied the log_softmax function to the final output. I tuned n_hidden (the number of hidden units in an LSTM cell) and the embedding_size of the embedding layer. After training for 35 epochs, the loss of my model converges to around 1.1. The model successfully learns the wording of Shakespearean poems (using words like thou, art, hath, thee, etc.), most standard English words, poem structure, sentence structure, and some English grammar. However, it does not learn the sonnet structure. With regard to training time, it took approximately 30 minutes to train for 30 epochs. 

In [573]:
# Sample a poem at temperature 1.5.
poem_start = "shall i compare thee to a summer's day?\n"
print(freestyle(shakenet, poem_start, temperature=1.5))

shall i compare thee to a summer's day?
in oftain afformiring silfulloke
shablow upituves the tail hold will to thee,
when ftering sending now is fwailttend.
'win i'ld their read or shore tiffured plide,
  heis tatten teach acant outcost anone fast.
so trem thy vilents of like be fornare;
childot uppat on hadoure my flame gok:
lively he alone hope blind,
of hid, ere that leapes this is, bitrence dottly.
  onlying: leppable th' outwant smel.
werticold my sweet sowers to stilage huse freemen!
more and outwogh to rememfors gateoun.
wo diliel a waudeusy eye,


In [578]:
# Sample a poem at temperature 0.75.
print(freestyle(shakenet, poem_start, temperature=0.75))

shall i compare thee to a summer's day?
whose thou which still were holying inwart's pride hand,
and all alone alone outlive her false,
and the mounten more ill revosten fitted,
for sweet thine eye a view, the world love'
to time and the parton complexioned flobe,
whilst my selfouty shall you thine eyele,
steal still my day so sleep to the sweet,
but all the world and the rich from kning.
  but that thou my love in his self a look,
what come to make the life to live to hope.
  yet in thy furpoil on the summer ingst.
  but yet not i dring the treamperouse


In [577]:
# Sample a poem at temperature 0.25.
print(freestyle(shakenet, poem_start, temperature=0.25))

shall i compare thee to a summer's day?
  then from the world's looks to the world doth stand
o that still my self the bearth the world be.
  then thou thy self the world with the rank,
and therefore to be world with the world to decay,
and that i see the mounten the most doth the store,
  then i am sometime where the world and there,
the summer's present speak of the world heart,
which i am the praise the world with from thee,
  then thou art a self the world with thee.
  for the world and there in the world with thee.
  you with the world and soul whil


By comparing poems generated with different temperatures, I observe that the smaller the temperature, the more accurate (more English-like, more Shakespeare-like) the generated poems are. For example, the poems generated with temperature=0.25 make more sense than those generated with 0.75 and 1.5.

# Problem 3

In problem 2 I used a stateless LSTM model built on nn.LSTMCell. Here I try to improve my model by using a stateful LSTM model built on nn.LSTM.

## LSTM Model 2

In [21]:
#LSTM neural networks
class ShakespeareLSTM2(nn.Module):
    """Character-level model built on a multi-layer ``nn.LSTM``.

    The LSTM outputs of all ``seq_length`` time steps go through dropout, are
    flattened into one vector per sample, and are mapped by a linear layer to
    ``input_size`` scores, followed by log-softmax (to pair with
    ``nn.NLLLoss``). Because the linear layer's input width is
    ``n_hidden * seq_length``, inputs must have exactly ``seq_length`` steps.

    ``lr`` and ``drop_prob`` are stored on the instance; dropout is applied to
    the LSTM outputs only (the LSTM itself is created without dropout).
    """

    def __init__(self, n_hidden=128, n_layers=2,
                 drop_prob=0.2, lr=0.001, input_size=38, seq_length=40):
        super(ShakespeareLSTM2, self).__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        # Vocabulary size and window length were hard-coded as 38 and 40; they
        # are parameters now, with the old values as defaults.
        self.input_size = input_size
        self.seq_length = seq_length
        # lstm layer
        self.lstm = nn.LSTM(input_size, n_hidden, n_layers,
                            batch_first=True)
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        # output layer
        self.fc = nn.Linear(n_hidden * self.seq_length, input_size)

    def forward(self, inp, hidden):
        """Return next-symbol log-probabilities and the updated LSTM state.

        Args:
            inp: batch-first tensor of shape ``(batch, seq_length, input_size)``.
            hidden: ``(h, c)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(out, hidden)`` where ``out`` has shape ``(batch, input_size)``.

        Raises:
            ValueError: if ``inp`` does not have ``seq_length`` time steps.
        """
        if inp.shape[1] != self.seq_length:
            # Fail with a readable message instead of a matmul shape error
            # inside the output layer.
            raise ValueError(
                "expected sequences of length {}, got {}".format(
                    self.seq_length, inp.shape[1]))
        r_output, hidden = self.lstm(inp, hidden)
        out = self.dropout(r_output)
        # Flatten the outputs of all time steps into one vector per sample.
        out = out.contiguous().view(out.shape[0], self.n_hidden * out.shape[1])
        # Put the flattened vector through the fully-connected layer.
        out = F.log_softmax(self.fc(out), dim=1)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h, c)`` state matching the weights' dtype and device."""
        weight = next(self.parameters())
        hidden = (weight.new_zeros(self.n_layers, batch_size, self.n_hidden),
                  weight.new_zeros(self.n_layers, batch_size, self.n_hidden))
        return hidden

In [6]:
def train2(net, data, epochs=5, batch_size=64, lr=0.001, seq_length=40, clip=5, val_frac=0.1, print_every=400):
    """Train a model that takes and returns an explicit hidden state.

    Args:
        net: module providing ``init_hidden(batch_size)`` and
            ``net(inp, hidden) -> (log_probs, hidden)``.
        data: sized iterable of ``(inp, target)`` batches; ``target`` is
            one-hot and is converted to class indices for the loss.
        epochs: number of passes over ``data``.
        batch_size: batch size used for the initial hidden state of an epoch.
        lr: Adam learning rate.
        seq_length: accepted for compatibility; not used.
        clip: maximum gradient norm applied before each optimizer step.
        val_frac: accepted for compatibility; not used.
        print_every: print the loss every this many batches.

    NOTE(review): the hidden state is carried from one batch to the next. If
    ``data`` yields shuffled batches, consecutive batches are unrelated text,
    so the carried state holds no meaningful context -- confirm this is
    intended.
    """
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    # Report progress against the iterable actually being trained on, not
    # against a notebook-global dataloader.
    n_batches = len(data)

    for e in range(epochs):
        # initialize hidden state
        prev_batch_size = 0
        h = net.init_hidden(batch_size)
        for i_batch, (inp, target) in enumerate(data):
            # Handle a changed batch size (e.g. the smaller last batch of an
            # epoch) by starting again from a fresh state of the right shape.
            if prev_batch_size != len(inp):
                h = net.init_hidden(len(inp))
                prev_batch_size = len(inp)

            # Detach so gradients do not flow back into earlier batches.
            h = tuple(each.detach() for each in h)
            net.zero_grad()
            output, h = net(inp, h)
            # NLLLoss expects class indices, so recover them from the one-hot rows.
            loss = criterion(output, target.argmax(dim=1))
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            if i_batch % print_every == 0:
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}/{}...".format(i_batch, n_batches),
                      "Loss: {:.4f}...".format(loss.item()),
                     )

In [31]:
def freestyle2(net, string, temperature, poem_length=13 * 40):
    """Generate text symbol by symbol with a model that takes a hidden state.

    For every generated symbol a fresh state from ``net.init_hidden(1)`` is
    used, the model scores the current window, one symbol is sampled from the
    temperature-scaled distribution, and the window slides forward by one.

    Args:
        net: model providing ``init_hidden(batch_size)`` and
            ``net(inp, hidden) -> (scores, hidden)`` with ``scores`` of shape
            ``(1, len(dictionary))``.
        string: seed text; every symbol must be in ``dictionary``.
        temperature: positive divisor applied to the scores before softmax;
            smaller values make sampling more deterministic.
        poem_length: number of symbols to generate.

    Returns:
        The seed text followed by the generated symbols.
    """
    net.eval()
    seed_text = string
    generated = []
    # Generation needs no gradients; skipping the autograd graph saves time
    # and memory.
    with torch.no_grad():
        for _ in range(poem_length):
            inp = torch.FloatTensor([list(map(encoding, string))])
            # Every prediction starts from a fresh state.
            h = net.init_hidden(1)
            out, h = net(inp, h)
            # Apply temperature
            probabilities = F.softmax(out / temperature, dim=1).cpu().numpy()
            idx = np.random.choice(out.size(1), p=probabilities[0])
            predicted = dictionary[idx]
            # Slide the window: drop the oldest symbol, append the new one.
            string = string[1:] + predicted
            generated.append(predicted)
    return seed_text + ''.join(generated)

In [22]:
# Instantiate the second model with the same hidden size and depth as before.
n_hidden = 128
n_layers = 2
shakenet2 = ShakespeareLSTM2(n_hidden=n_hidden, n_layers=n_layers)

## after training for 40 epochs

In [27]:
n_epochs = 1

# One epoch per execution of this cell; by the author's count the model had
# been trained for 40 epochs in total when this output was produced.
train2(
    shakenet2,
    train_dataloader,
    epochs=n_epochs,
    lr=0.001,
    print_every=200,
)

Epoch: 1/1... Step: 0/1375... Loss: 0.3315...
Epoch: 1/1... Step: 200/1375... Loss: 0.3223...
Epoch: 1/1... Step: 400/1375... Loss: 0.6391...
Epoch: 1/1... Step: 600/1375... Loss: 0.3848...
Epoch: 1/1... Step: 800/1375... Loss: 0.5677...
Epoch: 1/1... Step: 1000/1375... Loss: 0.4695...
Epoch: 1/1... Step: 1200/1375... Loss: 0.3744...


After being trained for 40 epochs, the loss of the new model converges around 0.3. The training loss of the improved model is significantly better than the model in problem 2. Now I try to make poems with the new model.

In [33]:
# Sample a poem from the second model at temperature 1.5.
poem_start = "shall i compare thee to a summer's day?\n"
print(freestyle2(shakenet2, poem_start, temperature=1.5))

shall i compare thee to a summer's day?
thou art more lovy hials soingess, and warsettenst,
and perspist whibe frcenothers tell.
  but then such creatute time you a wair,
thus fresher in meriem well thou ever which doth days.
but deach fads in plorded mhere at beederily:
do if borthle thal fair so him had in the lick of eie,
and for this thou redair shines me old lends,
nor all trought in ohe can there for trought,
  do sheet this worths were, your self not,
craes wo the learned mimubed threit care beof,
and see pith't creist loow umboth their epter the,


In [42]:
# Sample a poem from the second model at temperature 0.75.
poem_start = "shall i compare thee to a summer's day?\n"
print(freestyle2(shakenet2, poem_start, temperature=0.75))

shall i compare thee to a summer's day?
thou art more lovely and more temperate:
rough winds doot come death mine eye are old,
from false present, thou hame be secterse to. hear spirit,
although they being sprengse thee my recais,
of thy severing old pattech, that thus steme,
the beauty swift diecharned of the fair,
by to is the eityed graces benedy bying,
when sab hom thine eyes they hear in thee frebresting with resember of eye,
when o hemp wouk thy sweet heat gied stone,
uprobed more, that the envermed after lave your merit.
chelk he mistrions no true


In [57]:
# Sample a poem from the second model at temperature 0.25.
poem_start = "shall i compare thee to a summer's day?\n"
print(freestyle2(shakenet2, poem_start, temperature=0.25))

shall i compare thee to a summer's day?
thou art more lovely and more never tknowed when my heart:
when i be awner doth lops an beem without,
that find my near in their curs of thee,
when i besond with steeling to have hat,
to decarined from thee finst a conkers
to that are grest and deserving with thy away,
and art leisured thee dead beseater,
but that i am flew of you ride and in not,
nor that they reep usus with thy will.
  i wit a weating my side, against my self,
  but since what is anind in thy feasings have,
  i please there not the oft their grow


### save the model

In [59]:
# Persist the hyper-parameters needed to rebuild the network together with
# its learned weights.
model_name = 'shakenet2.net'

checkpoint = {
    'n_hidden': shakenet2.n_hidden,
    'n_layers': shakenet2.n_layers,
    'state_dict': shakenet2.state_dict(),
}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

### load the model for prediction

In [60]:

# Rebuild the architecture from the saved hyper-parameters, then restore the
# trained weights into it.
with open('shakenet2.net', 'rb') as f:
    checkpoint = torch.load(f)

model = ShakespeareLSTM2(
    n_hidden=checkpoint['n_hidden'],
    n_layers=checkpoint['n_layers'],
)
model.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [69]:
# Sample from the reloaded model to check that the checkpoint round-trips.
print(freestyle2(model, poem_start, temperature=0.25))

shall i compare thee to a summer's day?
thou art more lovely and more temperate:
rough winds do shake the darrion buds a perted,
beauty's trom thy sair, can one out from thine:
eetoed to that depart oll are of sweet wrobn,
and own me thought in this alone, and see thoughts,
and praise to me will point did eyes be.
  if thou thy self despite these eyes withered,
made show men minds what the comment of well beseet,
the it to they are will, though thou art some ticely sake
tell of me greef tay will doot with their
and beloved new flook find be dear yourre,



## Conclusion

I tried to improve my model by using the PyTorch-optimized nn.LSTM module, which also has dropout integrated, instead of implementing the LSTM with a for loop over the PyTorch LSTMCell module as in problem 2. I also added a dropout layer with a probability of 0.2. The loss dropped significantly compared to the previous model. After being trained for 40 epochs, the loss of the new model converges to around 0.3. The training loss of the improved model is significantly better than that of the model in problem 2.

In terms of predictions, using a temperature of 0.25, the model successfully predicts the 69 consecutive letters and symbols that follow the given line ("thou art more lovely and more temperate:\n rough winds do shake the dar"). With other temperatures, the new model also produces more English-like and Shakespeare-like output. It performs slightly better than the previous model.