## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np

## Data Preparation 

In [2]:
sp = pd.read_csv("All-seasons.csv")
sp.replace('\n',' ',regex = True,inplace=True)
sp.to_csv('sp.csv',index= False)
sp.head()
cartman = sp[sp.Character=="Cartman"]
stan = sp[sp.Character == "Stan"]

In [3]:
cartman.head()

Unnamed: 0,Season,Episode,Character,Line
9,10,1,Cartman,I'm gonna miss him. I'm gonna miss Chef and I...
20,10,1,Cartman,"Reverse to you, Jew. \r"
25,10,1,Cartman,All right! \r
49,10,1,Cartman,"Uh, guys? Did Chef seem a little, uh, trippy t..."
55,10,1,Cartman,"Oh boy oh boy, I can't wait to have Chef's lun..."


In [4]:
stan.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \r"
2,10,1,Stan,Forever.\r
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."
10,10,1,Stan,"Dude, how are we gonna go on? Chef was our fuh..."
21,10,1,Stan,I'll get it. \r


In [5]:
sp_text = ""

chars = []
i = 0
for char in open('character.txt'):
    chars.append(char.strip() + ": ")
    if i>=300:
        break
    i = i+1
lines = [] 

i = 0
for line in open('line.txt'):
        lines.append(line)
        if i>=300:
            break
        i = i+1
        
for i in range(len(lines)):
    total = chars[i] + lines[i]
    sp_text += total
    
print(sp_text)

ï»¿Stan: You guys, you guys! Chef is going away.  
Kyle: Going away? For how long? 
Stan: Forever. 
Chef: I'm sorry boys. 
Stan: Chef said he's been bored, so he joining a group called the Super Adventure Club.  
Chef: Wow! 
Mrs. Garrison: Chef?? What kind of questions do you think adventuring around the world is gonna answer?! 
Chef: What's the meaning of life? Why are we here? 
Mrs. Garrison: I hope you're making the right choice. 
Cartman: I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him!  
Stan: Dude, how are we gonna go on? Chef was our fuh...f-ffriend.  
Mayor McDaniels: And we will all miss you, Chef,  but we know you must do what your heart tells you.. 
Jimbo: Bye-bye! 
Gerald: Good-bye! 
Mr. Mackey: So long! 
A Man: So long, Chef! 
A Sign-Holder: Good-bye, Chef! 
Randy: Good-bye, Chef! Have a great time with the Super Adventure Club! 
Chef: Good-bye! .. 
Kyle: Draw two card, fatass. 
Cartman: Reverse to you, Jew.  
Stan: I'll get it.  
Chef: He

In [6]:
sp_text[:100]

'ï»¿Stan: You guys, you guys! Chef is going away.  \nKyle: Going away? For how long? \nStan: Forever. \n'

In [7]:
vocab = list(set(sp_text))
vocab_stoi = {s: i for i, s in enumerate(vocab)}
vocab_itos = {i: s for i, s in enumerate(vocab)}

In [8]:
#num different characters
len(vocab)

69

In [12]:
#random chunk lines

import random
random.seed(7)

sp_len = len(sp_text)

def random_chunk(chunk_len = 300):
    
    start_index = random.randint(0, sp_len-chunk_len)
    #print(start_index)
    end_index = start_index + chunk_len + 1
    
    return sp_text[start_index:end_index]

print(random_chunk())
print(random_chunk())

Mr. Connolly: Tally ho, lads! I must say you're starting to become quite a thorn in my balls. 
Stan: Where's Chef?! What have you done with him?! 
Mr. Connolly: He's safe.  He's fasting in the Deprivation Room and being read the Super Adventure Club manual. We've got to undo the damage you've done. 

CHEF! You need to get out of here before you get arrested, all right?! 
Chef: I specializes in your asshole, Kyle.  
Cartman: ...Man, I can't believe all this time, Chef just wanted us for sex. 
Kyle: He didn't want us for sex, fatass! Something is making him say those things. 
Kenny: (Like what?) 
K


In [13]:
#change text to tensor

def text_to_tensor(text, vocab = vocab):
    
    indices = [vocab_stoi[ch] for ch in text]
    return torch.tensor(indices)

print(text_to_tensor(random_chunk()))
print(text_to_tensor(random_chunk()))

tensor([68,  9, 12, 66, 48, 68, 35, 12, 55, 44, 66,  9, 44, 14, 55,  0, 12,  3,
        14, 12, 10, 54, 54, 44, 10,  9, 55, 12, 51, 26, 68, 12, 40, 40, 40, 12,
        48, 44, 10, 64, 44, 12, 68, 55, 12,  2, 26, 12, 66, 33, 26,  3, 66, 44,
        40, 12, 31, 37, 49, 12, 10, 67,  9, 10,  3, 38, 12, 13, 44, 37,  9, 44,
        12, 39, 26,  3,  2, 39, 12, 14, 26, 12, 33, 10, 64, 44, 12, 14, 26, 40,
        40, 40, 12, 10, 55, 29, 12, 51, 26, 68, 12, 14, 26, 12, 48, 44, 10, 64,
        44, 40, 12, 12, 22, 65, 14, 10,  2, 34, 12, 52, 44, 37,  9, 44, 12,  2,
        26, 14, 12, 48, 44, 10, 64,  3,  2, 39, 12, 13,  3, 14, 33, 26, 68, 14,
        12, 60, 33, 44, 67, 40, 12, 22, 61,  9, 40, 12, 60, 26,  2,  2, 26, 48,
        48, 51, 34, 12, 31, 67, 12, 51, 26, 68, 12, 66, 33, 26, 26, 55, 44, 12,
         2, 26, 14, 12, 14, 26, 12, 48, 44, 10, 64, 44,  0, 12, 14, 33, 44,  2,
        12, 31, 37, 49, 12, 10, 67,  9, 10,  3, 38, 12, 13, 44, 37,  9, 44, 12,
        45, 68, 55, 14, 12, 39, 26,  3, 

In [14]:
#create random training set
#input is first chunk_len - 1 chars
#target is last chunk-len - 1 chars

def random_training_set(chunk_len = 50):
    
    chunk = random_chunk(chunk_len)
    
    #print('chunk:', chunk)
    #print('inp:', chunk[:-1])
    #print('target:', chunk[1:])
    
    inp = text_to_tensor(chunk[:-1]) #omit first token
    target = text_to_tensor(chunk[1:]) #omit last token
    return inp, target

random_training_set(10)

(tensor([48, 48, 12, 55, 44, 44, 12, 51, 26, 68]),
 tensor([48, 12, 55, 44, 44, 12, 51, 26, 68, 12]))

## LSTM MODEL

In [15]:
class SouthParkLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(SouthParkLSTM, self).__init__()
        
        #RNN ATTRIBUTES
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        #create one-hot vectors (number of characters in vocab)
        self.ident = torch.eye(vocab_size)
        
        #RNN
        self.rnn = nn.LSTM(vocab_size, hidden_size, n_layers, batch_first = True)
        
        #layer to decode LSTM output into distribution over the vocab
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inp):
        
        #reshape input tensor to [1, seq_len]
        inp = inp.view(1, -1)
        
        #generate 1-hot vecs from token indices
        inp = self.ident[inp]
        
        #set initial hidden state and cell state
        h0 = torch.zeros(1, inp.size(0), self.hidden_size)
        c0 = torch.zeros(1, inp.size(0), self.hidden_size)
        
        #obtain next output and hidden state
        out, _ = self.rnn(inp, (h0, c0))
        
        #print(out, len(out))
        #print('\n')
        
        #run decoder
        output = self.decoder(out.squeeze(0))
        
        #print(output)
        
        return output
    
    def init_hidden(self):
        return torch.zeros(self.n_layers, 1, self.hidden_size)

In [16]:
model = SouthParkLSTM(len(vocab), 128)

In [17]:
def evaluate(model, prime_str='win', predict_len=100, temperature=0.8):
    hidden = model.init_hidden()
    
    #print('HIDDEN: ', hidden)
    
    prime_input = text_to_tensor(prime_str)
    
    #print(prime_str, ',', prime_input)
    
    predicted = prime_str
    
    #print('PREDICTED: ', predicted)
    
    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        
        #print('PRIMING STR LOOP: ', p , prime_input[p])
        
        out = model(prime_input[p])
        
        #print(out, len(out))
            
    inp = prime_input[-1]
    
    #print('INP_EVAL: ', inp)
    
    
    for p in range(predict_len):
        output = model(inp)
        
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = int(torch.multinomial(output_dist, 1)[0])
        # Add predicted character to string and use as next input
        predicted_char = vocab_itos[top_i]
        predicted += predicted_char
        inp = text_to_tensor(predicted_char)

    return predicted

print(evaluate(model, predict_len=20))

winyYS?WfTF:dKnvgqd»ONq


In [19]:
def train(model, num_iters=10000, lr=0.004):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for it in range(num_iters):
        # get training set
        inp, target = random_training_set()
        
        #print('INP, TAR: ', inp, target)
        #print('INP, TAR LEN: ', len(inp), len(target))
        #print('\n')
        
        
        # cleanup
        optimizer.zero_grad()
        
        # forward pass
        hidden = model.init_hidden()
        output = model(inp)
        
        output2 = output.squeeze(0)
        
        #print('OUT, TAR: ', output2, target)
        #print('OUT, TAR LEN: ', output2.size(), target.size())
        #print('\n')
        
        
        loss = criterion(output2, target)
        # backward pass
        loss.backward()
        optimizer.step()
        
        #print(evaluate(model, '', 50))
        
        if it % 200 == 199:
            print("[Iter %d] Loss %f" % (it+1, float(loss)))
            print("    " + evaluate(model, ' ', 50))

train(model)

KeyboardInterrupt: 