## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np

## Data Preparation 

In [2]:
sp = pd.read_csv("All-seasons.csv")
sp.replace('\n',' ',regex = True,inplace=True)
sp.to_csv('sp.csv',index= False)
sp.head()
cartman = sp[sp.Character=="Cartman"]
stan = sp[sp.Character == "Stan"]

In [3]:
cartman.head()

Unnamed: 0,Season,Episode,Character,Line
9,10,1,Cartman,I'm gonna miss him. I'm gonna miss Chef and I...
20,10,1,Cartman,"Reverse to you, Jew. \r"
25,10,1,Cartman,All right! \r
49,10,1,Cartman,"Uh, guys? Did Chef seem a little, uh, trippy t..."
55,10,1,Cartman,"Oh boy oh boy, I can't wait to have Chef's lun..."


In [4]:
stan.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \r"
2,10,1,Stan,Forever.\r
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."
10,10,1,Stan,"Dude, how are we gonna go on? Chef was our fuh..."
21,10,1,Stan,I'll get it. \r


In [5]:
sp_text = ""

chars = []
i = 0
for char in open('character.txt'):
    chars.append(char.strip() + ": ")
    if i>=300:
        break
    i = i+1
lines = [] 

i = 0
for line in open('line.txt'):
        lines.append(line)
        if i>=300:
            break
        i = i+1
        
for i in range(len(lines)):
    total = chars[i] + lines[i]
    sp_text += total
    
print(sp_text)

ï»¿Stan: You guys, you guys! Chef is going away.  
Kyle: Going away? For how long? 
Stan: Forever. 
Chef: I'm sorry boys. 
Stan: Chef said he's been bored, so he joining a group called the Super Adventure Club.  
Chef: Wow! 
Mrs. Garrison: Chef?? What kind of questions do you think adventuring around the world is gonna answer?! 
Chef: What's the meaning of life? Why are we here? 
Mrs. Garrison: I hope you're making the right choice. 
Cartman: I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him!  
Stan: Dude, how are we gonna go on? Chef was our fuh...f-ffriend.  
Mayor McDaniels: And we will all miss you, Chef,  but we know you must do what your heart tells you.. 
Jimbo: Bye-bye! 
Gerald: Good-bye! 
Mr. Mackey: So long! 
A Man: So long, Chef! 
A Sign-Holder: Good-bye, Chef! 
Randy: Good-bye, Chef! Have a great time with the Super Adventure Club! 
Chef: Good-bye! .. 
Kyle: Draw two card, fatass. 
Cartman: Reverse to you, Jew.  
Stan: I'll get it.  
Chef: He

In [6]:
sp_text[:100]

'ï»¿Stan: You guys, you guys! Chef is going away.  \nKyle: Going away? For how long? \nStan: Forever. \n'

In [7]:
vocab = list(set(sp_text))
vocab_stoi = {s: i for i, s in enumerate(vocab)}
vocab_itos = {i: s for i, s in enumerate(vocab)}

In [8]:
#num different characters
len(vocab)

69

In [9]:
#random chunk lines

import random
random.seed(7)

sp_len = len(sp_text)

def random_chunk(chunk_len = 300):
    
    start_index = random.randint(0, sp_len-chunk_len)
    #print(start_index)
    end_index = start_index + chunk_len + 1
    
    return sp_text[start_index:end_index]

print(random_chunk())
print(random_chunk())

Mr. Connolly: Tally ho, lads! I must say you're starting to become quite a thorn in my balls. 
Stan: Where's Chef?! What have you done with him?! 
Mr. Connolly: He's safe.  He's fasting in the Deprivation Room and being read the Super Adventure Club manual. We've got to undo the damage you've done. 

CHEF! You need to get out of here before you get arrested, all right?! 
Chef: I specializes in your asshole, Kyle.  
Cartman: ...Man, I can't believe all this time, Chef just wanted us for sex. 
Kyle: He didn't want us for sex, fatass! Something is making him say those things. 
Kenny: (Like what?) 
K


In [10]:
#change text to tensor

def text_to_tensor(text, vocab = vocab):
    
    indices = [vocab_stoi[ch] for ch in text]
    return torch.tensor(indices)

print(text_to_tensor(random_chunk()))
print(text_to_tensor(random_chunk()))

tensor([23, 34, 62, 36, 54, 23, 66, 62, 17, 48, 36, 34, 48, 24, 17, 59, 62, 63,
        24, 62, 68, 47, 47, 48, 68, 34, 17, 62, 28, 51, 23, 62, 14, 14, 14, 62,
        54, 48, 68, 30, 48, 62, 23, 17, 62, 60, 51, 62, 36, 46, 51, 63, 36, 48,
        14, 62, 67, 38,  1, 62, 68, 35, 34, 68, 63, 44, 62, 10, 48, 38, 34, 48,
        62, 29, 51, 63, 60, 29, 62, 24, 51, 62, 46, 68, 30, 48, 62, 24, 51, 14,
        14, 14, 62, 68, 17, 65, 62, 28, 51, 23, 62, 24, 51, 62, 54, 48, 68, 30,
        48, 14, 62, 62,  0, 58, 24, 68, 60, 61, 62,  8, 48, 38, 34, 48, 62, 60,
        51, 24, 62, 54, 48, 68, 30, 63, 60, 29, 62, 10, 63, 24, 46, 51, 23, 24,
        62, 55, 46, 48, 35, 14, 62,  0, 31, 34, 14, 62, 55, 51, 60, 60, 51, 54,
        54, 28, 61, 62, 67, 35, 62, 28, 51, 23, 62, 36, 46, 51, 51, 17, 48, 62,
        60, 51, 24, 62, 24, 51, 62, 54, 48, 68, 30, 48, 59, 62, 24, 46, 48, 60,
        62, 67, 38,  1, 62, 68, 35, 34, 68, 63, 44, 62, 10, 48, 38, 34, 48, 62,
        13, 23, 17, 24, 62, 29, 51, 63, 

In [11]:
#create random training set
#input is first chunk_len - 1 chars
#target is last chunk-len - 1 chars

def random_training_set(chunk_len = 50):
    
    chunk = random_chunk(chunk_len)
    
    #print('chunk:', chunk)
    #print('inp:', chunk[:-1])
    #print('target:', chunk[1:])
    
    inp = text_to_tensor(chunk[:-1]) #omit first token
    target = text_to_tensor(chunk[1:]) #omit last token
    return inp, target

random_training_set(10)

(tensor([54, 54, 62, 17, 48, 48, 62, 28, 51, 23]),
 tensor([54, 62, 17, 48, 48, 62, 28, 51, 23, 62]))

## LSTM MODEL

In [12]:
class SouthParkLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(SouthParkLSTM, self).__init__()
        
        #RNN ATTRIBUTES
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        #create one-hot vectors (number of characters in vocab)
        self.ident = torch.eye(vocab_size)
        
        #RNN
        self.rnn = nn.LSTM(vocab_size, hidden_size, n_layers, batch_first = True)
        
        #layer to decode LSTM output into distribution over the vocab
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inp):
        
        #reshape input tensor to [1, seq_len]
        inp = inp.view(1, -1)
        
        #generate 1-hot vecs from token indices
        inp = self.ident[inp]
        
        #set initial hidden state and cell state
        h0 = torch.zeros(1, inp.size(0), self.hidden_size)
        c0 = torch.zeros(1, inp.size(0), self.hidden_size)
        
        #obtain next output and hidden state
        out, _ = self.rnn(inp, (h0, c0))
        
        #print(out, len(out))
        #print('\n')
        
        #run decoder
        output = self.decoder(out.squeeze(0))
        
        #print(output)
        
        return output
    
    def init_hidden(self):
        return torch.zeros(self.n_layers, 1, self.hidden_size)

In [13]:
model = SouthParkLSTM(len(vocab), 128)

In [20]:
def evaluate(model, prime_str='I', predict_len=500, temperature=0.8):
    hidden = model.init_hidden()
    
    #print('HIDDEN: ', hidden)
    
    prime_input = text_to_tensor(prime_str)
    
    #print(prime_str, ',', prime_input)
    
    predicted = prime_str
    
    #print('PREDICTED: ', predicted)
    
    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        
        #print('PRIMING STR LOOP: ', p , prime_input[p])
        
        out = model(prime_input[p])
        
        #print(out, len(out))
            
    inp = prime_input[-1]
    
    #print('INP_EVAL: ', inp)
    
    
    for p in range(predict_len):
        output = model(inp)
        
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = int(torch.multinomial(output_dist, 1)[0])
        
        # Add predicted character to string and use as next input
        predicted_char = vocab_itos[top_i]
        predicted += predicted_char
        inp = text_to_tensor(predicted_char)

    return predicted

print(evaluate(model, predict_len=20))

Itithef: freron as t.


In [21]:
def train(model, num_iters=10000, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for it in range(num_iters):
        # get training set
        inp, target = random_training_set()
        
        #print('INP, TAR: ', inp, target)
        #print('INP, TAR LEN: ', len(inp), len(target))
        #print('\n')
        
        
        # cleanup
        optimizer.zero_grad()
        
        # forward pass
        hidden = model.init_hidden()
        output = model(inp)
        
        output2 = output.squeeze(0)
        
        #print('OUT, TAR: ', output2, target)
        #print('OUT, TAR LEN: ', output2.size(), target.size())
        #print('\n')
        
        
        loss = criterion(output2, target)
        # backward pass
        loss.backward()
        optimizer.step()
        
        #print(evaluate(model, '', 50))
        
        if it % 200 == 199:
            print("[Iter %d] Loss %f" % (it+1, float(loss)))
            print("    " + evaluate(model, ' ', 300))

train(model)

[Iter 200] Loss 0.313821
     
Surrevearre t Ne w my ale thedyo 
Ky! yoldin. Adr 
Ste ha way che the I Ned y: bothe! whattofr.. O! meve Hen Adre rer! y call aigh, 
Mrid allen avennd: It. homar 
Dasuso  Geng ousouppetho. wim h! Sthindvealiderallend Mrthe t tthe andng Im s, 
Mryong? y w s 
Cors m 
Gor s: ind ancle. Busoven illlyen
[Iter 400] Loss 0.232627
     sere: heve minde and at genoy. ded. g d yonateves 
Sthe themar. ongith 
Mrg t che: yond pallelis. rinyomayourit, m Col, s t. ad ped s, 
Ky ff: th, ghinnt the myoustheenchenele ancatureny: lie thadus t'rt 
Kyor 
Kyondupen s the Goube Youndyourianeanghico  s Ithin Ime l with 
St lay: kelld d Thime. th
[Iter 600] Loss 0.267770
     y he adanove A-f: ut Kyontt. whe 
Ste heve lloounon: s.. hendifif: f: 
Sus. Chis? bedrst?  Ad, me chant! He wid me Hod whathe Efr br you handefon: s hing ch. aimen I'rtemy 
Sthend 
Dr smas t Af dy: 
han: 
Mers th s e! rignon'meang Suste, mef: ghef ybe- whee: that ste s mefr. 
St aieald: s 
Kyble ang
[Iter 8

[Iter 5200] Loss 0.200267
     chathit P. Che gonound: Su man: kefin, tt thas. re o Clit A 
Cousotondoure imis br Pant. hin youthef .. ond dy wantee: fove He! Jabrinoulyo? te'sore, man: belly h o. one ylope were told Cher Wh! br s kan s Ahes. t 
Soun quplwan: attifure  Nno Go Choupe  te Mranthystusend b. f! fon 
Kys! tonthe ss p 
[Iter 5400] Loss 0.311103
     Che we t h mfou chald Clounng, go Ah! s: sethed and 
Dr t an's yolllld lleare ing hent rest witerimarttus ld kima 
Surigithanomesthe y Cus a  Of the Cl 
Menouth: le sathiloullldverevecourWeve 
Suho I y s! Ge t ghim t jutoutepe youth 
Chave Cho I  ploulithamallid! lluth he. het ar wat ybo Chis Javere
[Iter 5600] Loss 0.232704
     s verdyoungono hendre Heais Che I'soulll, 
Sththere Buth, Whime hiet? Ne es henouthan. tirr  m ldem  st Weshe souh, Pane. y! an! I's ar y, y g 
Stheclle. uret anjor t mithethined our 
M w 
Sut t manno Mr anacon: bur Wenoff hefiloin: bantrindved ure Aha in his. Chenes 
Che gon, soken Mre s ind pa P. 
[Ite