In [1]:
from collections import Counter
import random
from abc import abstractmethod, ABC

In [2]:
# Load the whole corpus into memory as one string.
# The encoding is pinned so decoding does not depend on the platform's default
# locale (assumes the file is UTF-8, of which plain ASCII is a subset).
with open('../data/shakespeare.txt', encoding='utf-8') as f:
    txt = f.read()

In [3]:
class Tokenizer(ABC):
    """Abstract interface for converting between text and tokens.

    All three methods are abstract, so only subclasses that implement
    every one of them can be instantiated.
    """

    @abstractmethod
    def build(self, data):
        """Prepare the tokenizer from ``data``."""

    @abstractmethod
    def encode(self, input):
        """Turn ``input`` into its token representation."""

    @abstractmethod
    def decode(self, tokens):
        """Turn ``tokens`` back into their original representation."""

class Model(ABC):
    """Abstract interface for a text-generation model.

    Both methods are abstract, so only subclasses that implement them
    can be instantiated.
    """

    @abstractmethod
    def build(self, data, tokens):
        """Prepare the model from ``data``; ``tokens`` is the second build input."""

    @abstractmethod
    def generate(self, context, gen_steps=100):
        """Extend ``context`` by ``gen_steps`` generated items (100 by default)."""


In [29]:

class CTokenizer(Tokenizer):
    """Character-level tokenizer: each distinct element of the data is one token."""

    def build(self, data):
        """Collect the sorted vocabulary of ``data`` and index it.

        Sets three attributes:
          * ``tokens`` - sorted list of the distinct elements of ``data``
          * ``ttoi``   - mapping token -> integer id (its position in ``tokens``)
          * ``itot``   - mapping integer id -> token
        """
        vocab = list(set(data))
        vocab.sort()
        self.tokens = vocab

        token_to_id = {}
        id_to_token = {}
        for idx, tok in enumerate(vocab):
            token_to_id[tok] = idx
            id_to_token[idx] = tok
        self.ttoi = token_to_id
        self.itot = id_to_token

    def encode(self, input):
        """Return the list of ids for every element of ``input``.

        Raises ``KeyError`` for an element that was not seen by ``build``.
        """
        ids = []
        for ch in input:
            ids.append(self.ttoi[ch])
        return ids

    def decode(self, tokens):
        """Return the string obtained by concatenating the tokens for ``tokens``."""
        pieces = [self.itot[tok_id] for tok_id in tokens]
        return ''.join(pieces)

class WTokenizer(Tokenizer):
    """Word-level tokenizer: each distinct whitespace-separated word is one token."""

    def build(self, data):
        """Collect the sorted vocabulary of words in ``data`` and index it.

        Sets three attributes:
          * ``tokens`` - sorted list of the distinct results of ``data.split()``
          * ``ttoi``   - mapping word -> integer id (its position in ``tokens``)
          * ``itot``   - mapping integer id -> word
        """
        vocab = list(set(data.split()))
        vocab.sort()
        self.tokens = vocab

        word_to_id = {}
        id_to_word = {}
        for idx, word in enumerate(vocab):
            word_to_id[word] = idx
            id_to_word[idx] = word
        self.ttoi = word_to_id
        self.itot = id_to_word

    def encode(self, input):
        """Split ``input`` on whitespace and return the id of every word.

        Raises ``KeyError`` for a word that was not seen by ``build``.
        """
        ids = []
        for word in input.split():
            ids.append(self.ttoi[word])
        return ids

    def decode(self, tokens):
        """Return the words for ``tokens`` joined by single spaces.

        Because ``encode`` splits on any whitespace, the original spacing
        (e.g. line breaks) is not restored.
        """
        words = [self.itot[tok_id] for tok_id in tokens]
        return ' '.join(words)

class Model0(Model):
    """Uniform random generator that works directly on raw elements of the data.

    This version concatenates strings rather than token ids.
    """

    def build(self, data, tokens):
        """Store the sorted distinct elements of ``data`` in ``self.chars``.

        ``tokens`` is accepted for interface compatibility but is not used.
        """
        alphabet = list(set(data))
        alphabet.sort()
        self.chars = alphabet

    def generate(self, context, gen_steps=100):
        """Return ``context`` followed by ``gen_steps`` uniformly sampled elements.

        The sampled elements are joined into a string, so ``context`` must be
        something a string can be appended to with ``+``.
        """
        sampled = random.choices(self.chars, k=gen_steps)
        return context + ''.join(sampled)


In [31]:
# Prompt used to sanity-check the tokenizers.
test_str = 'She said this was a'

# Character-level tokenizer built on the full corpus; print the prompt's ids.
c_tkzr = CTokenizer()
c_tkzr.build(txt)
print(c_tkzr.encode(test_str))

# Word-level tokenizer built on the full corpus; print the prompt's ids.
w_tkzr = WTokenizer()
w_tkzr.build(txt)
print(w_tkzr.encode(test_str))

[31, 46, 43, 1, 57, 39, 47, 42, 1, 58, 46, 47, 57, 1, 61, 39, 57, 1, 39]
[3506, 19873, 22837, 24568, 4428]


In [160]:
class Model(ABC):
    """Abstract base class every generation model in this notebook derives from.

    Subclasses must implement both ``build`` and ``generate`` before they can
    be instantiated.
    """

    @abstractmethod
    def build(self, data, tokens):
        """Prepare the model from ``data`` and the second argument ``tokens``."""

    @abstractmethod
    def generate(self, context, gen_steps=100):
        """Produce ``gen_steps`` new items (default 100) continuing ``context``."""


**TODO:** This needs to be rewritten later. In short, encoding the text as words requires writing a different encoding method.

## 0th-order approximation

In [45]:
class Model0(Model):
    """Zeroth-order model: every token id in the vocabulary is equally likely."""

    def build(self, data, tokenizer):
        """Remember ``tokenizer``; ``data`` is not used by this model."""
        self.tokenizer = tokenizer

    def generate(self, context, gen_steps=100):
        """Return ``context`` followed by ``gen_steps`` uniformly sampled ids.

        Ids are drawn from ``range(len(tokenizer.tokens))``. The samples are a
        list appended with ``+``, so ``context`` must be a list; it is not
        modified.
        """
        vocab_size = len(self.tokenizer.tokens)
        sampled_ids = random.choices(range(vocab_size), k=gen_steps)
        return context + sampled_ids

In [46]:
# Uniform model with the character tokenizer: extend the encoded prompt by the
# default number of steps and print the decoded text.
m0 = Model0()
m0.build(txt, c_tkzr)
output = m0.generate(c_tkzr.encode(test_str))
print(c_tkzr.decode(output))


She said this was aJGnZ!o&mAokT.FpfBM3k&zYTWNA
icB:-L'BltA;vRUazEfRELIRkYqgDWB'S;zIaBN!j,kjUPjVfiG:T'APtZueCkUY?BBe3SHP


## 1st-order approximation

In [66]:
class Model1(Model):
    """First-order model: ids are drawn independently, weighted by frequency."""

    def build(self, data, tokenizer):
        """Count how often each token id occurs in ``data``.

        Args:
            data: training text, passed to ``tokenizer.encode``.
            tokenizer: built tokenizer exposing ``tokens`` and ``encode``.

        Sets ``self.w_chars``: a list whose entry ``i`` is the number of times
        id ``i`` occurs in the encoded data (0 if it never occurs).
        """
        self.tokenizer = tokenizer

        counts = Counter(self.tokenizer.encode(data))
        # Index the counts by id, because generate() samples ids from
        # range(len(tokens)) and pairs weight i with id i. Looking ids up by
        # re-encoding each token's text would only line up if every token
        # encodes back to exactly its own index, and costs one encode call
        # per vocabulary entry.
        self.w_chars = [counts[token_id] for token_id in range(len(self.tokenizer.tokens))]

    def generate(self, context, gen_steps=100):
        """Return ``context`` followed by ``gen_steps`` frequency-weighted ids.

        The samples are a list appended with ``+``, so ``context`` must be a
        list; it is not modified.
        """
        population = range(len(self.tokenizer.tokens))
        sampled_ids = random.choices(population, weights=self.w_chars, k=gen_steps)
        return context + sampled_ids

In [69]:
# Frequency-weighted model with the character tokenizer: extend the encoded
# prompt by the default number of steps and print the decoded text.
m1 = Model1()
m1.build(txt, c_tkzr)
output = m1.generate(c_tkzr.encode(test_str))
print(c_tkzr.decode(output))

She said this was as?T,a  tonaiSisvntai ed i thhBellnIbsR,agotoo.sne:noluthh:
ifre
o.alaTdo dibgee sio,aI,uo ryo i lelh


## 2nd-order approximation

In [72]:
class Model2(Model):
    """Second-order model: each id is drawn conditioned on the id before it."""

    def build(self, data, tokenizer):
        """Count, for every token id, how often each id directly follows it.

        Args:
            data: training text, passed to ``tokenizer.encode``.
            tokenizer: built tokenizer exposing ``tokens`` and ``encode``.

        Sets ``self.unigram_weights``: a dict mapping each id to a list where
        entry ``j`` is the number of times id ``j`` immediately follows it in
        the encoded data (i.e. successor/pair counts, despite the name).
        """
        self.tokenizer = tokenizer
        vocab_size = len(self.tokenizer.tokens)

        # Dense table: one row of vocab_size counters per id, so memory grows
        # with the square of the vocabulary size.
        self.unigram_weights = {i: [0] * vocab_size for i in range(vocab_size)}

        # Walk over adjacent (previous, next) id pairs and tally them.
        tokenized_data = self.tokenizer.encode(data)
        for fc, sc in zip(tokenized_data, tokenized_data[1:]):
            self.unigram_weights[fc][sc] += 1

    def generate(self, context, gen_steps=100):
        """Return a new list: ``context`` followed by ``gen_steps`` sampled ids.

        Each new id is sampled using the successor counts of the last id in
        the sequence so far. The caller's ``context`` is left untouched.

        Raises:
            ValueError: if ``context`` is empty while ``gen_steps`` > 0, since
                there is no previous id to condition the first sample on.
        """
        # Copy so that appending below does not mutate the caller's list.
        output = list(context)
        if gen_steps > 0 and not output:
            raise ValueError('Model2.generate needs a non-empty context to condition on')

        population = range(len(self.tokenizer.tokens))
        for _ in range(gen_steps):
            successors = self.unigram_weights[output[-1]]
            # An id that only occurs at the very end of the training data has
            # no observed successor; all-zero weights are rejected by
            # random.choices on Python 3.9+, so fall back to a uniform draw.
            weights = successors if any(successors) else None
            output.append(random.choices(population, weights=weights)[0])
        return output


In [85]:
# Previous-token-conditioned model with the character tokenizer: extend the
# encoded prompt by the default number of steps and print the decoded text.
m2 = Model2()
m2.build(txt, c_tkzr)
output = m2.generate(c_tkzr.encode(test_str))
print(c_tkzr.decode(output))

She said this was anje, ng Y:
T:
Myol es;
ARO:--tooristhang st I cthal opung fororonoucane-dkin ard th ind s hesuthid m


In [113]:
# Side-by-side comparison of the three models at character level.
data = txt
tokenizer = c_tkzr 
test_str = 'She said this was a'

# One fresh instance per model; the list index doubles as the label below.
models = [Model0(), Model1(), Model2()]

for i, model in enumerate(models):
    model.build(data, tokenizer)
    # The prompt is re-encoded for every model, so each gets its own list.
    output = model.generate(tokenizer.encode(test_str), gen_steps=50)
    print(f'Model{i}:\n{tokenizer.decode(output)}\n\n')

Model0:
She said this was a-SATW
z&Ur3J:wfGVUVNWC$ONy-jKO'uU$v'l,mJ!htrdwBYIc


Model1:
She said this was a uioeeknheIB gpIoIshynstlshwm?lbobb ,Ib;, ;
i
dndr


Model2:
She said this was an's.

Wie her sesel
AREDUE ayamu pp hthisachthare 




## Word-level tokenization: the same models again

In [107]:
# Rebuild the word-level tokenizer on only the first 200,000 characters of the
# corpus; this replaces the `w_tkzr` that was built on the full text earlier.
# NOTE(review): presumably truncated to keep the vocabulary (and Model2's
# vocabulary-squared table) small - confirm.
w_tkzr = WTokenizer()
w_tkzr.build(txt[:200_000])
print(w_tkzr.encode(test_str))

# Number of distinct whitespace-separated words in the truncated text.
print(len(w_tkzr.tokens))

[920, 6224, 7165, 7711, 1238]
8100


In [110]:
# Side-by-side comparison of the three models at word level, using the same
# 200,000-character slice the word tokenizer was built on.
data = txt[:200_000]
tokenizer = w_tkzr
test_str = 'She said this was a'

# One fresh instance per model; the list index doubles as the label below.
models = [Model0(), Model1(), Model2()]

for i, model in enumerate(models):
    model.build(data, tokenizer)
    # The prompt is re-encoded for every model, so each gets its own list.
    output = model.generate(tokenizer.encode(test_str), gen_steps=30)
    print(f'Model{i}:\n{tokenizer.decode(output)}\n\n')

Model0:
She said this was a white again? planted Cushions, method, wise, behold men's pass'd Romans. vent comes Must, side saving senators, Ladders, covets certain themselves, Marry, requests movers fear. hell. Romans.' Brother alike. brave? natural


Model1:
She said this was a by BRUTUS: duke. your But it lack i' were wealsmen Capitol-- make Only in or would think stopp'd my estimation my prey angry Volsce, i' stripes mother, hang I your


Model2:
She said this was a humorous patrician, and parent. CORIOLANUS: I count my nature is left, Marcius: A weeder-out of wine. Second Citizen: But we the king, To your voices, I came, Ready to put


