In [5]:
import torch 
import pandas as pd 
import numpy as np 
import re

In [22]:
class Tokenizer:
    """Word-level tokenizer backed by a vocabulary built from a DataFrame.

    The vocabulary is the sorted set of normalized words found in the
    ``text`` column of the input frame, followed by the special tokens
    ``<pad>`` and ``<unk>`` (in that order, so ``<unk>`` has the highest id).

    Attributes:
        vocab: list of tokens; a token's position in the list is its id.
        word_to_idx: dict mapping token -> id.
        idx_to_word: dict mapping id -> token.
    """

    # Special tokens appended after the sorted corpus words.
    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"

    def __init__(self, data: pd.DataFrame):
        """Build the vocabulary from ``data["text"]``; ``data`` is not modified."""
        self.vocab, self.word_to_idx, self.idx_to_word = self.create_vocab(data)

    def create_vocab(self, df: pd.DataFrame):
        """Collect the vocabulary from the ``text`` column of ``df``.

        Args:
            df: frame with a ``text`` column of strings. Missing values
                (NaN/None) are skipped.

        Returns:
            Tuple ``(vocab, word_to_idx, idx_to_word)``.

        Raises:
            KeyError: if ``df`` has no ``text`` column.
        """
        words = set()
        # Normalize each text on the fly rather than assigning back into
        # df["text"], so the caller's DataFrame keeps its original content.
        for text in df["text"].dropna():
            words.update(self.normalizer(text).split())

        vocab = sorted(words)
        # Special tokens go last so corpus words keep ids 0..n-1.
        vocab.append(self.PAD_TOKEN)
        vocab.append(self.UNK_TOKEN)

        word_to_idx = {word: idx for idx, word in enumerate(vocab)}
        idx_to_word = {idx: word for idx, word in enumerate(vocab)}

        return vocab, word_to_idx, idx_to_word

    def get_vocab(self):
        """Return the vocabulary list (token id == list position)."""
        return self.vocab

    def normalizer(self, s: str):
        """Lower-case ``s`` and reduce it to space-separated a-z words.

        Every character that is not a lowercase ASCII letter or whitespace is
        replaced by a space (not deleted), so punctuation-joined words such
        as "closed-loop" or "e.g." split into separate tokens instead of
        fusing into "closedloop" / "eg". Whitespace runs are then collapsed
        and the ends trimmed.
        """
        s = s.lower()
        s = re.sub(r'[^a-z\s]', ' ', s)
        s = re.sub(r'\s+', ' ', s)
        s = s.strip()
        return s

    def encode(self, text: str):
        """Convert ``text`` to a list of token ids.

        The text is normalized first; words missing from the vocabulary map
        to the id of ``<unk>``. An empty or punctuation-only text yields ``[]``.
        """
        text = self.normalizer(text)
        # Look the unknown id up by token rather than by list position.
        unk_idx = self.word_to_idx[self.UNK_TOKEN]
        return [self.word_to_idx.get(word, unk_idx) for word in text.split()]

    def decode(self, idxs):
        """Join the tokens for the ids in ``idxs`` with single spaces.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        return " ".join([self.idx_to_word[idx] for idx in idxs])
    
        

In [23]:
# One-row corpus: a single passage stored under the "text" column.
data = pd.DataFrame({"text": ["""A closed-loop controller or feedback controller is a control loop which incorporates feedback, in contrast to an open-loop controller or non-feedback controller. A closed-loop controller uses feedback to control states or outputs of a dynamical system. Its name comes from the information path in the system: process inputs (e.g., voltage applied to an electric motor) have an effect on the process outputs (e.g., speed or torque of the motor), which is measured with sensors and processed by the controller; the result (the control signal) is "fed back" as input to the process, closing the loop"""]})

In [24]:
# Display the one-row corpus frame.
data

Unnamed: 0,text
0,A closed-loop controller or feedback controlle...


In [25]:
# Build the tokenizer; its vocabulary comes from the "text" column of `data`.
tokenizer = Tokenizer(data)

In [26]:
# Inspect the vocabulary: sorted corpus words followed by <pad> and <unk>.
tokenizer.get_vocab()

['a',
 'an',
 'and',
 'applied',
 'as',
 'back',
 'by',
 'closed',
 'closing',
 'comes',
 'contrast',
 'control',
 'controller',
 'dynamical',
 'e',
 'effect',
 'electric',
 'fed',
 'feedback',
 'from',
 'g',
 'have',
 'in',
 'incorporates',
 'information',
 'input',
 'inputs',
 'is',
 'its',
 'loop',
 'measured',
 'motor',
 'name',
 'non',
 'of',
 'on',
 'open',
 'or',
 'outputs',
 'path',
 'process',
 'processed',
 'result',
 'sensors',
 'signal',
 'speed',
 'states',
 'system',
 'the',
 'to',
 'torque',
 'uses',
 'voltage',
 'which',
 'with',
 '<pad>',
 '<unk>']

In [27]:
# Same passage as the corpus minus its final clause, so every word is in-vocabulary.
test_data = "A closed-loop controller or feedback controller is a control loop which incorporates feedback, in contrast to an open-loop controller or non-feedback controller. A closed-loop controller uses feedback to control states or outputs of a dynamical system. Its name comes from the information path in the system: process inputs (e.g., voltage applied to an electric motor) have an effect on the process outputs (e.g., speed or torque of the motor), which is measured with sensors and processed by the controller; the result (the control signal)"

In [31]:
# Encode the in-vocabulary passage to a list of token ids.
encoded_test = tokenizer.encode(test_data)
print(encoded_test)

[0, 7, 29, 12, 37, 18, 12, 27, 0, 11, 29, 53, 23, 18, 22, 10, 49, 1, 36, 29, 12, 37, 33, 18, 12, 0, 7, 29, 12, 51, 18, 49, 11, 46, 37, 38, 34, 0, 13, 47, 28, 32, 9, 19, 48, 24, 39, 22, 48, 47, 40, 26, 14, 20, 52, 3, 49, 1, 16, 31, 21, 1, 15, 35, 48, 40, 38, 14, 20, 45, 37, 50, 34, 48, 31, 53, 27, 30, 54, 43, 2, 41, 6, 48, 12, 48, 42, 48, 11, 44]


In [32]:
# Round-trip check: decoding returns the normalized (lower-cased, punctuation-free) text.
print(tokenizer.decode(encoded_test))

a closed loop controller or feedback controller is a control loop which incorporates feedback in contrast to an open loop controller or non feedback controller a closed loop controller uses feedback to control states or outputs of a dynamical system its name comes from the information path in the system process inputs e g voltage applied to an electric motor have an effect on the process outputs e g speed or torque of the motor which is measured with sensors and processed by the controller the result the control signal


In [33]:
# Sentence made mostly of words absent from the corpus; those encode to the <unk> id.
test_data2 = "today is sunday and i am going to the market"
encoded_test2 = tokenizer.encode(test_data2)
print(encoded_test2)


[56, 27, 56, 2, 56, 56, 56, 49, 48, 56]


In [34]:
# Decoding shows which words were out-of-vocabulary: they come back as <unk>.
print(tokenizer.decode(encoded_test2))

<unk> is <unk> and <unk> <unk> <unk> to the <unk>
