In [1]:
import re, json
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [2]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df1 = pd.read_csv("../data/Sentiment_Analysis.csv")
df2 = pd.read_csv("../data/imdb_dataset.csv")

texts = pd.concat([df1['text'], df2['text']]).apply(clean_text).tolist()
print("Total training texts:", len(texts))

Total training texts: 105000


In [3]:
class BPETokenizer:
    def __init__(self, vocab_size=2000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = []

    def get_stats(self, corpus):
        pairs = Counter()
        for word, freq in corpus.items():
            symbols = word.split()
            for i in range(len(symbols)-1):
                pairs[(symbols[i], symbols[i+1])] += freq
        return pairs

    def merge_vocab(self, pair, corpus):
        merged = {}
        bigram = ' '.join(pair)
        replacement = ''.join(pair)
        for word in corpus:
            new_word = word.replace(bigram, replacement)
            merged[new_word] = corpus[word]
        return merged

    def train(self, texts):
        corpus = Counter([' '.join(list(word)) + ' </w>' for text in texts for word in text.split()])
        for _ in tqdm(range(self.vocab_size)):
            pairs = self.get_stats(corpus)
            if not pairs: break
            best = max(pairs, key=pairs.get)
            corpus = self.merge_vocab(best, corpus)
            self.merges.append(best)

        tokens = set()
        for word in corpus:
            tokens.update(word.split())
        self.vocab = {tok: i for i, tok in enumerate(tokens)}

    def tokenize_word(self, word):
        word = list(word) + ['</w>']
        i = 0
        while i < len(word)-1:
            pair = (word[i], word[i+1])
            if pair in self.merges:
                word[i:i+2] = [''.join(pair)]
            else:
                i += 1
        return word

    def tokenize(self, text):
        tokens = []
        for word in text.split():
            tokens.extend(self.tokenize_word(word))
        return tokens

In [4]:
tokenizer = BPETokenizer(vocab_size=2000)
tokenizer.train(texts)

with open("../outputs/subword_vocab.json", "w") as f:
    json.dump(tokenizer.vocab, f, indent=2)

print("Tokenizer training complete!")

100%|██████████| 2000/2000 [20:51<00:00,  1.60it/s]


Tokenizer training complete!


In [5]:
for i in range(10):
    print(texts[i])
    print(tokenizer.tokenize(texts[i]))
    print()

and here is the rap song african warrior queens for which chatgpt wrote the lyrics yes amateur but beautiful nn on ko link below sound on
['an', 'd</w>', 'he', 're', '</w>', 'is', '</w>', 'the', '</w>', 'ra', 'p</w>', 'so', 'n', 'g</w>', 'af', 'ri', 'ca', 'n</w>', 'wa', 'r', 'ri', 'or', '</w>', 'que', 'en', 's</w>', 'fo', 'r</w>', 'wh', 'ic', 'h</w>', 'ch', 'at', 'g', 'p', 't</w>', 'w', 'ro', 'te', '</w>', 'the', '</w>', 'ly', 'ri', 'c', 's</w>', 'ye', 's</w>', 'am', 'at', 'eu', 'r</w>', 'but', '</w>', 'be', 'au', 'ti', 'ful', '</w>', 'nn', '</w>', 'on</w>', 'k', 'o</w>', 'li', 'n', 'k</w>', 'be', 'lo', 'w</w>', 'so', 'un', 'd</w>', 'on</w>']

we asked chatgpt nnhow to become a successful tradernnhere is what it thinksnnchatgpt
['we', '</w>', 'as', 'ke', 'd</w>', 'ch', 'at', 'g', 'p', 't</w>', 'nn', 'ho', 'w</w>', 'to', '</w>', 'be', 'co', 'me', '</w>', 'a</w>', 'suc', 'ce', 's', 's', 'ful', '</w>', 't', 'ra', 'de', 'r', 'nn', 'he', 're', '</w>', 'is', '</w>', 'wh', 'at', '</w>', 'it',