In [1]:
from tokenizers import ByteLevelBPETokenizer
import re
import glob
import chardet
import nltk
from nltk import Tree
nltk.download('punkt')
import os

[nltk_data] Downloading package punkt to /home/jts75596/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Function to convert files to UTF-8 encoding

In [2]:
def convert_to_utf8(filename, output_filename):
    """Re-encode a text file as UTF-8.

    The source encoding is guessed by ``chardet`` from the file's raw bytes.
    The file is then re-read as text with that encoding and copied line by
    line into ``output_filename``.

    Args:
        filename: Path of the file to convert.
        output_filename: Path of the UTF-8 file to write (opened with mode
            ``'w'``, so an existing file is overwritten).

    Note:
        Bytes that cannot be decoded with the detected encoding are silently
        dropped (``errors='ignore'``).
    """
    with open(filename, 'rb') as f:
        raw_data = f.read()

    # chardet reports an encoding of None when it cannot guess (for example
    # on an empty file). Passing None to open() would silently select the
    # platform's locale encoding, so fall back to UTF-8 explicitly to keep
    # the conversion deterministic across machines.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

    with open(filename, 'r', encoding=encoding, errors='ignore') as f, \
            open(output_filename, 'w', encoding='utf-8') as outfile:
        for line in f:
            outfile.write(line)

In [2]:
# Collect every raw WSJ text file: anything two levels below the wsj directory.
pattern = "../../treebank_3/raw/wsj/*/*"
# The pattern contains no "**", so glob's recursive flag would have no effect
# and is left at its default.
raw_wsj = glob.glob(pattern)

In [4]:
# Make all subdirectories you need (two-digit names 00 .. 24)
for i in range(0, 25):
    # zfill pads the number without rebinding the loop variable to a string.
    output_dir = "../data/raw_ptb_utf8/" + str(i).zfill(2)
    os.makedirs(output_dir, exist_ok=True)

# Convert each file to utf-8
for file in raw_wsj:
    output_dir = "../data/raw_ptb_utf8"
    # Mirror the source layout under the output root and add a .txt suffix.
    output_filename = file.replace("../../treebank_3/raw/wsj", output_dir)
    output_filename = output_filename + ".txt"
    # Create the parent directory on demand so that a file living in a
    # subdirectory outside the pre-created 00-24 range does not make the
    # write fail with FileNotFoundError.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    convert_to_utf8(file, output_filename)

In [34]:
pattern = "../data/raw_ptb_utf8/*/*"
# glob returns matches in arbitrary order. Sort them so that every run hands
# the same file sequence to whatever consumes utf8_ptb (here, tokenizer
# training). The pattern has no "**", so the recursive flag is not needed.
utf8_ptb = sorted(glob.glob(pattern))

In [35]:
# "../tokenizers/rnng/vocab.json", "../tokenizers/rnng/merges.txt"

In [36]:
# Fresh byte-level BPE tokenizer, constructed without vocab or merges files.
tokenizer = ByteLevelBPETokenizer()

In [37]:
# Training on an empty file list would "succeed" and silently yield a useless
# vocabulary, so fail loudly if the converted corpus was not found.
assert utf8_ptb, "no training files matched ../data/raw_ptb_utf8/*/*"

# NOTE(review): save_model appears to require an existing target directory;
# create it up front so a fresh checkout can run this cell.
os.makedirs("../tokenizers/rnng", exist_ok=True)

tokenizer.train(files=utf8_ptb, vocab_size=12000, min_frequency=10, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
# Last expression of the cell: its return value (the written file paths) is
# what the notebook displays as the cell output.
tokenizer.save_model("../tokenizers/rnng")






['../tokenizers/rnng/vocab.json', '../tokenizers/rnng/merges.txt']

In [38]:
# Rebuild the tokenizer from the vocab.json / merges.txt files on disk, so the
# cells below use the saved model rather than the in-memory training state.
tokenizer = ByteLevelBPETokenizer("../tokenizers/rnng/vocab.json", "../tokenizers/rnng/merges.txt")

In [69]:
# Example bracketed parse used to exercise the tree helpers below. It contains
# -NONE- leaves (0, *T*-2, *?*, *T*-1) alongside ordinary words and punctuation.
syntax_tree = """(SINV-2  (ADVP-PRD-TPC-1 (RB So))  (, ,)  (ADVP (RB too))  (PRN    (, ,)    (S      (NP-SBJ (JJ many) (NNS analysts))      (VP (VBP predict) (SBAR (-NONE- 0) (SINV (-NONE- *T*-2)))))    (, ,))  (VP (MD will) (VP (-NONE- *?*) (ADVP-PRD (-NONE- *T*-1))))  (NP-SBJ    (NP (NNP Exxon) (NNP Corp.))    (, ,)    (NP (NNP Chevron) (NNP Corp.))    (CC and)    (NP (NNP Amoco) (NNP Corp)))  (. .))"""


# Parse the bracketed string into an nltk Tree and print it as a sanity check.
tree = Tree.fromstring(syntax_tree)
print("Parsed Tree:", tree)

Parsed Tree: (SINV-2
  (ADVP-PRD-TPC-1 (RB So))
  (, ,)
  (ADVP (RB too))
  (PRN
    (, ,)
    (S
      (NP-SBJ (JJ many) (NNS analysts))
      (VP (VBP predict) (SBAR (-NONE- 0) (SINV (-NONE- *T*-2)))))
    (, ,))
  (VP (MD will) (VP (-NONE- *?*) (ADVP-PRD (-NONE- *T*-1))))
  (NP-SBJ
    (NP (NNP Exxon) (NNP Corp.))
    (, ,)
    (NP (NNP Chevron) (NNP Corp.))
    (CC and)
    (NP (NNP Amoco) (NNP Corp)))
  (. .))


In [86]:
def tokenize_tree(tree, tokenizer, skip_labels=("-NONE-",)):
    """Return a deep copy of ``tree`` whose leaves are split into BPE pieces.

    Every preterminal (a subtree of height 2, i.e. a tag directly above its
    leaves) has each string leaf replaced by the space-joined tokens that
    ``tokenizer.encode`` produces for ``" " + leaf``. The leading space makes
    the tokenizer treat the leaf as a word that follows whitespace.

    Leaves are selected by the preterminal's label rather than by the leaf's
    first character. A first-character test leaves punctuation and symbols
    such as ``?``, ``;``, ``$`` or ``'s`` un-encoded, while still encoding
    empty elements such as the ``0`` under ``-NONE-``.

    Args:
        tree: Tree to tokenize. It is not modified.
        tokenizer: Object with an ``encode(str)`` method whose result exposes
            a ``tokens`` list of strings.
        skip_labels: Preterminal labels whose leaves are left untouched.
            Defaults to ``("-NONE-",)``, the label the example trees in this
            notebook use for traces and empty elements.

    Returns:
        The tokenized copy of ``tree``.
    """
    tokenized_tree = tree.copy(deep=True)
    for preterminal in tokenized_tree.subtrees(lambda t: t.height() == 2):
        if preterminal.label() in skip_labels:
            continue
        # A height-2 subtree holds only leaves; handle all of them, not just
        # the first, in case a preterminal carries several.
        for idx, leaf in enumerate(preterminal):
            if not isinstance(leaf, str) or not leaf.strip():
                continue
            encoded = tokenizer.encode(" " + leaf)
            preterminal[idx] = " ".join(encoded.tokens)
    return tokenized_tree

In [87]:
# Smoke-test tokenize_tree on whatever `tree` is currently bound to (in
# top-to-bottom order, the example parsed from syntax_tree).
tokenized_tree = tokenize_tree(tree, tokenizer)

In [88]:
# Encode a sample sentence (with a leading space) for manual inspection.
# `encoded` is not read by any later cell visible in this notebook.
string = " Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a nonexecutive director of this British industrial conglomerate."
encoded = tokenizer.encode(string)

In [90]:
def tree_to_str(tree):
    """Render ``tree`` on a single line.

    ``tree.pformat()`` may wrap and indent its output across several lines.
    Splitting on whitespace and re-joining with single spaces collapses every
    run of spaces and newlines into one space.
    """
    pieces = tree.pformat().split()
    one_line = " ".join(pieces)
    return one_line

In [91]:
# tree_to_str(tokenized_tree)

In [92]:
# Input files for the three data splits. The cells below read them line by
# line and parse each line with Tree.fromstring.
train = "../data/train-ptb.txt"
test = "../data/test-ptb.txt"
val = "../data/val-ptb.txt"

In [93]:
# Make sure the output directory exists before opening the file for writing.
os.makedirs("../data/tokenized_data", exist_ok=True)

# Pin UTF-8 instead of relying on the locale default: the tokenizer's pieces
# may contain non-ASCII characters (byte-level BPE typically marks a leading
# space with a non-ASCII symbol), which a non-UTF-8 locale cannot write.
with open(train, 'r', encoding='utf-8') as infile, \
        open("../data/tokenized_data/tok-train-ptb.txt", 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Tree.fromstring raises on empty input; skip blank lines (such as a
        # trailing newline at the end of the file) instead of crashing.
        if not line.strip():
            continue
        tree = Tree.fromstring(line)
        tokenized_tree = tokenize_tree(tree, tokenizer)
        tree_str = tree_to_str(tokenized_tree)
        outfile.write(tree_str + '\n')

In [94]:
# Make sure the output directory exists before opening the file for writing.
os.makedirs("../data/tokenized_data", exist_ok=True)

# Pin UTF-8 instead of relying on the locale default: the tokenizer's pieces
# may contain non-ASCII characters (byte-level BPE typically marks a leading
# space with a non-ASCII symbol), which a non-UTF-8 locale cannot write.
with open(test, 'r', encoding='utf-8') as infile, \
        open("../data/tokenized_data/tok-test-ptb.txt", 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Tree.fromstring raises on empty input; skip blank lines (such as a
        # trailing newline at the end of the file) instead of crashing.
        if not line.strip():
            continue
        tree = Tree.fromstring(line)
        tokenized_tree = tokenize_tree(tree, tokenizer)
        tree_str = tree_to_str(tokenized_tree)
        outfile.write(tree_str + '\n')

In [95]:
# Make sure the output directory exists before opening the file for writing.
os.makedirs("../data/tokenized_data", exist_ok=True)

# Pin UTF-8 instead of relying on the locale default: the tokenizer's pieces
# may contain non-ASCII characters (byte-level BPE typically marks a leading
# space with a non-ASCII symbol), which a non-UTF-8 locale cannot write.
with open(val, 'r', encoding='utf-8') as infile, \
        open("../data/tokenized_data/tok-val-ptb.txt", 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Tree.fromstring raises on empty input; skip blank lines (such as a
        # trailing newline at the end of the file) instead of crashing.
        if not line.strip():
            continue
        tree = Tree.fromstring(line)
        tokenized_tree = tokenize_tree(tree, tokenizer)
        tree_str = tree_to_str(tokenized_tree)
        outfile.write(tree_str + '\n')