# WikiText-2 prompt dataset preparation

In [1]:
# Kernel environment setup. The two overrides below are disabled (commented out).
# NOTE(review): the recorded output of this cell still lists HF_HOME, so it was
# presumably enabled when the cell was last executed — confirm which is intended.
# %env HF_HOME=/mnt/LLM
# %env CUDA_VISIBLE_DEVICES=1
# Set the OMP_NUM_THREADS / MKL_NUM_THREADS environment variables to 16.
%env OMP_NUM_THREADS=16 
%env MKL_NUM_THREADS=16 

from IPython.display import display, HTML
# Inject a CSS rule that forces the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

env: HF_HOME=/mnt/LLM
env: OMP_NUM_THREADS=16
env: MKL_NUM_THREADS=16


In [2]:
import re
import json
from datasets import load_dataset

In [4]:
# Load the raw WikiText-2 dataset through `datasets.load_dataset`.
data = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    split="train",  # choose test or train
)

# Show how many entries the 'text' column holds.
len(data['text'])

36718

In [6]:
# typical article heading - note the headers and empty lines
# Slicing the dataset yields a dict keyed by column name, so this displays the first
# six 'text' entries: empty strings around the " = Title = " heading line, followed
# by the opening paragraphs of the article.
data[:6]

{'text': ['',
  ' = Valkyria Chronicles III = \n',
  '',
  ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
  " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making th

In [7]:
# finding articles based on " = ARTICLE_TITLE = \n" title format
#
# Section headings in this dataset start with " = = ", so requiring a word character
# right after the first "= " keeps top-level article titles only.
# NOTE(review): a title whose first character is not a word character (e.g. a quote)
# would not match this pattern — confirm none exist in the chosen split.

text_lines = list(data['text'])  # read the column once instead of once per use
title_pattern = re.compile(r"^\s=\s\w")

article_starts = [i for i, line in enumerate(text_lines) if title_pattern.match(line)]

# Sentinel end index: consecutive pairs (start, next_start) then delimit every article,
# including the last one. It is one past the last valid line index.
article_starts.append(len(text_lines))
print(f"{len(article_starts)=}")

# Preview the first few titles; the trailing sentinel is skipped because it is not a
# valid line index, and slicing tolerates having fewer than three articles.
for start in article_starts[:-1][:3]:
    print(text_lines[start], end ='')

len(article_starts)=630
 = Valkyria Chronicles III = 
 = Tower Building of the Little Rock Arsenal = 
 = Cicely Mary Barker = 


In [8]:
import random

# Build one prompt per article: join the article's lines, keep a random-length prefix,
# cut it back to the last whitespace so it does not end mid-word, then shuffle the
# resulting prompts.

PROMPT_MIN_CHARS = 1000  # enter number limits here
PROMPT_MAX_CHARS = 1500
PROMPT_SEED = 0  # fixed seed so a rerun reproduces the same prompts in the same order

# Private generator: reproducible without touching the global `random` state.
rng = random.Random(PROMPT_SEED)

# Read the column once; indexing data['text'] inside the loop would re-read it for
# every article.
text_lines = list(data['text'])

# Matches the last run of whitespace in a string (only non-whitespace may follow it).
last_whitespace_run = re.compile(r'\s+(?=[^\s]*$)')

prompts = []

# article_starts ends with a sentinel, so consecutive pairs delimit each article.
for start, end in zip(article_starts, article_starts[1:]):
    # get article text
    text = '\n'.join(text_lines[start:end])

    # take first random number of characters
    prompt = text[:rng.randint(PROMPT_MIN_CHARS, PROMPT_MAX_CHARS)]

    # trimming at the last space; a prompt without any whitespace is kept as is
    match = last_whitespace_run.search(prompt)
    if match is not None:
        prompt = prompt[:match.start()]
    prompts.append(prompt)

rng.shuffle(prompts)

len(prompts)

629

In [9]:
# print samples of resulting prompts
# Slicing (instead of indexing 0..2) avoids an IndexError when fewer than three
# prompts exist.
for sample in prompts[:3]:
    print(sample)
    print('-' * 80)

 = Jacob deGrom = 


 Jacob Anthony deGrom ( born June 19 , 1988 ) , is an American professional baseball pitcher for the New York Mets of Major League Baseball ( MLB ) . Prior to playing professionally , deGrom attended Stetson University and played college baseball for the Stetson Hatters . 

 DeGrom began playing baseball as a shortstop and was converted into a pitcher during his junior year at Stetson . The Mets selected him in the ninth round of the 2010 MLB Draft , and he made his MLB debut with the Mets on May 15 , 2014 . That year , deGrom was named the National League 's ( NL ) Rookie of the Month twice , and the NL Rookie of the Year . In 2015 , deGrom was selected as an MLB All @-@ Star . 


 = = Amateur career = = 


 DeGrom attended Calvary Christian Academy in Ormond Beach , Florida , where he played for the school 's baseball and basketball teams . As a senior , the Florida Sports Writers Association named deGrom to the All @-@ Florida second team . He also played Americ

In [10]:
# Write the prompts to disk as a JSON array of [index, prompt] pairs, where the index
# is the prompt's position in the `prompts` list.
save_path = "./wikitext_prompts.json"

indexed_prompts = [[position, prompt_text] for position, prompt_text in enumerate(prompts)]
json_str = json.dumps(indexed_prompts)

with open(save_path, "w") as out_file:
    out_file.write(json_str)