<h1>Implementation</h1>
This notebook is used to randomly select 10 data sequences of 10 laps and generate blog posts based on those.

_The code below restricts the notebook to GPU 1. Change the index to select a GPU that is not yet occupied, or comment the cell out to use the default device._

In [12]:
import os

# Index of the GPU this notebook is allowed to see; edit to pick a free device.
CUDA_DEVICE = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_DEVICE

<h2>Loading the finetuned model</h2>

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Local checkpoint folder holding the fine-tuned sequence-to-sequence model.
model_folder = '../FineTuning/FinetunedModels/RUCAIBox_mvp-data-to-text'
# `model` and `tok` are module-level names used later by the generation cell.
model = AutoModelForSeq2SeqLM.from_pretrained(model_folder)
# The tokenizer is loaded from the same folder as the model weights.
tok = AutoTokenizer.from_pretrained(model_folder)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<h2>Loading and reformatting the race data</h2>
In the reformat function, the JSON data is converted to the same format as the data that the LLM was pretrained and finetuned on.

In [9]:
import os
import json


def reformatData(arr):
    """Turn a list of event dicts into a single data-to-text prompt string.

    Each event is read through its 'subject', 'action' and 'object' keys:
    'subject' is a sequence of dicts of which the first (category, entity)
    pair is used, 'action' is a string, and 'object' maps a category to an
    iterable of entity strings.

    Events are numbered by their 1-based position in ``arr``. An event without
    a usable subject is skipped but still consumes its number.

    Parameters
    ----------
    arr : list of dict
        The events belonging to one blog post.

    Returns
    -------
    str
        The instruction text followed by the ' [SEP] '-joined triples. If no
        event produces any triple, the bare instruction text is returned.
    """
    prompt = 'Write an entertaining live blog post describing the following event in a Formula 1 race: '
    parts = []
    # enumerate() keeps numbering correct for duplicate events; list.index()
    # would return the position of the first equal event every time.
    for position, event in enumerate(arr, start=1):
        i = str(position)
        sub, act, obj = event['subject'], event['action'], event['object']
        try:
            # First (category, entity) pair of the first subject dict.
            sub_cat, sub_ent = list(sub[0].items())[0]
        except IndexError:
            # No subject, or an empty subject dict: nothing to describe.
            continue
        parts.append('Agent' + i + ' | ' + sub_cat + ' | ' + sub_ent)
        parts.append('Action' + i + ' | ' + act)
        for obj_cat, obj_lst in obj.items():
            for obj_ent in obj_lst:
                parts.append('Object' + i + ' | ' + obj_cat + ' | ' + obj_ent)
    # Joining avoids slicing off a trailing separator, which used to eat part
    # of the instruction text when there were no triples at all.
    return prompt + ' [SEP] '.join(parts)


def loadInputData(input_folder='../EventIdentification/ExampleEvents4/'):
    """Load every lap file of every race folder and build the matching prompts.

    Parameters
    ----------
    input_folder : str, optional
        Folder containing one sub-folder per race, each holding one JSON file
        per lap. Defaults to the folder used throughout this notebook.

    Returns
    -------
    (dict, dict)
        ``prompts[race][lap]`` is the list of prompt strings for that lap and
        ``json_data[race][lap]`` the list of raw JSON entries they were built
        from, in the same order. Laps whose file is empty are left out.
        Laps are inserted in ascending order of the number in the file name.
    """
    def lapSortKey(file_name):
        # os.listdir() returns names in arbitrary order; order laps by the
        # digits in the file stem so consecutive dict entries are consecutive
        # laps. Names without digits sort last, alphabetically.
        digits = ''.join(c for c in file_name.split('.')[0] if c.isdigit())
        return (int(digits) if digits else float('inf'), file_name)

    prompts, json_data = {}, {}
    for rkey in sorted(os.listdir(input_folder)):
        rf = os.path.join(input_folder, rkey)
        # Skip notebook checkpoint folders and any stray non-directory entry.
        if 'ipynb' in rkey or not os.path.isdir(rf):
            continue
        prompts[rkey], json_data[rkey] = {}, {}
        lap_files = [f for f in os.listdir(rf) if 'ipynb' not in f]
        for lf in sorted(lap_files, key=lapSortKey):
            lkey = lf.split('.')[0]
            with open(os.path.join(rf, lf), 'r', encoding='utf-8') as f:
                events = json.load(f)
            if not events:
                continue
            prompts[rkey][lkey] = [reformatData(d) for d in events]
            json_data[rkey][lkey] = list(events)
    return prompts, json_data


# Both dicts are keyed by race folder name and then by lap key: `prompts`
# holds the formatted prompt strings, `data` the raw JSON entries behind them.
prompts, data = loadInputData()

<h2>Selecting the random races</h2>

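In each year between 2018 and 2023, except for 2020 (as this was a Covid year with many cancelled races), two races are randomly selected. The cell below is currently restricted to 2023; the full list of years is noted in a comment there.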

In [6]:
import random


def getRandomRacesInYear(keys, year):
    """Pick two distinct races of the given year at random.

    Parameters
    ----------
    keys : iterable of str
        Race keys; a key belongs to ``year`` when it starts with ``str(year)``.
    year : int or str
        The season to sample from.

    Returns
    -------
    list of str
        Two different race keys.

    Raises
    ------
    ValueError
        If fewer than two races are available for ``year``.
    """
    prefix = str(year)
    # dict.fromkeys drops duplicate keys while keeping their order, so the two
    # sampled races are guaranteed to differ.
    keys_fy = list(dict.fromkeys(k for k in keys if k.startswith(prefix)))
    if len(keys_fy) < 2:
        raise ValueError(
            'Need at least two races in ' + prefix + ', found ' + str(len(keys_fy)))
    # Sampling without replacement replaces the retry-on-collision recursion,
    # whose result was never returned to the caller.
    return random.sample(keys_fy, 2)


# Full set of seasons: [2018, 2019, 2021, 2022, 2023]; only 2023 is sampled for now.
years = [2023]
sel_races = []
for y in years:
    sel_races.extend(getRandomRacesInYear(list(data.keys()), y))

<h2>Generating the blog posts</h2>
`randomizeSeq` takes care of randomly selecting a sequence of 10 laps; `generatePosts` then generates the blog posts for the prompts of a single lap.

In [10]:
def generatePosts(prompts, max_new_tokens=500):
    """Generate one blog post per prompt with the fine-tuned model.

    Uses the module-level ``tok`` and ``model``.

    Parameters
    ----------
    prompts : list of str
        Prompt strings, tokenised and generated as a single padded batch.
    max_new_tokens : int, optional
        Upper bound on the number of generated tokens per post.

    Returns
    -------
    list of str
        The decoded posts, in the same order as ``prompts``. An empty input
        yields an empty list without calling the model.
    """
    if not prompts:
        return []
    # Keep the inputs on the same device as the model so generation also works
    # once the model has been moved to a GPU.
    model_inputs = tok(prompts, return_tensors='pt', padding=True).to(model.device)
    model_output = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    return tok.batch_decode(model_output, skip_special_tokens=True)


def randomizeSeq(prompts, nr_laps, seq_len=10):
    """Return a random run of ``seq_len`` consecutive entries of ``prompts``.

    Parameters
    ----------
    prompts : dict
        Lap key -> prompts of that lap; its insertion order defines which
        laps count as consecutive.
    nr_laps : int
        Number of laps to choose from (capped at ``len(prompts)``).
    seq_len : int, optional
        Length of the selected sequence.

    Returns
    -------
    dict
        The selected laps in their original order. When fewer than
        ``seq_len`` laps are available, all of them are returned.
    """
    items = list(prompts.items())
    nr_laps = min(nr_laps, len(items))
    if nr_laps <= seq_len:
        return dict(items[:nr_laps])
    # List slices are 0-based: valid start positions run from 0 up to and
    # including nr_laps - seq_len, so the first and last lap can be selected.
    start = random.randint(0, nr_laps - seq_len)
    return dict(items[start:start + seq_len])

<h2>Save CSV files</h2>

In [14]:
import numpy as np

import csv
import os

# Folder that receives one CSV file per selected race.
OUTPUT_DIR = './GeneratedPosts/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

for r in sel_races:
    print(r)
    nr_laps = len(prompts[r])
    prompt_seq = randomizeSeq(prompts[r], nr_laps)
    # One model call per lap; each lap yields one post per prompt.
    posts = {lap: generatePosts(lap_prompts)
             for lap, lap_prompts in prompt_seq.items()}

    rows = [('Lap', 'Data', 'Prompt', 'Output')]
    for lap, lap_posts in posts.items():
        for idx, post in enumerate(lap_posts):
            rows.append((lap,
                         str(data[r][lap][idx]),
                         str(prompts[r][lap][idx]),
                         str(post)))

    fn = OUTPUT_DIR + r + '.csv'
    # The csv module quotes fields containing ';', quotes or line breaks, so
    # generated text cannot break the column layout; UTF-8 keeps any character
    # the model emits writable.
    with open(fn, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter=';', lineterminator='\n').writerows(rows)

2023_jeddah
2023_miami
