In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import torch

os.chdir('/content/drive/MyDrive/Colab Notebooks/google-2e')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
train_df = pd.read_csv('train.tsv', sep = '\t', header = None)
dev_df = pd.read_csv('dev.tsv', sep = '\t', header = None)
test_df = pd.read_csv('test.tsv', sep = '\t', header = None)

train_df.columns = ['Query', 'Well-Formed Rank']
dev_df.columns = ['Query', 'Well-Formed Rank']
test_df.columns = ['Query', 'Well-Formed Rank']

train_df

Unnamed: 0,Query,Well-Formed Rank
0,The European Union includes how many ?,0.2
1,What are Mia Hamms accomplishment ?,0.4
2,Which form of government is still in place in ...,1.0
3,When was the canal de panama built ?,0.8
4,What color is the black box on commercial aero...,0.6
...,...,...
17495,What is the youngest college graduate ?,0.4
17496,Pros and cons of making dams ?,0.0
17497,Life and story of jose ma panganiban ?,0.0
17498,How do you apply for a super delegate position ?,1.0


In [5]:
train_df.apply(lambda x: x.isna().sum())

Unnamed: 0,0
Query,0
Well-Formed Rank,0


In [6]:
train_df['Query'] = train_df['Query'].str.rstrip('?')
dev_df['Query'] = dev_df['Query'].str.rstrip('?')
test_df['Query'] = test_df['Query'].str.rstrip('?')

filtered_train = train_df[train_df['Well-Formed Rank']>=0.5]
filtered_dev = dev_df[dev_df['Well-Formed Rank']>=0.5]
filtered_test = test_df[test_df['Well-Formed Rank']>=0.5]

filtered_train

Unnamed: 0,Query,Well-Formed Rank
2,Which form of government is still in place in ...,1.0
3,When was the canal de panama built,0.8
4,What color is the black box on commercial aero...,0.6
6,How did samoans come into samoa,0.8
9,What is the value of military payment certific...,0.8
...,...,...
17491,What are some consequences of cyber bullying,1.0
17492,What are the 4 major climate types of northern...,1.0
17494,How does a sound wave transmit matter,1.0
17498,How do you apply for a super delegate position,1.0


In [9]:
# finetune gpt model
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

# prepare datasets
train_queries = filtered_train['Query'].tolist()
dev_queries = filtered_dev['Query'].tolist()

# we pad labels to train for autocompletion
def prepare_inputs_and_labels(queries, n): # i think the issue is here, the model keeps suggesting n words and doesn't know where to stop --> suggest a still incomplete query
    inputs = []
    labels = []

    for query in queries:
        input_ids = tokenizer(query, truncation=True, return_tensors='pt')['input_ids'][0]
        inputs.append(tokenizer.decode(input_ids[:-n], skip_special_tokens=True))
        labels.append(tokenizer.decode(input_ids, skip_special_tokens=True))

    return inputs, labels

inputs_train, labels_train = prepare_inputs_and_labels(train_queries, n=3)
inputs_dev, labels_dev = prepare_inputs_and_labels(dev_queries, n=3)

tokenized_train_cleaned = tokenizer(inputs_train, padding=True, truncation=True, return_tensors='pt')
tokenized_dev_cleaned = tokenizer(inputs_dev, padding=True, truncation=True, return_tensors='pt')

train_labels = torch.full(tokenized_train_cleaned['input_ids'].shape, -100)
for i in range(len(labels_train)):
    label_ids = tokenizer(labels_train[i], truncation=True, return_tensors='pt')['input_ids'][0]
    train_labels[i, :label_ids.shape[0]] = label_ids[:tokenized_train_cleaned['input_ids'].shape[1]]

dev_labels = torch.full(tokenized_dev_cleaned['input_ids'].shape, -100)
for i in range(len(labels_dev)):
    label_ids = tokenizer(labels_dev[i], truncation=True, return_tensors='pt')['input_ids'][0]
    dev_labels[i, :label_ids.shape[0]] = label_ids[:tokenized_dev_cleaned['input_ids'].shape[1]]

tokenized_train_cleaned['labels'] = train_labels
tokenized_dev_cleaned['labels'] = dev_labels

# create dataset
class QueryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = QueryDataset(tokenized_train_cleaned)
dev_dataset = QueryDataset(tokenized_dev_cleaned)

# train model on dataset (takes 5-10 min w/ gpu)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()

# save finetuned model
model.save_pretrained('./gpt_model_save1')
tokenizer.save_pretrained('./gpt_model_save1')

Step,Training Loss
500,4.7308
1000,4.5124
1500,4.3496
2000,4.3027
2500,3.9114
3000,3.6215
3500,3.5608
4000,3.5788
4500,3.4965
5000,3.1759


('./gpt_model_save1/tokenizer_config.json',
 './gpt_model_save1/special_tokens_map.json',
 './gpt_model_save1/vocab.json',
 './gpt_model_save1/merges.txt',
 './gpt_model_save1/added_tokens.json')

In [10]:
# test model
# load in model
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer.pad_token = tokenizer.eos_token

# model = GPT2LMHeadModel.from_pretrained('gpt2')
# model = model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained('./gpt_model_save1')
model = GPT2LMHeadModel.from_pretrained('./gpt_model_save1')
model = model.to(device)
test_queries = test_df['Query'][:10].tolist()
n = 3

for query in test_queries:

    input_ids_full = tokenizer(query, return_tensors='pt')

    # get the incomplete input by removing the last n tokens
    input_ids = input_ids_full['input_ids'][0][:-n].unsqueeze(0)
    attention_mask = input_ids_full['attention_mask'][:, :-n]
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_length=input_ids.shape[1] + n + 5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        repetition_penalty=1.2
    )

    predicted_completion = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Input (Incomplete query): {tokenizer.decode(input_ids[0], skip_special_tokens=True)}")
    print(f"Full query: {query}")
    print(f"Completion: {predicted_completion}\n")

Input (Incomplete query): Interesting facts
Full query: Interesting facts about Egypt 
Completion: Interesting facts about the nazi 's control of

Input (Incomplete query): What is thais in phuket
Full query: What is thais in phuket famous for 
Completion: What is thais in phuket and what is their capital city of ph

Input (Incomplete query): What places have the olig
Full query: What places have the oligarchy government 
Completion: What places have the oligarchy structure in spain and what are

Input (Incomplete query): Where is the radiator fan relay located at 97
Full query: Where is the radiator fan relay located at 97 voyager 
Completion: Where is the radiator fan relay located at 97 ywtr jr xbl z

Input (Incomplete query): When was the first
Full query: When was the first helicopters built 
Completion: When was the first computer made after being invented by the Chinese

Input (Incomplete query): Where is atp synthesized
Full query: Where is atp synthesized and stored 
Completio

# Vanessa's updated version


In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

# prepare datasets
train_queries = filtered_train['Query'].tolist()
dev_queries = filtered_dev['Query'].tolist()

# we pad labels to train for autocompletion
def prepare_inputs_and_labels(queries, n):
    inputs = []
    labels = []

    for query in queries:
        input_ids = tokenizer(query, truncation=True, return_tensors='pt')['input_ids'][0]
        inputs.append(tokenizer.decode(input_ids[:-n], skip_special_tokens=True))
        labels.append(tokenizer.decode(input_ids, skip_special_tokens=True))

    return inputs, labels

inputs_train, labels_train = prepare_inputs_and_labels(train_queries, n=3)
inputs_dev, labels_dev = prepare_inputs_and_labels(dev_queries, n=3)

tokenized_train_cleaned = tokenizer(inputs_train, padding=True, truncation=True, return_tensors='pt')
tokenized_dev_cleaned = tokenizer(inputs_dev, padding=True, truncation=True, return_tensors='pt')

train_labels = torch.full(tokenized_train_cleaned['input_ids'].shape, -100)
for i in range(len(labels_train)):
    label_ids = tokenizer(labels_train[i], truncation=True, return_tensors='pt')['input_ids'][0]
    train_labels[i, :label_ids.shape[0]] = label_ids[:tokenized_train_cleaned['input_ids'].shape[1]]

dev_labels = torch.full(tokenized_dev_cleaned['input_ids'].shape, -100)
for i in range(len(labels_dev)):
    label_ids = tokenizer(labels_dev[i], truncation=True, return_tensors='pt')['input_ids'][0]
    dev_labels[i, :label_ids.shape[0]] = label_ids[:tokenized_dev_cleaned['input_ids'].shape[1]]

tokenized_train_cleaned['labels'] = train_labels
tokenized_dev_cleaned['labels'] = dev_labels

# create dataset
class QueryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = QueryDataset(tokenized_train_cleaned)
dev_dataset = QueryDataset(tokenized_dev_cleaned)

# train model on dataset (takes 5-10 min w/ gpu)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1, # VH: reduced epoch
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()

# save finetuned model
model.save_pretrained('./gpt_model_save2')
tokenizer.save_pretrained('./gpt_model_save2')

Step,Training Loss
500,4.7273
1000,4.5046
1500,4.3462
2000,4.3171


('./gpt_model_save2/tokenizer_config.json',
 './gpt_model_save2/special_tokens_map.json',
 './gpt_model_save2/vocab.json',
 './gpt_model_save2/merges.txt',
 './gpt_model_save2/added_tokens.json')

In [12]:
# # Load fine-tuned model for testing
# tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')
# model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
# model = model.to(device)

test_queries = test_df['Query'][:10].tolist()

for query in test_queries:
    # tokenize and truncate input
    input_ids_full = tokenizer(query, return_tensors='pt')
    n = 3  # number of tokens to truncate for input
    input_ids = input_ids_full['input_ids'][0][:-n].unsqueeze(0).to(device)
    attention_mask = input_ids_full['attention_mask'][:, :-n].to(device)

    # (played around tbh)
    # limit len output
    # and lower traninig samples
    outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=input_ids.shape[1] + n + 10,
    num_return_sequences=1,  #try 3
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=16,               # lower top_k: focus on most likely next tokens
    top_p=0.5,              # lower top_p: narrow token slt
    temperature=0.7,        # lower temp: less randomness
    repetition_penalty=2.0  # increase to avoid repetition
  )


    # decode and post-process the output
    predicted_completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if not predicted_completion.endswith("?"):
        predicted_completion += "?"  # add ? if missing

    print(f"Input (Incomplete query): {tokenizer.decode(input_ids[0], skip_special_tokens=True)}")
    print(f"Full query: {query}")
    print(f"Completion: {predicted_completion}\n")

Input (Incomplete query): Interesting facts
Full query: Interesting facts about Egypt 
Completion: Interesting facts about the world in which it was invented by the inventor of a?

Input (Incomplete query): What is thais in phuket
Full query: What is thais in phuket famous for 
Completion: What is thais in phuket 's surname and what does it mean to be a monk of?

Input (Incomplete query): What places have the olig
Full query: What places have the oligarchy government 
Completion: What places have the oligarchs in america and what is their role as a government of?

Input (Incomplete query): Where is the radiator fan relay located at 97
Full query: Where is the radiator fan relay located at 97 voyager 
Completion: Where is the radiator fan relay located at 97.1 miles per hour on a 2000 cherokee tah?

Input (Incomplete query): When was the first
Full query: When was the first helicopters built 
Completion: When was the first movie made in america and what did it mean to him as?

Input (In



```
Query: Interesting facts about Egypt
Process Input (truncate last n tokens): Interesting facts ...
Full query: Interesting facts about Egypt
```



In [13]:
# # Load fine-tuned model for testing
# tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')
# model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
# model = model.to(device)

test_queries = test_df['Query'][:10].tolist()

for query in test_queries:
    # tokenize and truncate input
    input_ids_full = tokenizer(query, return_tensors='pt')
    n = 3  # number of tokens to truncate for input
    input_ids = input_ids_full['input_ids'][0][:-n].unsqueeze(0).to(device)
    attention_mask = input_ids_full['attention_mask'][:, :-n].to(device)

    # (played around tbh)
    # limit len output
    # and lower traninig samples
    outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=input_ids.shape[1] + n + 10,
    num_return_sequences=1,  #try 3
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=17,               # lower top_k: focus on most likely next tokens
    top_p=0.5,              # lower top_p: narrow token slt
    temperature=0.8,        # lower temp: less randomness
    repetition_penalty=2.0 # increase to avoid repetition
  )


    # decode and post-process the output
    predicted_completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if not predicted_completion.endswith("?"):
        predicted_completion += "?"  # add ? if missing

    print(f"Input (Incomplete query): {tokenizer.decode(input_ids[0], skip_special_tokens=True)}")
    print(f"Full query: {query}")
    print(f"Completion: {predicted_completion}\n")

Input (Incomplete query): Interesting facts
Full query: Interesting facts about Egypt 
Completion: Interesting facts about the world of science fiction and fantasy writing in julian?

Input (Incomplete query): What is thais in phuket
Full query: What is thais in phuket famous for 
Completion: What is thais in phuket and how does it affect the world economy ia ica ?

Input (Incomplete query): What places have the olig
Full query: What places have the oligarchy government 
Completion: What places have the oligarchy in Greece and what is its role as a political party of?

Input (Incomplete query): Where is the radiator fan relay located at 97
Full query: Where is the radiator fan relay located at 97 voyager 
Completion: Where is the radiator fan relay located at 97.5 volts and how many miles from New York to San Francisco?

Input (Incomplete query): When was the first
Full query: When was the first helicopters built 
Completion: When was the first ever movie made and what did it mean to y

# Try to remove the trailing of the open-ended outputs

- Currently doesn't seem much better. It removed trailings but doesn't provide suggestions for other inputs
- Needs further work


In [14]:
test_queries = test_df['Query'][:10].tolist()

import re

def clean_completion(completion):
    # hardcode rm trailing
    completion = re.sub(r'\b(and|what|how|does|is|was|to|by)\b.*$', '', completion).strip()
    return completion


for query in test_queries:
    # tokenize and truncate input
    input_ids_full = tokenizer(query, return_tensors='pt')
    n = 3  # number of tokens to truncate for input
    input_ids = input_ids_full['input_ids'][0][:-n].unsqueeze(0).to(device)
    attention_mask = input_ids_full['attention_mask'][:, :-n].to(device)


    outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=input_ids.shape[1] + 10,  # changed here (remove +n)
    num_return_sequences=2,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=20,               # lower top_k to focus on most likely next tokens
    top_p=0.8,              # lower top_p to further narrow token slt
    temperature=0.5,        # lower temp for less randomness
    repetition_penalty=2.0  # increase penalty to avoid repetition
  )

    # decode and post-process the output
    predicted_completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_completion = clean_completion(predicted_completion)

    if not predicted_completion.endswith("?"):
        predicted_completion += "?"  # add ? if missing

    print(f"Input (Incomplete query): {tokenizer.decode(input_ids[0], skip_special_tokens=True)}")
    print(f"Full query: {query}")
    print(f"Completion: {predicted_completion}\n")

Input (Incomplete query): Interesting facts
Full query: Interesting facts about Egypt 
Completion: Interesting facts about the world in australia?

Input (Incomplete query): What is thais in phuket
Full query: What is thais in phuket famous for 
Completion: What?

Input (Incomplete query): What places have the olig
Full query: What places have the oligarchy government 
Completion: What places have the oligarchy in czech republics?

Input (Incomplete query): Where is the radiator fan relay located at 97
Full query: Where is the radiator fan relay located at 97 voyager 
Completion: Where?

Input (Incomplete query): When was the first
Full query: When was the first helicopters built 
Completion: When?

Input (Incomplete query): Where is atp synthesized
Full query: Where is atp synthesized and stored 
Completion: Where?

Input (Incomplete query): How much chemical engineers
Full query: How much chemical engineers are paid 
Completion: How much chemical engineers make a year in the US?

Inp

# V3


a silly output of this version


- Input (Incomplete query): How much chemical engineers
- Full query: How much chemical engineers are paid
- Completion: How much chemical engineers do in the US  and how many are there on earth?


