## Loading Data and Model

In [None]:
import numpy as np
import torch
import math
import matplotlib.pyplot as plt
import pandas as pd
import os
from pathlib import Path
import zipfile
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import DataLoader
from PIL import Image
from torchvision import transforms
import torch.nn.functional as F
from IPython.display import display
from PIL import Image
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk


import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

### hint_description is a dictionary that links every hint's ID to its corresponding description
### there are a total of 118 hints 

# Root directory of the on-disk datasets used throughout this notebook.
TRAIN_TEXT = "/bohr/train-7ul9/v2"
# Load the hint table, then re-key it by hint ID so that the description (and icon)
# of a hint can be looked up directly from the IDs stored in each example.
hint_description = Dataset.load_from_disk(TRAIN_TEXT + "/dataset/hint_descriptions")
hint_description = {
    x['ID']: {'description': x['Description'], 'icons': x['image']}
    for x in hint_description
}

### 20 samples of hints and label, for validation

validation_data = Dataset.load_from_disk(TRAIN_TEXT + "/dataset/takehome_validation")


from transformers import AutoTokenizer, AutoModelForMaskedLM

# Masked-language model and its tokenizer, both loaded from the same local directory.
model_dir = '/personal/bert-large-uncased/'
model = AutoModelForMaskedLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

### Select Best Prompt using Sentence Transformer

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import itertools
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import pairwise_distances

# Sentence encoder used to embed candidate prompts (loaded from a local path).
st_model = SentenceTransformer('/all-MiniLM-L6-v2')

# The bare string below is disabled fine-tuning code for st_model. Being a plain
# string expression it is never executed; it is kept only for reference.
# NOTE(review): inside it, `{val['label']}` builds a set rather than a string —
# fix that before re-enabling the code.
'''
train_examples = []
for val in validation_data:
  train_examples.append(InputExample(texts=[give_best_prompt(val['hints']), {val['label']}], label=1))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)

train_loss = losses.CosineSimilarityLoss(st_model)

st_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=5,
    output_path='./model',
    optimizer_params={'lr': 1e-5},
    weight_decay=0.01,
    save_best_model=True,
    show_progress_bar=True
)
'''

In [None]:
# Add a local directory to NLTK's resource search path
# (presumably where the tagger data needed by nltk.pos_tag lives — TODO confirm).
nltk.data.path.append("/personal/")

def tuple_to_prompt(tuple):
    """Render a sequence of words as a single prompt sentence.

    The first word is followed by " related to ", every middle word by
    ", and ", and the last word by ".". For example
    ``("Clothing", "Head", "High")`` becomes
    ``"A Clothing related to Head, and High."``.

    A one-element input only gets the " related to " suffix (no final
    period) and an empty input yields just ``"A "``.
    """
    final_position = len(tuple) - 1
    pieces = ["A "]
    for position, term in enumerate(tuple):
        if position == 0:
            suffix = " related to "
        elif position < final_position:
            suffix = ", and "
        else:
            suffix = "."
        pieces.append(term + suffix)
    return "".join(pieces)

def all_possible_prompts(hints: list[int]):
    """Enumerate every prompt that can be built from the given hint IDs.

    Each hint's description (looked up in ``hint_description``) is split on
    newlines and then on " - ", giving a list of alternative terms per hint.
    One term is picked per hint in every possible way (Cartesian product).

    Returns:
        A pair ``(prompt_list, cartesian_list)`` of equal length:
        ``cartesian_list[k]`` is the tuple of chosen terms and
        ``prompt_list[k]`` is that tuple rendered by ``tuple_to_prompt``.
    """
    terms_per_hint = []
    for hint_id in hints:
        terms = []
        for line in hint_description[hint_id]['description'].split('\n'):
            terms.extend(line.split(" - "))
        terms_per_hint.append(terms)

    cartesian_list = list(itertools.product(*terms_per_hint))
    prompt_list = [tuple_to_prompt(combination) for combination in cartesian_list]
    return prompt_list, cartesian_list

def find_closest_outlier(embeddings, eps=0.5, min_samples=5):
    """Return the index of the DBSCAN noise point nearest to any clustered point.

    ``embeddings`` is clustered with ``DBSCAN(eps, min_samples)``. When no point
    was assigned to a cluster, the index of the first noise point is returned.
    Otherwise the (clustered point, noise point) pair with the smallest distance
    is located and the position of that noise point in ``embeddings`` is returned.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(embeddings).labels_
    is_noise = labels == -1
    noise_positions = np.where(is_noise)[0]

    clustered_points = embeddings[~is_noise]
    if len(clustered_points) == 0:
        return noise_positions[0]

    # Rows: clustered points, columns: noise points.
    distances = pairwise_distances(clustered_points, embeddings[is_noise])
    nearest_noise_per_row = distances.argmin(axis=1)
    tightest_row = distances.min(axis=1).argmin()
    return noise_positions[nearest_noise_per_row[tightest_row]]
    
def find_middle_sentence(embeddings):
    """Return the index of the embedding closest to the centre of the largest cluster.

    The embeddings are clustered with ``DBSCAN(eps=0.5, min_samples=5)``. If no
    cluster is found (every point is noise) the choice is delegated to
    ``find_closest_outlier``. Otherwise the cluster with the most members is
    selected, its mean vector is computed, and the position (in ``embeddings``)
    of the member nearest to that mean is returned.
    """
    labels = DBSCAN(eps=0.5, min_samples=5).fit(embeddings).labels_

    cluster_ids = set(labels)
    cluster_ids.discard(-1)  # -1 is DBSCAN's noise label
    if len(cluster_ids) == 0:
        return find_closest_outlier(embeddings)

    def cluster_size(cluster_id):
        return np.sum(labels == cluster_id)

    largest_cluster = max(cluster_ids, key=cluster_size)
    member_positions = np.where(labels == largest_cluster)[0]
    members = embeddings[member_positions]
    centroid = np.mean(members, axis=0)

    nearest_member, _ = pairwise_distances_argmin_min(centroid.reshape(1, -1), members)
    return member_positions[nearest_member[0]]

def adj_adjust(hints:tuple) -> str:
    """Build a "[MASK]" sentence from hint words, placing adjectives before the head word.

    The first element of ``hints`` is used as the head word. Every other element
    is POS-tagged with ``nltk.pos_tag``: words tagged JJ/JJR/JJS are treated as
    adjectives and inserted directly after "A [MASK] is a "; all other words are
    appended after " connected to ", joined with ", and ".

    Returns:
        The assembled sentence, or the sentinel string "Too Many Adj" when three
        or more adjectives were found.
    """
    sentence = "A [MASK] is a "
    noun_list = []
    adj_list = []
    for i, hint in enumerate(hints):
        word = hint
        if i == 0:
            # The first hint is the head word and is never tagged.
            sentence += word
        else:
            # Tag the word inside the frame "A <word> cat" so the tagger has some context.
            tags = nltk.pos_tag(["A",word.lower(),"cat"]) 
            if tags[1][1] in ["JJ","JJR","JJS"]:
                adj_list.append(word)
                # Insert right after the fixed prefix, so a later adjective ends up
                # in front of an earlier one.
                sentence = sentence.replace("A [MASK] is a ",f"A [MASK] is a {word} ")
            else:
                noun_list.append(word)
    # Sentinel return value: callers must compare against this exact string.
    if len(adj_list)>=3:
        return "Too Many Adj"
    if len(noun_list)>0:
        sentence += " connected to "
    # Note: the closing "." is only added after the last noun, so a sentence
    # without nouns has no trailing period.
    for i in range(len(noun_list)):
        word = noun_list[i]
        sentence += word
        if i < len(noun_list)-1:
            sentence += ", and "
        else:
            sentence +="."
    # Hint words may contain line breaks; flatten them into comma-separated text.
    sentence = sentence.replace('\n', ', ')
    return sentence

def give_best_prompt(hints: list[int]):
    """Pick the most representative prompt for a set of hint IDs.

    All candidate prompts are generated with ``all_possible_prompts``, embedded
    with ``st_model`` and the most central one is selected by
    ``find_middle_sentence``. The chosen word tuple is then rewritten by
    ``adj_adjust`` so that adjectives precede the head word.

    Only when ``adj_adjust`` gives up (it returns the sentinel "Too Many Adj")
    is the plain prompt used instead, with its leading article turned into
    "A [MASK] is a ".

    Returns:
        A sentence containing exactly one "[MASK]" placeholder.
    """
    prompts_list, cartesian_list = all_possible_prompts(hints)
    embeddings = st_model.encode(prompts_list)
    index = find_middle_sentence(embeddings)

    best_prompt = adj_adjust(cartesian_list[index])
    # The previous check compared the value with itself (always true), which
    # discarded the adj_adjust result every time. Compare with the sentinel.
    if best_prompt == "Too Many Adj":
        # count=1: only rewrite the leading article, never an "A " that happens
        # to occur inside a hint word.
        best_prompt = prompts_list[index].replace("A ", "A [MASK] is a ", 1)
    return best_prompt
    
# Print the generated prompt followed by the gold label for the first 20
# validation examples (20 is hard-coded; it matches the stated validation size).
for i in range (20):
    print(give_best_prompt(validation_data[i]['hints']))
    print(validation_data[i]['label'])

### Using BERT to fill [MASK]

In [None]:
# Demo: let the masked LM fill the [MASK] slot of one hand-written prompt.
text = 'A [MASK] is a type of Clothing related to Head, and High, and Defence.'

inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Column index (token position) of every mask token in the encoded input.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Vocabulary logits at the mask position(s) of the first (only) sequence.
mask_logits = outputs.logits[0, mask_token_index, :]

probabilities = torch.nn.functional.softmax(mask_logits, dim=-1)
predicted_token_id = torch.argmax(probabilities).item()

# Only used by the commented-out print below.
predicted_token = tokenizer.decode(predicted_token_id)

print(f"Input text: {text}")
print()
#print(f"Predicted token for [MASK]: {predicted_token}")

# Show the ten highest-scoring vocabulary tokens for the first mask position.
top_k = 10
top_k_tokens_ids = torch.topk(mask_logits, top_k, dim=1).indices[0].tolist()
top_k_tokens = [tokenizer.decode([token_id]) for token_id in top_k_tokens_ids]

print(f"Top {top_k} predictions:")
for i, token in enumerate(top_k_tokens):
    print(f"{i+1}. {token}")

In [None]:
def BERTTopKMaskWord(input_sentence,candidate_words):
    """Rank candidate words for the [MASK] slot of a sentence with the masked LM.

    Args:
        input_sentence: Text containing the tokenizer's mask token. If it occurs
            more than once, only the first occurrence is scored.
        candidate_words: Sequence of candidate strings. Each one is mapped to a
            single vocabulary id with ``tokenizer.convert_tokens_to_ids``, so a
            word that is not a single vocabulary token is scored as the
            tokenizer's unknown token.

    Returns:
        Up to 10 distinct candidates, best first, taken from the 20
        highest-scoring candidates (fewer if fewer candidates are supplied).

    Raises:
        ValueError: If ``input_sentence`` contains no mask token.

    Side effects:
        Moves the global ``model`` to the CPU (callers that train on another
        device must move it back). The model's train/eval mode is restored
        before returning.
    """
    if len(candidate_words) == 0:
        return []

    model.to('cpu')
    inputs = tokenizer(input_sentence, return_tensors='pt')
    mask_positions = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]
    if mask_positions.numel() == 0:
        raise ValueError("input_sentence must contain the tokenizer's mask token")

    # Score in eval mode so dropout does not perturb the ranking, even when this
    # is called from inside a training loop; put the previous mode back afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Index a single position so the result is always a 1-D vocabulary vector.
    mask_logits = logits[0, mask_positions[0], :]

    candidate_indices = [tokenizer.convert_tokens_to_ids(word) for word in candidate_words]
    candidate_logits = mask_logits[candidate_indices]

    # topk raises when k exceeds the number of candidates, so clamp it.
    k = min(20, len(candidate_words))
    _, topk_indices = torch.topk(candidate_logits, k)

    result_list = []
    for position in topk_indices.tolist():
        word = candidate_words[position]
        if word not in result_list:  # the same word may be listed more than once
            result_list.append(word)
            if len(result_list) == 10:
                break

    return result_list

In [None]:
def score(guesses: list[str], gold: str):
    """Score a ranked guess list against the gold answer (case-insensitive).

    Only the first ten guesses are considered.

    Returns:
        A dict with
        - "hits@10": 1.0 if the gold word is among them, else 0.0;
        - "ndcg@10": 1 / log2(rank + 2) for the 0-based rank of the first
          match, else 0.0;
        - "total_score": 0.9 * hits@10 + 0.1 * ndcg@10.
    """
    top_ten = [guess.lower() for guess in guesses[:10]]
    target = gold.lower()

    hit = 0.0
    gain = 0.0
    if target in top_ten:
        position = top_ten.index(target)
        hit = 1.0
        gain = 1.0 / math.log2(position + 2)  # +2 because position is 0-based

    return {
        "hits@10": hit,
        "ndcg@10": gain,
        "total_score": 0.9 * hit + 0.1 * gain,
    }

#print(score(['cat', 'dog', 'tree', 'flower', 'rock', 'water', 'fried rice', 'airplane', 'cactus', 'tiger'], gold='cactus'))

In [None]:
# Baseline evaluation of the pretrained model: rank each example's options for the
# [MASK] slot, print the sentence / gold label / hit flag, and average the scores.
# NOTE(review): hints_to_sentence_ori is not defined in the visible part of this
# file — confirm it is defined before this cell runs.
total_scores=0
for example in validation_data:
    result_list = BERTTopKMaskWord( hints_to_sentence_ori(example['hints']), example['options'] )
    print(hints_to_sentence_ori(example['hints']),example['label'],example['label'] in result_list)
    total_scores += score(result_list, example['label'])['total_score']
print(f"Average validation score: {total_scores / len(validation_data)}")

## Fine Tuning



In [None]:
import pandas as pd

def parse_txt_file(file_path):
    """Parse a blank-line-separated text file of puzzles into a DataFrame.

    The file is split into blocks on empty lines. A block is used only when it
    has exactly 5 or 6 lines:

    - lines 1-3: the three hints (a trailing ";" and surrounding whitespace are
      removed);
    - second-to-last line: the options, written like ``{a, b, c}`` (braces and
      all spaces are removed, then split on ",");
    - last line: the label.

    In a 6-line block the fourth line is ignored. Blocks of any other length
    are skipped silently.

    Args:
        file_path: Path of the text file; it is read as UTF-8.

    Returns:
        A DataFrame with columns ``hint1``, ``hint2``, ``hint3``, ``options``
        (list of str) and ``label``.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()

    rows = []
    for block in content.split('\n\n'):
        lines = block.strip().split('\n')
        if len(lines) not in (5, 6):
            continue

        hints = [line.rstrip(';').strip() for line in lines[:3]]
        # Options and label are always the last two lines, which covers both the
        # 5-line layout and the 6-line layout with its extra fourth line.
        options = lines[-2].strip().strip('{}').replace(' ', '').split(',')
        label = lines[-1].strip()
        rows.append([*hints, options, label])

    return pd.DataFrame(rows, columns=['hint1', 'hint2', 'hint3', 'options', 'label'])


# Parse the fine-tuning puzzles from a local text file and preview the first rows.
file_path = '/personal/2.txt'
data = parse_txt_file(file_path)
data.head()

In [None]:
def all_possible_prompts_df(df, idx):
    """List every prompt that can be built from row ``idx`` of ``df``.

    The first three columns of the row (by position) each hold a hint whose
    alternatives are separated by ", ". One alternative is chosen per hint in
    every possible way and each choice is rendered with ``tuple_to_prompt``.
    """
    alternatives = [df.iloc[idx, column].split(', ') for column in range(3)]
    return [tuple_to_prompt(choice) for choice in itertools.product(*alternatives)]
    
def give_best_prompt_df(df,idx):
    """Return the most representative "[MASK]" prompt for row ``idx`` of ``df``.

    All prompts for the row are generated, embedded with ``st_model`` and the
    most central one is chosen by ``find_middle_sentence``. Its leading article
    is then expanded to "A [MASK] is a type of ".
    """
    prompts_list = all_possible_prompts_df(df,idx)
    embeddings = st_model.encode(prompts_list)
    index = find_middle_sentence(embeddings)
    # count=1: rewrite only the leading "A ", never an "A " that happens to
    # occur inside one of the hint words.
    return prompts_list[index].replace("A ", "A [MASK] is a type of ", 1)

In [None]:
def hints_to_sentence_df(df,idx):
    """Build a "[MASK]" sentence from the first alternative of each hint in a row.

    The first three columns of row ``idx`` (by position) are each split on ", "
    and only the first piece is kept. The result has the form
    ``"[MASK] is a <h1> connected to <h2>, and <h3>."`` with any line breaks
    replaced by ", ".
    """
    first, second, third = (
        df.iloc[idx, column].split(', ')[0] for column in range(3)
    )
    sentence = f"[MASK] is a {first} connected to {second}, and {third}."
    return sentence.replace('\n', ', ')

def choice_to_doc(choice:str)->str:
    """Wrap a label in the fixed "Our target word: ..." prompt."""
    return "Our target word: {}".format(choice)

# Sanity check: show the generated sentence and the label of one parsed row.
i=14
print(hints_to_sentence_df(data,i))
print(data.loc[i,'label'])

In [None]:
class LoRA(nn.Module):
    """Low-rank adapter computing ``B(A(x)) * scaling``.

    ``A`` projects ``in_features`` down to ``rank`` and ``B`` projects back up to
    ``out_features``; neither has a bias. ``A`` starts from N(0, 0.02) weights
    and ``B`` from zeros, so a fresh adapter outputs zeros.

    NOTE(review): ``scaling`` is set to ``alpha`` itself, not ``alpha / rank`` —
    confirm this is intended.
    """

    def __init__(self, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        # Rank of the low-rank decomposition; controls the adapter size.
        self.rank = rank
        # Multiplier applied to the adapter output.
        self.scaling = alpha
        # Down-projection (low-rank matrix A) and up-projection (low-rank matrix B).
        self.A = nn.Linear(in_features, rank, bias=False)
        self.B = nn.Linear(rank, out_features, bias=False)

        # Parameter initialisation: small random A, all-zero B.
        with torch.no_grad():
            self.A.weight.normal_(mean=0.0, std=0.02)
            self.B.weight.zero_()

    def forward(self, x):
        low_rank = self.A(x)
        return self.B(low_rank) * self.scaling

def apply_lora(model, rank=8):
    """Freeze ``model`` and attach a trainable LoRA adapter to every square Linear layer.

    All existing parameters get ``requires_grad = False``. Each ``nn.Linear``
    whose input and output widths are equal then receives a ``LoRA`` submodule
    (registered as ``module.lora``) and its ``forward`` is replaced by one that
    returns ``original(x) + lora(x)``. The model is modified in place.

    Args:
        model: Any ``nn.Module``.
        rank: Rank passed to each ``LoRA`` adapter.
    """
    for param in model.parameters():
        param.requires_grad = False

    # Collect the targets before touching the tree: attaching an adapter adds
    # submodules, and a live named_modules() traversal would also visit the
    # freshly attached adapters (and wrap them again whenever rank equals the
    # layer width).
    targets = [
        module
        for _, module in model.named_modules()
        if isinstance(module, nn.Linear) and module.in_features == module.out_features
    ]

    for module in targets:
        # Place the adapter on the device of the layer it wraps; this works for
        # any nn.Module, not only models exposing a `.device` attribute.
        lora = LoRA(module.in_features, module.out_features, rank=rank).to(module.weight.device)
        setattr(module, "lora", lora)
        original_forward = module.forward

        # Bind the current layer and adapter as defaults so each wrapper keeps
        # its own pair instead of the last one seen by the loop.
        def forward_with_lora(x, layer1=original_forward, layer2=lora):
            return layer1(x) + layer2(x)

        module.forward = forward_with_lora

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset over the parsed puzzle DataFrame (columns hint1..3, options, label).

    Each item is ``(sentence, candidate, label)``:

    - ``sentence``: tokenizer output for ``hints_to_sentence_df(data, idx)``,
      padded to 50 tokens, as "pt" tensors with a leading dimension of 1;
    - ``candidate``: numpy array of vocabulary ids for the row's options, padded
      with the pad token up to 60 entries (longer option lists are not
      truncated);
    - ``label``: vocabulary id of the label, via ``convert_tokens_to_ids``.
    """

    def __init__(self,data,tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        # Use the tokenizer given to the constructor; it was previously stored
        # but ignored in favour of a module-level global.
        tokenizer = self.tokenizer

        sentence = hints_to_sentence_df(self.data,idx)
        candidate = self.data.loc[idx,'options']
        label = self.data.loc[idx,'label']

        sentence = tokenizer(sentence, return_tensors='pt',padding='max_length',max_length=50)

        # Pad the option list to a fixed length so items can be batched.
        padding_length = 60 - len(candidate)
        candidate = candidate + [tokenizer.pad_token] * padding_length
        candidate = np.array([tokenizer.convert_tokens_to_ids(word) for word in candidate])

        label = tokenizer.convert_tokens_to_ids(label)

        return sentence, candidate, label

class MyDataset_DS(torch.utils.data.Dataset):
    """Dataset over records that expose 'hints', 'options' and 'label' by key.

    ``data[idx]`` must return such a record. Each item is
    ``(sentence, candidate, label)``:

    - ``sentence``: tokenizer output for ``hints_to_sentence_ori(hints)``,
      padded to 50 tokens, as "pt" tensors with a leading dimension of 1;
    - ``candidate``: numpy array of vocabulary ids for the options, padded with
      the pad token up to 60 entries (longer option lists are not truncated);
    - ``label``: vocabulary id of the label, via ``convert_tokens_to_ids``.
    """

    def __init__(self,data,tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        # Use the tokenizer given to the constructor; it was previously stored
        # but ignored in favour of a module-level global.
        tokenizer = self.tokenizer

        record = self.data[idx]
        sentence = hints_to_sentence_ori(record['hints'])
        candidate = record['options']
        label = record['label']

        sentence = tokenizer(sentence, return_tensors='pt',padding='max_length',max_length=50)

        # Pad the option list to a fixed length so items can be batched.
        padding_length = 60 - len(candidate)
        candidate = candidate + [tokenizer.pad_token] * padding_length
        candidate = np.array([tokenizer.convert_tokens_to_ids(word) for word in candidate])

        label = tokenizer.convert_tokens_to_ids(label)

        return sentence, candidate, label

In [None]:
def train_model(model, loader, epochs=3, learning_rate=5e-5):
    """Fine-tune a masked LM to pick the labelled option for the [MASK] slot.

    For every batch the vocabulary logits at each sentence's first mask position
    are gathered at the candidate ids, soft-maxed over the candidates, and
    compared (binary cross-entropy) with a 0/1 target marking the candidates
    equal to the label. After each epoch the model is scored on the global
    ``test_dataset`` with ``BERTTopKMaskWord`` and ``score``.

    Args:
        model: Masked-LM model whose output exposes ``.logits``.
        loader: Iterable of ``(sentence, candidate, label)`` batches as produced
            by the dataset classes in this file — assumes ``sentence`` tensors
            are (batch, 1, seq), ``candidate`` is (batch, n_candidates) and
            ``label`` is (batch,); TODO confirm against the loader in use.
        epochs: Number of passes over ``loader``.
        learning_rate: SGD learning rate.

    Returns:
        ``(log1, log2)``: per-epoch summed training loss and per-epoch average
        validation score over ``test_dataset``.

    Raises:
        ValueError: If a training sentence contains no mask token.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    log1=[]
    log2=[]

    for epoch in range(epochs):

        model.train()
        # BERTTopKMaskWord moves the model to the CPU during validation, so it
        # has to be moved back at the start of every epoch.
        model.to(device)
        train_loss = 0

        for sentence, candidate, label in loader:

            # Drop the per-item leading dimension of 1 added by the tokenizer.
            sentence = {k: v.squeeze(1).to(device) for k, v in sentence.items()}
            label = label.to(device)
            candidate = candidate.to(device=device, dtype=torch.int64)

            logits = model(**sentence).logits

            # One mask position per row as a plain (batch,) index tensor. This
            # keeps mask_logits at (batch, vocab) for every batch size,
            # including 1.
            is_mask = sentence['input_ids'] == tokenizer.mask_token_id
            if not bool(is_mask.any(dim=1).all()):
                raise ValueError("every training sentence must contain a mask token")
            mask_positions = is_mask.int().argmax(dim=1)  # first mask in each row
            batch_rows = torch.arange(logits.size(0), device=logits.device)
            mask_logits = logits[batch_rows, mask_positions, :]

            candidate_logits = torch.gather(mask_logits, dim=1, index=candidate)
            probs = torch.softmax(candidate_logits, dim=-1)

            # 1 wherever a candidate id equals the label id, 0 elsewhere.
            label_probs = (candidate == label.unsqueeze(1)).to(probs.dtype)

            loss = criterion(probs, label_probs)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        # Validate without dropout; model.train() is re-applied next epoch.
        model.eval()
        total_scores=0
        for example in test_dataset:
            result_list = BERTTopKMaskWord(hints_to_sentence_ori(example['hints']),example['options'])
            total_scores += score(result_list, example['label'])['total_score']
        # Average over the examples actually scored, and log the same number
        # that is printed.
        average_score = total_scores / len(test_dataset)
        print(f"Train Loss: {train_loss:.8f},     "+f"Average validation score: {average_score}")
        log1.append(train_loss)
        log2.append(average_score)
    return log1, log2

In [None]:
# Reload a fresh copy of the pretrained model and tokenizer, then freeze it and
# attach rank-32 LoRA adapters (apply_lora modifies the model in place).
model_dir = '/personal/bert-large-uncased/'
model = AutoModelForMaskedLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
apply_lora(model,rank=32)

In [None]:
def print_model_layers(model, indent=0):
    """Print the module tree of ``model`` with the trainability of each child.

    For every direct child a line "<name>: <ClassName>" is printed, followed
    (when the child has parameters) by the ``requires_grad`` flag of its first
    parameter. Children that have children of their own are printed
    recursively, two indent levels deeper. One indent level is two spaces.
    """
    pad = "  " * indent
    for child_name, child in model.named_children():
        print(f"{pad}{child_name}: {type(child).__name__}")

        first_param = next(child.parameters(), None)
        if first_param is not None:
            print(f"{pad}  requires_grad={first_param.requires_grad}")

        has_children = next(child.named_children(), None) is not None
        if has_children:
            print_model_layers(child, indent + 2)
#print_model_layers(model)

In [None]:
# Split the validation examples by position: even positions are used for
# training, odd positions for testing.
print(len(validation_data))
train_indices = list(range(0, len(validation_data), 2))
test_indices = list(range(1, len(validation_data), 2))
train_dataset = validation_data.select(train_indices)
test_dataset = validation_data.select(test_indices)

In [None]:
# Fine-tune on the training half in shuffled batches of 5 and keep the per-epoch
# loss and validation-score logs returned by train_model.
mydataset = MyDataset_DS(train_dataset,tokenizer)
loader = DataLoader(mydataset,batch_size=5,shuffle=True)
losslog, scorelog = train_model(model,loader,epochs=30,learning_rate=5e-5)