In [None]:
import json
import pandas as pd
import random
from tqdm import tqdm
import os

# Set paths (all relative to the notebook's working directory)
# Root data directory; every other path in this cell is built from it.
BASE_PATH = "HybridQA"
# JSON file with the training examples; passed to load_train_data() below.
TRAIN_FILE = os.path.join(BASE_PATH, "train.json")
# Directory searched by load_table() for "<table_id>.json" files.
TABLES_PATH = os.path.join(BASE_PATH, "WikiTables-WithLinks", "tables_tok")

# Function: Load specified number of training data
def load_train_data(file_path, num_samples=None, seed=None):
    """Load training examples from a JSON file, optionally subsampling them.

    Args:
        file_path: Path to a JSON file; its top-level value is expected to be
            a list of examples.
        num_samples: If given and smaller than the number of examples, a random
            subset of this size is drawn without replacement. Otherwise all
            examples are returned in file order.
        seed: Optional seed for a private random generator, so the same subset
            can be drawn again. When None, the module-level ``random`` state is
            used.

    Returns:
        The list of (possibly subsampled) examples.

    Raises:
        ValueError: If ``num_samples`` is negative (raised by ``sample``).
    """
    # JSON is UTF-8; do not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    if num_samples is not None and num_samples < len(train_data):
        sampler = random if seed is None else random.Random(seed)
        train_data = sampler.sample(train_data, num_samples)

    print(f"Loaded {len(train_data)} training examples.")
    return train_data

# Function: Load and process tables
def load_table(table_id):
    """Load the table ``<table_id>.json`` from TABLES_PATH as a DataFrame.

    The JSON is read as a dict with a 'header' list and a 'data' list of rows.
    Only element 0 of each header entry and of each cell is kept.

    NOTE: a later cell in this notebook defines another ``load_table`` that
    returns a plain list of row dicts and shadows this one once it has run.

    Args:
        table_id: Table identifier; also the file name without ``.json``.

    Returns:
        pandas.DataFrame with one row per table row, keyed by header name.
        If two headers share a name, the later column overwrites the earlier.

    Raises:
        FileNotFoundError: If the table file does not exist.
        IndexError: If a row has more cells than there are headers.
    """
    table_file = os.path.join(TABLES_PATH, f"{table_id}.json")
    # Explicit UTF-8 so reading does not depend on the platform's locale encoding.
    with open(table_file, 'r', encoding='utf-8') as f:
        table_data = json.load(f)

    headers = [h[0] for h in table_data['header']]
    rows = [
        {headers[i]: cell[0] for i, cell in enumerate(row)}
        for row in table_data['data']
    ]

    return pd.DataFrame(rows)

# Prepare training data
def prepare_training_data(train_data, num_negative_samples=3):
    """Build (question, positive row, negative row) records for training.

    For every example, each row labelled 1 is paired with up to
    ``num_negative_samples`` randomly chosen rows labelled 0 from the same
    table. A fresh random draw is made for every positive row. Examples with
    no positive row, or with no negative row, contribute no records.

    Args:
        train_data: Iterable of dicts with 'question', 'table_id' and 'labels'
            keys, where 'labels' holds one 0/1 entry per table row.
        num_negative_samples: Maximum number of negatives per positive row.

    Returns:
        List of dicts with keys 'question', 'positive_row', 'negative_row'
        and 'table_id'. Rows are dicts mapping column name to cell value.
    """
    prepared_data = []
    for item in tqdm(train_data, desc="Preparing training data"):
        question = item['question']
        table_id = item['table_id']
        labels = item['labels']

        # Load table. Wrapping in pd.DataFrame is a no-op for a DataFrame and
        # converts a list of row dicts, so this still works after a later cell
        # redefines load_table to return a plain list.
        table = pd.DataFrame(load_table(table_id))

        # Both index lists depend only on the labels, so compute them once per
        # example instead of once per positive row.
        positive_indices = [i for i, label in enumerate(labels) if label == 1]
        negative_indices = [i for i, label in enumerate(labels) if label == 0]
        num_negatives = min(num_negative_samples, len(negative_indices))

        for pos_idx in positive_indices:
            positive_row = table.iloc[pos_idx].to_dict()

            # Independent draw of negatives for each positive row.
            for neg_idx in random.sample(negative_indices, num_negatives):
                prepared_data.append({
                    'question': question,
                    'positive_row': positive_row,
                    'negative_row': table.iloc[neg_idx].to_dict(),
                    'table_id': table_id
                })

    return prepared_data

# User input for the amount of data to process.
# NOTE(review): input() blocks an unattended "Run All"; a config constant may suit better.
num_samples = int(input("Enter the number of training samples to process (or 0 for all): "))
if num_samples <= 0:
    # Zero or negative means "use everything".
    num_samples = None

# Load training data
train_data = load_train_data(TRAIN_FILE, num_samples)

# Prepare training data
training_data = prepare_training_data(train_data)
print(f"Prepared {len(training_data)} training samples.")

# Show a sample. random.choice raises IndexError on an empty list, which
# happens when no example has both a positive and a negative row.
if training_data:
    sample = random.choice(training_data)
    print("Sample training data:")
    print(f"Question: {sample['question']}")
    print(f"Positive row: {sample['positive_row']}")
    print(f"Negative row: {sample['negative_row']}")
    print(f"Table ID: {sample['table_id']}")
else:
    print("No training samples were prepared; nothing to preview.")

# Save prepared training data. The file name carries the number of source
# examples (len(train_data)), not the number of prepared records.
output_file = os.path.join(BASE_PATH, f"prepared_dpr_training_data_{len(train_data)}_samples.json")
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f)
print(f"Saved prepared training data to {output_file}")

In [None]:
# Paths for the evaluation cell (relative to the notebook's working directory).
# BASE_PATH and TABLES_PATH repeat the values assigned in the first cell.
BASE_PATH = "HybridQA"
TEST_FILE = os.path.join(BASE_PATH, "dev.json")  # dev split; swap in the test file name to use the test split
# Directory searched by load_table() for "<table_id>.json" files.
TABLES_PATH = os.path.join(BASE_PATH, "WikiTables-WithLinks", "tables_tok")

def load_test_data(file_path, num_samples=None, seed=None):
    """Load evaluation examples from a JSON file, optionally subsampling them.

    Args:
        file_path: Path to a JSON file; its top-level value is expected to be
            a list of examples.
        num_samples: If given and smaller than the number of examples, a random
            subset of this size is drawn without replacement. Otherwise all
            examples are returned in file order.
        seed: Optional seed for a private random generator, so the same subset
            can be drawn again. When None, the module-level ``random`` state is
            used.

    Returns:
        The list of (possibly subsampled) examples.

    Raises:
        ValueError: If ``num_samples`` is negative (raised by ``sample``).
    """
    # JSON is UTF-8; do not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    if num_samples is not None and num_samples < len(test_data):
        sampler = random if seed is None else random.Random(seed)
        test_data = sampler.sample(test_data, num_samples)

    print(f"Loaded {len(test_data)} test examples.")
    return test_data

def load_table(table_id):
    """Load the table ``<table_id>.json`` from TABLES_PATH as a list of row dicts.

    The JSON is read as a dict with a 'header' list and a 'data' list of rows.
    Only element 0 of each header entry and of each cell is kept.

    NOTE: this definition replaces the earlier ``load_table`` in this notebook,
    which returns a DataFrame instead of a list.

    Args:
        table_id: Table identifier; also the file name without ``.json``.

    Returns:
        List with one dict per table row, mapping header name to cell value.
        If two headers share a name, the later column overwrites the earlier.

    Raises:
        FileNotFoundError: If the table file does not exist.
        IndexError: If a row has more cells than there are headers.
    """
    table_file = os.path.join(TABLES_PATH, f"{table_id}.json")
    # Explicit UTF-8 so reading does not depend on the platform's locale encoding.
    with open(table_file, 'r', encoding='utf-8') as f:
        table_data = json.load(f)

    headers = [h[0] for h in table_data['header']]
    rows = [
        {headers[i]: cell[0] for i, cell in enumerate(row)}
        for row in table_data['data']
    ]

    return rows

def prepare_test_data(test_data):
    """Turn raw evaluation examples into retrieval records.

    Each record holds the question, the table id, every row of the table, and
    the index of the first row labelled 1 (or -1 when no row is labelled 1).

    Args:
        test_data: Iterable of dicts with 'question', 'table_id' and 'labels'.

    Returns:
        List of dicts with keys 'question', 'table_id', 'rows' and
        'correct_row_index'.

    Raises:
        AssertionError: If the number of labels differs from the number of
            table rows.
    """

    def _to_record(example):
        # Build one output record from one raw example.
        table_id = example['table_id']
        labels = example['labels']
        table_rows = load_table(table_id)

        assert len(labels) == len(table_rows), f"Mismatch in labels and rows for table {table_id}"

        # Only the first positive row is recorded; -1 marks "none".
        try:
            first_positive = labels.index(1)
        except ValueError:
            first_positive = -1

        return {
            'question': example['question'],
            'table_id': table_id,
            'rows': table_rows,
            'correct_row_index': first_positive
        }

    return [_to_record(example) for example in tqdm(test_data, desc="Preparing test data")]

# User input for the amount of data to process.
# NOTE(review): input() blocks an unattended "Run All"; a config constant may suit better.
num_samples = int(input("Enter the number of test samples to process (or 0 for all): "))
if num_samples <= 0:
    # Zero or negative means "use everything".
    num_samples = None

test_data = load_test_data(TEST_FILE, num_samples)

prepared_test_data = prepare_test_data(test_data)

print(f"Prepared {len(prepared_test_data)} test samples.")

# Preview one record. random.choice raises IndexError on an empty list, so
# skip the preview when nothing was prepared.
if prepared_test_data:
    sample = random.choice(prepared_test_data)
    print("\nSample test data:")
    print(f"Question: {sample['question']}")
    print(f"Table ID: {sample['table_id']}")
    print(f"Number of rows: {len(sample['rows'])}")
    print(f"Correct row index: {sample['correct_row_index']}")
    # -1 means no row of the table was labelled as correct.
    if sample['correct_row_index'] != -1:
        print(f"Correct row: {sample['rows'][sample['correct_row_index']]}")
else:
    print("\nNo test samples were prepared; nothing to preview.")

output_file = os.path.join(BASE_PATH, f"prepared_dpr_test_data_{len(test_data)}_samples.json")
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(prepared_test_data, f)

print(f"\nSaved prepared test data to {output_file}")

In [7]:
import json
import os
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

# testing DPR
BASE_PATH = "HybridQA"


def resolve_prepared_data_file(base_path):
    """Pick the prepared-training-data JSON file to encode.

    The preparation cell saves its output as
    ``prepared_dpr_training_data_<N>_samples.json``, whereas this cell used to
    look only for ``prepared_dpr_training_data.json``. To keep both working:

    1. use ``prepared_dpr_training_data.json`` if it exists;
    2. otherwise use the most recently modified ``..._<N>_samples.json`` file;
    3. otherwise return the legacy name, so the later ``open`` fails with a
       clear FileNotFoundError.

    Args:
        base_path: Directory that holds the prepared data files.

    Returns:
        Path of the file to load.
    """
    import glob  # local import: only this helper needs it

    legacy = os.path.join(base_path, "prepared_dpr_training_data.json")
    if os.path.exists(legacy):
        return legacy

    # glob.escape protects against special characters in the directory name.
    pattern = os.path.join(glob.escape(base_path), "prepared_dpr_training_data_*_samples.json")
    candidates = glob.glob(pattern)
    if candidates:
        return max(candidates, key=os.path.getmtime)
    return legacy


PREPARED_DATA_FILE = resolve_prepared_data_file(BASE_PATH)

# DPR model definition
class DPRModel(nn.Module):
    """Dual-encoder wrapper with separate question and passage encoders.

    Both encoders are loaded from the same pretrained checkpoint name but are
    separate instances. Each ``encode_*`` method returns the hidden state at
    sequence position 0 of the encoder's last layer (presumably the [CLS]
    token for BERT-style checkpoints).
    """

    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.question_encoder = AutoModel.from_pretrained(model_name)
        self.passage_encoder = AutoModel.from_pretrained(model_name)

    @staticmethod
    def _first_position_state(encoder, input_ids, attention_mask):
        # Run the encoder and keep only sequence position 0 of its last layer.
        outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

    def encode_question(self, input_ids, attention_mask):
        """Embed a batch of tokenized questions with the question encoder."""
        return self._first_position_state(self.question_encoder, input_ids, attention_mask)

    def encode_passage(self, input_ids, attention_mask):
        """Embed a batch of tokenized passages with the passage encoder."""
        return self._first_position_state(self.passage_encoder, input_ids, attention_mask)

# Dataset class
class DPRDataset(Dataset):
    """Dataset of tokenized (question, positive row, negative row) triples.

    Each source record is a dict with a 'question' string and 'positive_row' /
    'negative_row' dicts. A row is turned into text by joining its values, in
    dict order, with single spaces. Every text is tokenized with truncation
    and padding to ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _row_text(row):
        # Column names are dropped; only the cell values are kept.
        return ' '.join(map(str, row.values()))

    def _tokenize(self, text):
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can stack items itself.
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

    def __getitem__(self, idx):
        record = self.data[idx]
        texts = {
            'question': record['question'],
            'positive': self._row_text(record['positive_row']),
            'negative': self._row_text(record['negative_row']),
        }
        return {field: self._tokenize(text) for field, text in texts.items()}

# Encoding function
def encode_data(model, dataset, device, batch_size=32):
    """Encode every question / positive / negative triple in ``dataset``.

    The model is put in eval mode and run without gradient tracking. Items are
    processed in dataset order (no shuffling), so row ``i`` of each returned
    array corresponds to ``dataset[i]``.

    Args:
        model: Object exposing ``eval()``, ``encode_question(input_ids,
            attention_mask)`` and ``encode_passage(input_ids, attention_mask)``.
        dataset: Dataset whose items are dicts with 'question', 'positive' and
            'negative' entries, each holding 'input_ids' and 'attention_mask'.
        device: Device the input tensors are moved to before encoding; the
            model is assumed to already live there.
        batch_size: Number of items encoded per forward pass. Defaults to 32,
            the value that used to be hard-coded.

    Returns:
        Tuple ``(question_embeddings, positive_embeddings, negative_embeddings)``
        of NumPy arrays, each with one entry per dataset item.
    """
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    question_embeddings = []
    positive_embeddings = []
    negative_embeddings = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Encoding"):
            q_emb = model.encode_question(
                batch['question']['input_ids'].to(device),
                batch['question']['attention_mask'].to(device)
            )
            p_emb = model.encode_passage(
                batch['positive']['input_ids'].to(device),
                batch['positive']['attention_mask'].to(device)
            )
            n_emb = model.encode_passage(
                batch['negative']['input_ids'].to(device),
                batch['negative']['attention_mask'].to(device)
            )

            # Move results to host memory; extend() adds one array per item.
            question_embeddings.extend(q_emb.cpu().numpy())
            positive_embeddings.extend(p_emb.cpu().numpy())
            negative_embeddings.extend(n_emb.cpu().numpy())

    return np.array(question_embeddings), np.array(positive_embeddings), np.array(negative_embeddings)

# Load saved training data
print(f"Loading prepared training data from {PREPARED_DATA_FILE}")
# Explicit UTF-8 so reading does not depend on the platform's locale encoding.
with open(PREPARED_DATA_FILE, "r", encoding="utf-8") as f:
    training_data = json.load(f)

print(f"Loaded {len(training_data)} prepared training samples.")

# Initialize device, tokenizer, and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = DPRModel('bert-base-uncased').to(device)

# Create dataset
dataset = DPRDataset(training_data, tokenizer)

# Encode data
question_embeddings, positive_embeddings, negative_embeddings = encode_data(model, dataset, device)

# Save encoded vectors
np.save(os.path.join(BASE_PATH, "question_embeddings.npy"), question_embeddings)
np.save(os.path.join(BASE_PATH, "positive_embeddings.npy"), positive_embeddings)
np.save(os.path.join(BASE_PATH, "negative_embeddings.npy"), negative_embeddings)

print("Encoded data saved.")

# Display shapes of encoded results
print(f"Question embeddings shape: {question_embeddings.shape}")
print(f"Positive embeddings shape: {positive_embeddings.shape}")
print(f"Negative embeddings shape: {negative_embeddings.shape}")

# Display a sample of encoded results. Indexing [0] would raise IndexError
# when the prepared data file is empty, so preview only when there is data.
if len(question_embeddings) > 0:
    print("\nSample embeddings:")
    print(f"Question: {question_embeddings[0][:5]}...")  # Only show first 5 elements
    print(f"Positive: {positive_embeddings[0][:5]}...")
    print(f"Negative: {negative_embeddings[0][:5]}...")
else:
    print("\nNo embeddings to preview (the prepared training data is empty).")

Loading prepared training data from Data/HybridQA/prepared_dpr_training_data_1_samples.json
Loaded 21 prepared training samples.
Using device: cpu


Encoding: 100%|███████████████████████████████████| 1/1 [00:11<00:00, 11.84s/it]

Encoded data saved.
Question embeddings shape: (21, 768)
Positive embeddings shape: (21, 768)
Negative embeddings shape: (21, 768)

Sample embeddings:
Question: [-0.5504775   0.31225264 -0.07330009 -0.06485221 -0.30499658]...
Positive: [-0.6236328  -0.2962152  -0.18471776 -0.6544239  -0.531056  ]...
Negative: [-0.5705106  -0.17978194 -0.13786103 -0.3468474  -0.24918976]...



