In [None]:
import json
import pandas as pd
import random
from tqdm import tqdm
import os

# Set paths
# BASE_PATH is a relative path, so it is resolved against the kernel's working directory.
BASE_PATH = "Data/HybridQA"
# JSON file with the training examples; passed to load_train_data further down.
TRAIN_FILE = os.path.join(BASE_PATH, "train.p.json")
# Directory holding one "<table_id>.json" file per table; read by load_table.
TABLES_PATH = os.path.join(BASE_PATH, "WikiTables-WithLinks", "tables_tok")

# Function: Load specified number of training data
def load_train_data(file_path, num_samples=None):
    """Load training examples from a JSON file, optionally subsampling them.

    Args:
        file_path: Path to a JSON file whose top-level value is a list of
            examples.
        num_samples: When not None and smaller than the number of loaded
            examples, that many examples are drawn without replacement with
            ``random.sample``. Otherwise every example is returned in file
            order.

    Returns:
        The list of loaded (and possibly subsampled) examples.
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the platform's
    # locale default, which would mis-decode non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    if num_samples is not None and num_samples < len(train_data):
        train_data = random.sample(train_data, num_samples)

    print(f"Loaded {len(train_data)} training examples.")
    return train_data

# Function: Load and process tables
def load_table(table_id):
    """Read one table file and return its cell values as a DataFrame.

    The file ``<TABLES_PATH>/<table_id>.json`` is expected to contain a
    ``'header'`` list and a ``'data'`` list of rows. Only element ``[0]`` of
    each header entry and of each cell is used; the remaining elements are
    ignored here (presumably link metadata -- confirm against the dataset).

    Args:
        table_id: Table identifier, used as the file stem.

    Returns:
        ``pandas.DataFrame`` with one row per table row, keyed by header text.
        Cells under a repeated header name overwrite each other because each
        row is built as a dict.

    Raises:
        IndexError: If a row has more cells than there are headers.
    """
    table_file = os.path.join(TABLES_PATH, f"{table_id}.json")
    # Pin UTF-8 so decoding does not depend on the locale default encoding.
    with open(table_file, 'r', encoding='utf-8') as f:
        table_data = json.load(f)

    headers = [h[0] for h in table_data['header']]
    rows = [
        {headers[i]: cell[0] for i, cell in enumerate(row)}
        for row in table_data['data']
    ]

    return pd.DataFrame(rows)

# Prepare training data
def prepare_training_data(train_data, num_negative_samples=3):
    """Build (question, positive row, negative row) training pairs.

    For every example, each row labelled 1 is paired with up to
    ``num_negative_samples`` rows labelled 0, drawn independently per positive
    row with ``random.sample``.

    Args:
        train_data: Iterable of dicts with ``'question'``, ``'table_id'`` and
            ``'labels'`` keys; ``labels[i]`` is read as the label of table
            row ``i``.
        num_negative_samples: Maximum number of negatives per positive row.

    Returns:
        List of dicts with keys ``'question'``, ``'positive_row'``,
        ``'negative_row'`` and ``'table_id'``. Examples with no positive or no
        negative row contribute nothing.
    """
    prepared_data = []
    for item in tqdm(train_data, desc="Preparing training data"):
        question = item['question']
        table_id = item['table_id']
        labels = item['labels']

        # Load table
        table = load_table(table_id)

        # Find positive samples (rows with label 1)
        positive_indices = [i for i, label in enumerate(labels) if label == 1]

        # The negative candidates depend only on the labels, so build them once
        # per example instead of once per positive row.
        negative_indices = [i for i, label in enumerate(labels) if label == 0]
        num_negatives = min(num_negative_samples, len(negative_indices))

        for pos_idx in positive_indices:
            positive_row = table.iloc[pos_idx].to_dict()

            # A fresh random draw for every positive row
            for neg_idx in random.sample(negative_indices, num_negatives):
                negative_row = table.iloc[neg_idx].to_dict()

                prepared_data.append({
                    'question': question,
                    'positive_row': positive_row,
                    'negative_row': negative_row,
                    'table_id': table_id
                })

    return prepared_data

# User input for the amount of data to process
num_samples = int(input("Enter the number of training samples to process (or 0 for all): "))
if num_samples <= 0:
    # None tells load_train_data to keep every example
    num_samples = None

# Load training data
train_data = load_train_data(TRAIN_FILE, num_samples)

# Prepare training data
training_data = prepare_training_data(train_data)
print(f"Prepared {len(training_data)} training samples.")

# Show a sample.
# random.choice raises IndexError on an empty list, which happens when none of
# the selected examples produced a positive/negative pair. Skip the preview in
# that case so the run still reaches the save step.
if training_data:
    sample = random.choice(training_data)
    print("Sample training data:")
    print(f"Question: {sample['question']}")
    print(f"Positive row: {sample['positive_row']}")
    print(f"Negative row: {sample['negative_row']}")
    print(f"Table ID: {sample['table_id']}")
else:
    print("No training samples were prepared; nothing to preview.")

# Save prepared training data.
# The file name carries the number of source examples, not the number of pairs.
output_file = os.path.join(BASE_PATH, f"prepared_dpr_training_data_{len(train_data)}_samples.json")
with open(output_file, 'w') as f:
    json.dump(training_data, f)
print(f"Saved prepared training data to {output_file}")

In [None]:
# Paths for the test-data preparation cell.
# NOTE: this cell has no import statements of its own; it uses os, json, random
# and tqdm as imported by the first cell, so that cell must run first.
BASE_PATH = "Data/HybridQA"
# JSON file the test examples are read from.
TEST_FILE = os.path.join(BASE_PATH, "dev.p.json")
# Directory holding one "<table_id>.json" file per table; read by load_table.
TABLES_PATH = os.path.join(BASE_PATH, "WikiTables-WithLinks", "tables_tok")

def load_test_data(file_path, num_samples=None):
    """Load test examples from a JSON file, optionally subsampling them.

    Args:
        file_path: Path to a JSON file whose top-level value is a list of
            examples.
        num_samples: When not None and smaller than the number of loaded
            examples, that many examples are drawn without replacement with
            ``random.sample``. Otherwise every example is returned in file
            order.

    Returns:
        The list of loaded (and possibly subsampled) examples.
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the platform's
    # locale default, which would mis-decode non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    if num_samples is not None and num_samples < len(test_data):
        test_data = random.sample(test_data, num_samples)

    print(f"Loaded {len(test_data)} test examples.")
    return test_data

def load_table(table_id):
    """Read one table file and return its rows as a list of dicts.

    The file ``<TABLES_PATH>/<table_id>.json`` is expected to contain a
    ``'header'`` list and a ``'data'`` list of rows. Only element ``[0]`` of
    each header entry and of each cell is used.

    Note: when this cell runs in the same kernel as the training-preparation
    cell, this definition rebinds ``load_table``. This version returns a plain
    list, whereas the earlier one returns a DataFrame.

    Args:
        table_id: Table identifier, used as the file stem.

    Returns:
        List with one ``{header_text: cell_text}`` dict per table row. Cells
        under a repeated header name overwrite each other.

    Raises:
        IndexError: If a row has more cells than there are headers.
    """
    table_file = os.path.join(TABLES_PATH, f"{table_id}.json")
    # Pin UTF-8 so decoding does not depend on the locale default encoding.
    with open(table_file, 'r', encoding='utf-8') as f:
        table_data = json.load(f)

    headers = [h[0] for h in table_data['header']]
    rows = [
        {headers[i]: cell[0] for i, cell in enumerate(row)}
        for row in table_data['data']
    ]

    return rows

def prepare_test_data(test_data):
    """Attach the full list of table rows and the gold row index to each test example.

    Args:
        test_data: Iterable of dicts with ``'question'``, ``'table_id'`` and
            ``'labels'`` keys.

    Returns:
        List of dicts with keys ``'question'``, ``'table_id'``, ``'rows'`` and
        ``'correct_row_index'``. The index is the position of the first label
        equal to 1, or -1 when no label is 1.

    Raises:
        AssertionError: If the number of labels differs from the number of rows
            in the table.
    """
    records = []
    for example in tqdm(test_data, desc="Preparing test data"):
        query = example['question']
        tid = example['table_id']
        row_labels = example['labels']

        rows_of_table = load_table(tid)

        assert len(row_labels) == len(rows_of_table), f"Mismatch in labels and rows for table {tid}"

        # list.index raises ValueError when there is no label equal to 1
        try:
            gold_index = row_labels.index(1)
        except ValueError:
            gold_index = -1

        record = {
            'question': query,
            'table_id': tid,
            'rows': rows_of_table,
            'correct_row_index': gold_index,
        }
        records.append(record)

    return records

# Ask how many test examples to process; zero or a negative number means "all"
num_samples = int(input("Enter the number of test samples to process (or 0 for all): "))
if num_samples <= 0:
    # None tells load_test_data to keep every example
    num_samples = None

test_data = load_test_data(TEST_FILE, num_samples)

prepared_test_data = prepare_test_data(test_data)

print(f"Prepared {len(prepared_test_data)} test samples.")

# random.choice raises IndexError on an empty list; skip the preview when
# nothing was prepared so the run still reaches the save step.
if prepared_test_data:
    sample = random.choice(prepared_test_data)
    print("\nSample test data:")
    print(f"Question: {sample['question']}")
    print(f"Table ID: {sample['table_id']}")
    print(f"Number of rows: {len(sample['rows'])}")
    print(f"Correct row index: {sample['correct_row_index']}")
    # -1 marks an example with no row labelled 1
    if sample['correct_row_index'] != -1:
        print(f"Correct row: {sample['rows'][sample['correct_row_index']]}")
else:
    print("\nNo test samples were prepared; nothing to preview.")

# The file name carries the number of source examples that were loaded
output_file = os.path.join(BASE_PATH, f"prepared_dpr_test_data_{len(test_data)}_samples.json")
with open(output_file, 'w') as f:
    json.dump(prepared_test_data, f)

print(f"\nSaved prepared test data to {output_file}")

In [7]:
import json
import os
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

# Set paths
BASE_PATH = "Data/HybridQA"
# File name follows the "prepared_dpr_training_data_<N>_samples.json" pattern
# written by the data-preparation cell; this one is hard-coded to N = 1.
PREPARED_DATA_FILE = os.path.join(BASE_PATH, "prepared_dpr_training_data_1_samples.json")

# DPR model definition
class DPRModel(nn.Module):
    """Dual encoder with separate question and passage encoders.

    Both encoders are loaded independently from the same pretrained checkpoint
    name, so they do not share weights.
    """

    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.question_encoder = AutoModel.from_pretrained(model_name)
        self.passage_encoder = AutoModel.from_pretrained(model_name)

    @staticmethod
    def _first_token_state(encoder, input_ids, attention_mask):
        """Run ``encoder`` and return the last-layer hidden state at sequence position 0."""
        outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

    def encode_question(self, input_ids, attention_mask):
        """Embed a batch of questions: one vector per sequence, taken at position 0."""
        return self._first_token_state(self.question_encoder, input_ids, attention_mask)

    def encode_passage(self, input_ids, attention_mask):
        """Embed a batch of passages: one vector per sequence, taken at position 0."""
        return self._first_token_state(self.passage_encoder, input_ids, attention_mask)

# Dataset class
class DPRDataset(Dataset):
    """Tokenized (question, positive row, negative row) triples.

    Each row dict is flattened into one string by joining ``str()`` of its
    values with single spaces. Column names are not part of the text.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _row_text(row):
        """Join the row's values, in dict order, into one space-separated string."""
        return ' '.join(map(str, row.values()))

    def _tokenize(self, text):
        """Tokenize ``text`` padded/truncated to ``max_length`` and drop the leading batch axis."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

    def __getitem__(self, idx):
        record = self.data[idx]
        # Build all three texts first, then tokenize them in a fixed order:
        # question, positive, negative.
        texts = {
            'question': record['question'],
            'positive': self._row_text(record['positive_row']),
            'negative': self._row_text(record['negative_row']),
        }
        return {part: self._tokenize(text) for part, text in texts.items()}

# Encoding function
def encode_data(model, dataset, device, batch_size=32):
    """Encode every (question, positive, negative) triple of ``dataset``.

    Args:
        model: Object exposing ``eval()``, ``encode_question`` and
            ``encode_passage``. It is switched to eval mode and left there.
        dataset: Dataset whose items are dicts with ``'question'``,
            ``'positive'`` and ``'negative'`` entries, each holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the input tensors are moved to before encoding.
        batch_size: Number of triples per forward pass. Defaults to 32, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        Tuple ``(question_embeddings, positive_embeddings,
        negative_embeddings)`` of NumPy arrays, with rows in dataset order
        because the loader does not shuffle.
    """
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    question_embeddings = []
    positive_embeddings = []
    negative_embeddings = []

    # Inference only: no gradients are needed
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Encoding"):
            q_emb = model.encode_question(
                batch['question']['input_ids'].to(device),
                batch['question']['attention_mask'].to(device)
            )
            p_emb = model.encode_passage(
                batch['positive']['input_ids'].to(device),
                batch['positive']['attention_mask'].to(device)
            )
            n_emb = model.encode_passage(
                batch['negative']['input_ids'].to(device),
                batch['negative']['attention_mask'].to(device)
            )

            # extend() iterates over the batch axis, adding one vector per sample
            question_embeddings.extend(q_emb.cpu().numpy())
            positive_embeddings.extend(p_emb.cpu().numpy())
            negative_embeddings.extend(n_emb.cpu().numpy())

    return np.array(question_embeddings), np.array(positive_embeddings), np.array(negative_embeddings)

# Load the saved prepared training data
print(f"Loading prepared training data from {PREPARED_DATA_FILE}")
with open(PREPARED_DATA_FILE, "r") as f:
    training_data = json.load(f)

print(f"Loaded {len(training_data)} prepared training samples.")

# Initialize the device, tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = DPRModel('bert-base-uncased').to(device)

# Create the dataset
dataset = DPRDataset(training_data, tokenizer)

# Encode the data (the model has not been fine-tuned in this cell)
question_embeddings, positive_embeddings, negative_embeddings = encode_data(model, dataset, device)

# Save the encoded vectors
np.save(os.path.join(BASE_PATH, "question_embeddings.npy"), question_embeddings)
np.save(os.path.join(BASE_PATH, "positive_embeddings.npy"), positive_embeddings)
np.save(os.path.join(BASE_PATH, "negative_embeddings.npy"), negative_embeddings)

print("Encoded data saved.")

# Show the shapes of the encoded results
print(f"Question embeddings shape: {question_embeddings.shape}")
print(f"Positive embeddings shape: {positive_embeddings.shape}")
print(f"Negative embeddings shape: {negative_embeddings.shape}")

# Show the encoded result for one sample (index 0; fails if nothing was encoded)
print("\nSample embeddings:")
print(f"Question: {question_embeddings[0][:5]}...")  # show only the first 5 elements
print(f"Positive: {positive_embeddings[0][:5]}...")
print(f"Negative: {negative_embeddings[0][:5]}...")

Loading prepared training data from Data/HybridQA/prepared_dpr_training_data_1_samples.json
Loaded 21 prepared training samples.
Using device: cpu


Encoding: 100%|███████████████████████████████████| 1/1 [00:11<00:00, 11.84s/it]

Encoded data saved.
Question embeddings shape: (21, 768)
Positive embeddings shape: (21, 768)
Negative embeddings shape: (21, 768)

Sample embeddings:
Question: [-0.5504775   0.31225264 -0.07330009 -0.06485221 -0.30499658]...
Positive: [-0.6236328  -0.2962152  -0.18471776 -0.6544239  -0.531056  ]...
Negative: [-0.5705106  -0.17978194 -0.13786103 -0.3468474  -0.24918976]...





In [1]:
# Cell 1: Imports and setup
import json
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from transformers import AutoModel, AutoTokenizer
import numpy as np
from tqdm import tqdm

# Set paths
BASE_PATH = "Data/HybridQA"
# Both file names follow the "<prefix>_<N>_samples.json" pattern written by the
# data-preparation cells; they are hard-coded to N = 1 here.
PREPARED_DATA_FILE = os.path.join(BASE_PATH, "prepared_dpr_training_data_1_samples.json")
TEST_DATA_FILE = os.path.join(BASE_PATH, "prepared_dpr_test_data_1_samples.json")

# Set the device: use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Cell 2: Model definition
class DPRModel(nn.Module):
    """Dual encoder with separate question and passage encoders.

    Both encoders are loaded independently from the same pretrained checkpoint
    name, so they do not share weights.
    """

    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.question_encoder = AutoModel.from_pretrained(model_name)
        self.passage_encoder = AutoModel.from_pretrained(model_name)

    @staticmethod
    def _first_token_state(encoder, input_ids, attention_mask):
        """Run ``encoder`` and return the last-layer hidden state at sequence position 0."""
        outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

    def encode_question(self, input_ids, attention_mask):
        """Embed a batch of questions: one vector per sequence, taken at position 0."""
        return self._first_token_state(self.question_encoder, input_ids, attention_mask)

    def encode_passage(self, input_ids, attention_mask):
        """Embed a batch of passages: one vector per sequence, taken at position 0."""
        return self._first_token_state(self.passage_encoder, input_ids, attention_mask)

# Cell 3: Dataset class definitions
class DPRDataset(Dataset):
    """Tokenized (question, positive row, negative row) triples.

    Each row dict is flattened into one string by joining ``str()`` of its
    values with single spaces. Column names are not part of the text.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _row_text(row):
        """Join the row's values, in dict order, into one space-separated string."""
        return ' '.join(map(str, row.values()))

    def _tokenize(self, text):
        """Tokenize ``text`` padded/truncated to ``max_length`` and drop the leading batch axis."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

    def __getitem__(self, idx):
        record = self.data[idx]
        # Build all three texts first, then tokenize them in a fixed order:
        # question, positive, negative.
        texts = {
            'question': record['question'],
            'positive': self._row_text(record['positive_row']),
            'negative': self._row_text(record['negative_row']),
        }
        return {part: self._tokenize(text) for part, text in texts.items()}

class DPRTestDataset(Dataset):
    """Test examples: one tokenized question plus every tokenized row of its table.

    Items hold a variable-length list of row encodings, and ``test_model``
    indexes this dataset directly rather than going through a DataLoader.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def _tokenize(self, text):
        """Tokenize ``text`` padded/truncated to ``max_length`` and drop the leading batch axis."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

    def __getitem__(self, idx):
        record = self.data[idx]
        query = record['question']
        table_rows = record['rows']

        # The question is tokenized first, then the rows in table order
        encoded_question = self._tokenize(query)

        encoded_rows = []
        for table_row in table_rows:
            # A row becomes its values joined by spaces; column names are not included
            row_text = ' '.join(map(str, table_row.values()))
            encoded_rows.append(self._tokenize(row_text))

        return {
            'question': encoded_question,
            'rows': encoded_rows,
            'table_id': record['table_id'],
            'correct_index': record['correct_row_index'],
        }

# Cell 4: Encoding and training functions
def encode_data(model, dataset, device, batch_size=32):
    """Encode every (question, positive, negative) triple of ``dataset``.

    Args:
        model: Object exposing ``eval()``, ``encode_question`` and
            ``encode_passage``. It is switched to eval mode and left there.
        dataset: Dataset whose items are dicts with ``'question'``,
            ``'positive'`` and ``'negative'`` entries, each holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the input tensors are moved to before encoding.
        batch_size: Number of triples per forward pass. Defaults to 32, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        Tuple ``(question_embeddings, positive_embeddings,
        negative_embeddings)`` of NumPy arrays, with rows in dataset order
        because the loader does not shuffle.
    """
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    question_embeddings = []
    positive_embeddings = []
    negative_embeddings = []

    # Inference only: no gradients are needed
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Encoding"):
            q_emb = model.encode_question(
                batch['question']['input_ids'].to(device),
                batch['question']['attention_mask'].to(device)
            )
            p_emb = model.encode_passage(
                batch['positive']['input_ids'].to(device),
                batch['positive']['attention_mask'].to(device)
            )
            n_emb = model.encode_passage(
                batch['negative']['input_ids'].to(device),
                batch['negative']['attention_mask'].to(device)
            )

            # extend() iterates over the batch axis, adding one vector per sample
            question_embeddings.extend(q_emb.cpu().numpy())
            positive_embeddings.extend(p_emb.cpu().numpy())
            negative_embeddings.extend(n_emb.cpu().numpy())

    return np.array(question_embeddings), np.array(positive_embeddings), np.array(negative_embeddings)

def train(model, train_dataloader, optimizer, scheduler, device, epochs,
          margin=0.1, max_grad_norm=1.0):
    """Fine-tune both encoders with a margin ranking loss on cosine similarity.

    For every triple, the question/positive cosine similarity is pushed to
    exceed the question/negative one by at least ``margin``.

    Args:
        model: Module exposing ``encode_question`` and ``encode_passage``.
        train_dataloader: Iterable of batches with ``'question'``,
            ``'positive'`` and ``'negative'`` entries, each holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler; stepped once per epoch.
        device: Device the input tensors are moved to.
        epochs: Number of passes over ``train_dataloader``.
        margin: Margin of the ranking loss. Defaults to 0.1, the value that
            used to be hard-coded.
        max_grad_norm: Gradient-norm clipping threshold. Defaults to 1.0, the
            value that used to be hard-coded.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches. Previously this
            surfaced as a ZeroDivisionError when averaging the loss.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()

            q_emb = model.encode_question(
                batch['question']['input_ids'].to(device),
                batch['question']['attention_mask'].to(device)
            )
            p_emb = model.encode_passage(
                batch['positive']['input_ids'].to(device),
                batch['positive']['attention_mask'].to(device)
            )
            n_emb = model.encode_passage(
                batch['negative']['input_ids'].to(device),
                batch['negative']['attention_mask'].to(device)
            )

            # Compute similarities
            pos_score = F.cosine_similarity(q_emb, p_emb)
            neg_score = F.cosine_similarity(q_emb, n_emb)

            # Compute the loss; a target of +1 means "pos_score should rank higher"
            loss = F.margin_ranking_loss(pos_score, neg_score,
                                         torch.ones_like(pos_score),
                                         margin=margin)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)  # gradient clipping
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Counting batches while iterating avoids calling len() on the loader
        # and lets the empty case be reported clearly.
        if num_batches == 0:
            raise ValueError("train_dataloader produced no batches; nothing to train on")

        scheduler.step()
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# 单元格 5: 测试函数
def test_model(model, test_dataset, device):
    model.eval()
    results = []
    
    with torch.no_grad():
        for idx in range(len(test_dataset)):
            batch = test_dataset[idx]
            q_emb = model.encode_question(
                batch['question']['input_ids'].unsqueeze(0).to(device),
                batch['question']['attention_mask'].unsqueeze(0).to(device)
            )
            
            row_embeddings = []
            for row in batch['rows']:
                r_emb = model.encode_passage(
                    row['input_ids'].unsqueeze(0).to(device),
                    row['attention_mask'].unsqueeze(0).to(device)
                )
                row_embeddings.append(r_emb)
            
            row_embeddings = torch.cat(row_embeddings, dim=0)
            
            similarities = torch.matmul(q_emb, row_embeddings.t()).squeeze()
            predicted_index = similarities.argmax().item()
            
            results.append({
                'table_id': batch['table_id'],
                'label': predicted_index,
                'row': test_dataset.data[idx]['rows'][predicted_index],
                'answer_node': batch['correct_index']
            })
    
    return results

# Cell 6: Data loading and model initialization
# Load the training data
print(f"Loading prepared training data from {PREPARED_DATA_FILE}")
with open(PREPARED_DATA_FILE, "r") as f:
    training_data = json.load(f)
print(f"Loaded {len(training_data)} prepared training samples.")

# Load the test data
print(f"Loading test data from {TEST_DATA_FILE}")
with open(TEST_DATA_FILE, "r") as f:
    test_data = json.load(f)
print(f"Loaded {len(test_data)} test samples.")

# Initialize the tokenizer and model (both from the same checkpoint name)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = DPRModel('bert-base-uncased').to(device)

# Cell 7: Train the model
# Create the training dataset and data loader
train_dataset = DPRDataset(training_data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Set up the optimizer and learning-rate scheduler.
# train() calls scheduler.step() once per epoch, so with step_size=1 and
# gamma=0.9 the learning rate is multiplied by 0.9 after every epoch.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

# Train the model
print("Starting model training...")
train(model, train_dataloader, optimizer, scheduler, device, epochs=5)

# Cell 8: Encode the data
# Encode the training data with the model trained above
print("Encoding training data...")
question_embeddings, positive_embeddings, negative_embeddings = encode_data(model, train_dataset, device)

# Save the encoded vectors
np.save(os.path.join(BASE_PATH, "question_embeddings.npy"), question_embeddings)
np.save(os.path.join(BASE_PATH, "positive_embeddings.npy"), positive_embeddings)
np.save(os.path.join(BASE_PATH, "negative_embeddings.npy"), negative_embeddings)

print("Encoded data saved.")

# Show the shapes of the encoded results
print(f"Question embeddings shape: {question_embeddings.shape}")
print(f"Positive embeddings shape: {positive_embeddings.shape}")
print(f"Negative embeddings shape: {negative_embeddings.shape}")

# Show the encoded result for one sample (index 0; fails if nothing was encoded)
print("\nSample embeddings:")
print(f"Question: {question_embeddings[0][:5]}...")  # show only the first 5 elements
print(f"Positive: {positive_embeddings[0][:5]}...")
print(f"Negative: {negative_embeddings[0][:5]}...")

# Cell 9: Test the model
# Create the test dataset
test_dataset = DPRTestDataset(test_data, tokenizer)

# Test the model
test_results = test_model(model, test_dataset, device)

# Print the test results.
# In each result, 'label' is the predicted row index and 'answer_node' is the
# correct row index carried over from the test data.
print("\nTest Results:")
for result in test_results:
    print(f"Table ID: {result['table_id']}")
    print(f"Predicted Row: {result['label']}")
    print(f"Correct Row: {result['answer_node']}")
    print(f"Retrieved Row Content: {result['row']}")
    print("---")

# Save the test results (written to the current working directory, not BASE_PATH)
with open("predict_result.json", "w") as f:
    json.dump(test_results, f, indent=2)
print("Test results saved to predict_result.json")

Using device: cpu
Loading prepared training data from Data/HybridQA/prepared_dpr_training_data_1_samples.json
Loaded 21 prepared training samples.
Loading test data from Data/HybridQA/prepared_dpr_test_data_1_samples.json
Loaded 1 test samples.
Starting model training...


Training Epoch 1: 100%|███████████████████████████| 2/2 [01:41<00:00, 50.87s/it]


Epoch 1/5, Average Loss: 0.0814


Training Epoch 2: 100%|███████████████████████████| 2/2 [01:44<00:00, 52.32s/it]


Epoch 2/5, Average Loss: 0.0850


Training Epoch 3: 100%|███████████████████████████| 2/2 [01:47<00:00, 53.91s/it]


Epoch 3/5, Average Loss: 0.0622


Training Epoch 4: 100%|███████████████████████████| 2/2 [01:44<00:00, 52.44s/it]


Epoch 4/5, Average Loss: 0.0370


Training Epoch 5: 100%|███████████████████████████| 2/2 [01:46<00:00, 53.25s/it]


Epoch 5/5, Average Loss: 0.0070
Encoding training data...


Encoding: 100%|███████████████████████████████████| 1/1 [00:13<00:00, 13.26s/it]


Encoded data saved.
Question embeddings shape: (21, 768)
Positive embeddings shape: (21, 768)
Negative embeddings shape: (21, 768)

Sample embeddings:
Question: [-1.1217889   0.34976014 -0.4115776  -0.44641808 -0.4475675 ]...
Positive: [-0.4455727  -0.06987131 -0.17461737 -0.593914   -0.49142316]...
Negative: [-0.21959794 -0.02402857 -0.05738551 -0.30179015 -0.08461066]...

Test Results:
Table ID: List_of_libraries_in_Barcelona_0
Predicted Row: 5
Correct Row: 1
Retrieved Row Content: {'Name': 'Canyelles', 'Locality': 'Canyelles', 'District': 'Nou Barris', 'Opened': '1994', 'Named after': "The neighbourhood 's name", 'Transport links': 'Canyelles'}
---
Test results saved to predict_result.json
