In [1]:
import pandas as pd
train_file_path = '/shareddata/data/project2/Task1/squad_v2/squad_v2/train-00000-of-00001.parquet'
validation_file_path = '/shareddata/data/project2/Task1/squad_v2/squad_v2/validation-00000-of-00001.parquet'

# Reading the parquet files into pandas DataFrames
train_df = pd.read_parquet(train_file_path)
test_df = pd.read_parquet(validation_file_path)

In [2]:
from datasets import Dataset, DatasetDict
def process_squad_v2(data):
    questions = data['question']
    contexts = data['context']
    answers = data['answers']
    
    inputs = [f"question: {q} context: {c}" for q, c in zip(questions, contexts)]
    targets = [a["text"][0] if len(a["text"]) > 0 else "no answer" for a in answers]
    return {"input_texts": inputs, "target_texts": targets}

# 划分训练集和验证集 
validation_size = 5000
validation_df = train_df[:validation_size]
train_df = train_df[validation_size:]

train_inputs, train_targets = process_squad_v2(train_df)
val_inputs, val_targets = process_squad_v2(validation_df)
test_inputs, test_targets = process_squad_v2(test_df)

# 转换为 HuggingFace Dataset 格式
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

# 处理数据
train_dataset = train_dataset.map(process_squad_v2, batched=True, remove_columns=train_dataset.column_names)
validation_dataset = validation_dataset.map(process_squad_v2, batched=True, remove_columns=validation_dataset.column_names)
test_dataset = test_dataset.map(process_squad_v2, batched=True, remove_columns=test_dataset.column_names)

# 检查处理后的数据
print(train_dataset[0])

Map:   0%|          | 0/125319 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

{'input_texts': 'question: What happened after Kanye made his controversial statement? context: Myers spoke next and continued to read the script. Once it was West\'s turn to speak again, he said, "George Bush doesn\'t care about black people." At this point, telethon producer Rick Kaplan cut off the microphone and then cut away to Chris Tucker, who was unaware of the cut for a few seconds. Still, West\'s comment reached much of the United States.', 'target_texts': 'Rick Kaplan cut off the microphone and then cut away to Chris Tucker'}


In [3]:
import ray
ray.shutdown()

import json
import pandas as pd
from ray.util.dask import enable_dask_on_ray

import dask.dataframe as dd
import pyarrow as pa
import os
from ray.air import session
from ray.air.config import ScalingConfig, RunConfig
import ray.train as train
from ray.train.torch import TorchCheckpoint, TorchTrainer
import torch
import torch.nn as nn
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import ray.data as ray_data
import torch.optim as optim
from torch.utils.data import Dataset

from ray import tune
from ray.train import Checkpoint

from torch.optim import AdamW
from datasets import Dataset, DatasetDict
model_path = "/data/lab/project2/flan-t5-small"

In [4]:
def load_dataset(path: str, *, include_label=True):
    df = pd.read_csv(path)
    arrow_table = pa.Table.from_pandas(df)
    ray_dataset = ray.data.from_arrow(arrow_table)
    return ray_dataset

# Initial Ray
ray.shutdown()
ray.init(ignore_reinit_error=True, num_cpus=4,local_mode=True)

# Process Dataset in Ray
train_ray = load_dataset('./Q1_data/train_data.csv')
test_ray = load_dataset('./Q1_data/test_data.csv')
val_ray = load_dataset('./Q1_data/val_data.csv')

2024-06-03 09:02:34,151	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8267 [39m[22m


:job_id:01000000
:task_name:get_table_block_metadata
:actor_name:_StatsActor


:job_id:01000000
:task_name:get_table_block_metadata
:actor_name:_StatsActor


:task_name:get_table_block_metadata
:task_name:get_table_block_metadata


:task_name:get_table_block_metadata
:task_name:get_table_block_metadata


In [9]:
# 尝试加载本地模型和 tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)
    print("模型和tokenizer加载成功")
except Exception as e:
    print(f"加载失败: {e}")

模型和tokenizer加载成功


In [6]:
class QADataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=512):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        input_encodings = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length)
        target_encodings = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length)

        input_ids = input_encodings['input_ids']
        attention_mask = input_encodings['attention_mask']
        labels = target_encodings['input_ids']

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# 初始化分词器
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

# 创建数据集实例
train_dataset = QADataset(train_inputs, train_targets, tokenizer)
val_dataset = QADataset(val_inputs, val_targets, tokenizer)
test_dataset = QADataset(test_inputs, test_targets, tokenizer)

# 创建数据加载器
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [11]:
train_loader.dataset[0]

{'input_ids': tensor([  822,    10,   363,   844,   410,   493,    63, 14549,  5978,    16,
           116,   255,    47,  1710,    95,    58,  2625,    10,   493,    63,
           106,    75,   154,  3156,     7,   693,  8900,   965,    18,  6936,
           449,    41,    87,   115,    23,     2,   354,     2,    29,     7,
            15,     2,    87,    36,    15,    18,   476,  4170,    18,  8735,
            61,    41,  7473,  1600,  6464, 15465,    61,    19,    46,   797,
          7634,     6,     3, 21101,     6,  1368,  8211,    11, 15676,     5,
         12896,    11,  3279,    16,  8018,     6,  2514,     6,   255,  3032,
            16,   796,  8782,    11, 10410,  2259,     7,    38,     3,     9,
           861,     6,    11,  4659,    12, 10393,    16,     8,  1480,  5541,
             7,    38,   991,  7634,    13,   391,   184,   279,  3202,    18,
         10739, 19344,    63,    31,     7,  9364,     5, 19607,    26,    57,
           160,  2353,     6,  9762,   

In [7]:
def tokenize_batch(batch, tokenizer, max_length=512):
    inputs = list(batch["input_texts"])
    targets = list(batch["target_texts"])
    
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", text_target=targets)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the datasets
tokenized_train_ray = train_ray.map_batches(tokenize_batch, batch_format="pandas", fn_kwargs={"tokenizer": tokenizer})
tokenized_val_ray = val_ray.map_batches(tokenize_batch, batch_format="pandas", fn_kwargs={"tokenizer": tokenizer})
tokenized_test_ray = test_ray.map_batches(tokenize_batch, batch_format="pandas", fn_kwargs={"tokenizer": tokenizer})

In [18]:
# Convert Ray Datasets to Hugging Face Datasets
train_dataset = Dataset.from_pandas(tokenized_train_ray.to_pandas())
val_dataset = Dataset.from_pandas(tokenized_val_ray.to_pandas())
test_dataset = Dataset.from_pandas(tokenized_test_ray.to_pandas())

In [20]:
# Create a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Save the tokenized datasets to disk
dataset_dict.save_to_disk('./squad_v2_tokenized_datasets')