In [2]:
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import ray
from ray.train.huggingface.transformers import (
    RayTrainReportCallback,
    prepare_trainer,
)
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
import datasets
from config import *

# Checkpoint directory to restore from (a path under ray_results/TorchTrainer_...).
checkpoint_dir = "/root/ray_results/TorchTrainer_2024-05-29_07-12-59/TorchTrainer_e1617_00000_0_2024-05-29_07-12-59/checkpoint_000000/checkpoint"

# Fail fast when the checkpoint path is missing, before trying to load anything.
if not os.path.exists(checkpoint_dir):
    raise FileNotFoundError("Checkpoint directory not found: {}".format(checkpoint_dir))

# Restore the model and its tokenizer from the same checkpoint directory.
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
print("Model and tokenizer loaded from checkpoint:", checkpoint_dir)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model and tokenizer loaded from checkpoint: /root/ray_results/TorchTrainer_2024-05-29_07-12-59/TorchTrainer_e1617_00000_0_2024-05-29_07-12-59/checkpoint_000000/checkpoint


In [16]:
# Write the model weights and the tokenizer files into one local directory;
# the ONNX export further down reads the model from this folder.
local_model_dir = "./my_model"
model.save_pretrained(local_model_dir)
tokenizer.save_pretrained(local_model_dir)

('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/spiece.model',
 './my_model/added_tokens.json')

In [4]:
import torch

# Load the pre-tokenized dataset from disk and split it into train/test parts.
# The seed is fixed so the split is the same on every run; `test_size` is
# expected to come from the star-import of `config`.
dataset = datasets.load_from_disk(
    "/data/lab/assignments/proj2/project2/Task1/squad_v2_tokenized_datasets"
).train_test_split(test_size=test_size, seed=666)

# Print the column names of the dataset.
print("Columns in the dataset:")
print(dataset["train"].column_names)

# Number of leading training examples to run through the model.
num_samples = 10
samples = dataset["train"].select(range(num_samples))

# Generate an output for every selected example and print it with its inputs.
for i, sample in enumerate(samples):
    input_ids = sample['input_ids']
    attention_mask = sample['attention_mask']

    # Add a leading batch dimension of size one to each sequence;
    # max_new_tokens limits the length of the generated output.
    batch_ids = torch.tensor(input_ids).unsqueeze(0)
    batch_mask = torch.tensor(attention_mask).unsqueeze(0)
    outputs = model.generate(
        input_ids=batch_ids,
        attention_mask=batch_mask,
        max_new_tokens=50,
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Sample {i+1}:")
    print(f"Input IDs: {input_ids}")
    print(f"Attention Mask: {attention_mask}")
    print(f"Output: {output_text}")
    print()

Columns in the dataset:
['input_ids', 'attention_mask', 'labels']
Sample 1:
Input IDs: [863, 1525, 8, 822, 3, 390, 30, 8, 787, 2625, 10, 2625, 10, 438, 15895, 16, 22146, 3, 11366, 6, 167, 941, 1181, 17359, 7, 33, 4772, 7, 63, 29, 17194, 14172, 13, 796, 793, 18042, 5, 506, 560, 6, 21, 677, 6, 8, 12637, 18, 40, 2708, 265, 14458, 7, 6, 84, 560, 8, 4550, 31027, 7, 41, 29462, 57, 3, 9940, 23, 16, 8, 3, 729, 302, 4511, 1294, 195, 2552, 201, 8, 197, 21367, 32, 11842, 77, 7, 6, 11, 8, 9566, 9, 855, 9660, 7, 5, 2570, 8861, 7, 24, 33, 341, 12996, 45, 840, 9329, 7, 33, 8, 17925, 122, 120, 509, 1583, 7, 6, 3, 10339, 119, 1181, 17359, 7, 318, 1161, 677, 6, 8, 3, 7, 83, 89, 106, 24216, 7, 6, 8, 3, 25653, 40, 782, 7, 6, 11, 8, 3, 32, 226, 17694, 8130, 77, 782, 7, 318, 355, 2546, 4199, 120, 57, 5368, 3, 17282, 5, 1404, 1181, 17359, 18042, 33, 4352, 422, 19166, 28, 3, 9, 2288, 109, 4866, 1293, 13, 705, 145, 2766, 3, 20844, 3294, 3173, 5, 6306, 13903, 906, 908, 822, 10, 363, 19, 4550, 31027, 7, 2546, 57

In [5]:
! pip install -q --upgrade transformers[onnx]==4.35.2 optimum sentencepiece onnx==1.14.0

[0m

In [11]:
import transformers
# Identifier of the model to export: either a Hugging Face hub id
# (e.g. "google/flan-t5-base") or the path of a local model directory.
MODEL_NAME = "my_model"

# Directory that receives the exported ONNX files.
EXPORT_PATH = "onnx_models/" + MODEL_NAME

In [17]:
# Export the model named by MODEL_NAME to ONNX with optimum-cli, using the
# text2text-generation-with-past task; the files are written to EXPORT_PATH.
!optimum-cli export onnx --task text2text-generation-with-past --model {MODEL_NAME} {EXPORT_PATH}

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Framework not specified. Using pt to export the model.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

***** Exporting submodel 1/3: T5Stack *****
Using framework PyTorch: 2.3.0+cu121
Overriding 1 configuration item(s)
	- use_cache -> False

***** Exporting submodel 2/3: T5ForConditionalGeneration *****
Using framework PyTorch: 2.3.0+cu121
Overriding 1 conf

In [18]:
# List the files produced by the ONNX export.
!ls -l {EXPORT_PATH}

total 810564
drwxr-xr-x 2 root root      4096 May 30 12:30 assets
-rw-r--r-- 1 root root      1515 May 30 12:42 config.json
-rw-r--r-- 1 root root 232553676 May 30 12:42 decoder_model.onnx
-rw-r--r-- 1 root root 232784389 May 30 12:42 decoder_model_merged.onnx
-rw-r--r-- 1 root root 219953983 May 30 12:42 decoder_with_past_model.onnx
-rw-r--r-- 1 root root 141456358 May 30 12:42 encoder_model.onnx
-rw-r--r-- 1 root root       142 May 30 12:42 generation_config.json
-rw-r--r-- 1 root root      2543 May 30 12:42 special_tokens_map.json
-rw-r--r-- 1 root root    791656 May 30 12:42 spiece.model
-rw-r--r-- 1 root root   2424156 May 30 12:42 tokenizer.json
-rw-r--r-- 1 root root     20817 May 30 12:42 tokenizer_config.json


In [14]:
# Create the assets sub-directory inside the export folder (no error if it already exists).
! mkdir -p {EXPORT_PATH}/assets

In [19]:

# Move the SentencePiece model file into the assets sub-directory.
# NOTE(review): presumably the Spark NLP importer looks for it there — confirm against the Spark NLP import docs.
! mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/spiece.model

In [20]:
# Check that spiece.model now sits in the assets sub-directory.
!ls -l {EXPORT_PATH}/assets

total 776
-rw-r--r-- 1 root root 791656 May 30 12:42 spiece.model


In [21]:
! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash

Installing PySpark 3.2.3 and Spark NLP 5.3.3
setup Colab for PySpark 3.2.3 and Spark NLP 5.3.3
[0m

In [22]:
import sparknlp

# Start a Spark session through Spark NLP's helper and keep the handle in
# `spark`; later cells pass it to T5Transformer.loadSavedModel.
spark = sparknlp.start()

:: loading settings :: url = jar:file:/opt/module/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d374ebd5-d7db-498a-aeb0-8985541cc5d7;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	found com.g

In [23]:
from sparknlp.annotator import *

# Import the ONNX export found at EXPORT_PATH as a Spark NLP T5Transformer and
# set its defaults: cache enabled, task prefix "answer:", max output length 200.
# Note that the inference cell later in this notebook sets the task again
# after loading the saved model.
T5 = (
    T5Transformer.loadSavedModel(EXPORT_PATH, spark)
    .setUseCache(True)
    .setTask("answer:")
    .setMaxOutputLength(200)
)

Using CPUs
Using CPUs


In [24]:
# Save the imported annotator in Spark NLP format, replacing any previous
# copy in the same directory.
spark_nlp_model_dir = f"{MODEL_NAME}_spark_nlp"
T5.write().overwrite().save(spark_nlp_model_dir)

In [25]:
# Delete the intermediate ONNX export directory. After this, any cell that
# reads from EXPORT_PATH needs the export cell to be run again first.
!rm -rf {EXPORT_PATH}

In [26]:
# List the contents of the saved Spark NLP model directory.
! ls -l {MODEL_NAME}_spark_nlp

total 366308
-rw-r--r-- 1 root root 232820052 May 30 12:59 decoder.onxx
-rw-r--r-- 1 root root 141478081 May 30 12:59 encoder.onxx
drwxr-xr-x 2 root root      4096 May 30 12:59 metadata
-rw-r--r-- 1 root root    791656 May 30 12:59 t5_spp


In [28]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Get the Spark session (getOrCreate reuses a running session if there is one).
spark = SparkSession.builder \
    .appName("QA using T5 with Spark NLP") \
    .getOrCreate()

# Read data from Parquet file
file_path = "./Task1/squad_v2/squad_v2/validation-00000-of-00001.parquet"
test_data = spark.read.parquet(file_path)

# Display the schema of the loaded DataFrame to understand its structure
test_data.printSchema()

# Build the text that is actually handed to the model. Feeding only `context`
# means the question never reaches T5, so the generated "answer" cannot depend
# on the question being asked. Combine both columns instead; together with the
# "question: " task prefix set on the transformer below, the model input reads
# roughly "question: <question> context: <context>".
# NOTE(review): the prompt layout must match the one used when the fine-tuning
# data was tokenized. That preprocessing is not visible in this notebook, so
# confirm the layout and adjust the task prefix / ordering here if it differs.
qa_inputs = test_data.withColumn(
    "qa_input",
    F.concat_ws(" ", F.col("question"), F.lit("context:"), F.col("context")),
)

# Document Assembler: wraps the combined question + context text as a document
document_assembler = DocumentAssembler() \
    .setInputCol("qa_input") \
    .setOutputCol("document")

# T5 Transformer for question answering, loaded from the saved Spark NLP model
T5 = T5Transformer.load(f"{MODEL_NAME}_spark_nlp") \
    .setTask("question: ") \
    .setInputCols(["document"]) \
    .setOutputCol("answer")

# Pipeline
pipeline = Pipeline().setStages([document_assembler, T5])

# Transform data
result = pipeline.fit(qa_inputs).transform(qa_inputs)

# Show results
result.select("id", "question", "answer.result").show(truncate=False)


root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- context: string (nullable = true)
 |-- question: string (nullable = true)
 |-- answers: struct (nullable = true)
 |    |-- text: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- answer_start: array (nullable = true)
 |    |    |-- element: integer (containsNull = true)

Using CPUs
Using CPUs


[Stage 3:>                                                          (0 + 1) / 1]

+------------------------+-----------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id                      |question                                                               |result                                                                                                                                                                                                                                                                     |
+------------------------+-----------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [29]:
from pyspark.sql import SparkSession
import sparknlp

# Obtain the Spark session (getOrCreate hands back the running one if present).
spark = (
    SparkSession.builder
    .appName("Check Versions")
    .getOrCreate()
)

# Report the Spark version, then the Spark NLP version.
print(f"Spark version: {spark.version}")
print(f"Spark NLP version: {sparknlp.version()}")


Spark version: 3.5.0
Spark NLP version: 5.3.3
