In [1]:
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Upload model and datasets to huggingface hub

#### Publish a model and a tokenizer to the Hugging Face Hub

In [5]:
# Model source to be uploaded to huggingface hub
model_path = "./models/full/nlu/deberta_v3_large_sample_False"

In [6]:

# need to add tok into final_model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)



In [7]:
model_name = 'deberta-v3-large-peacock-knowledge-linking'
# model.push_to_hub(model_name)

model.safetensors: 100%|██████████| 1.74G/1.74G [02:28<00:00, 11.7MB/s]


CommitInfo(commit_url='https://huggingface.co/theirislin/deberta-v3-large-peacock-knowledge-linking/commit/1e74e2d4cdd4455d1e89dcf3541df6e47190c2e3', commit_message='Upload DebertaV2ForSequenceClassification', commit_description='', oid='1e74e2d4cdd4455d1e89dcf3541df6e47190c2e3', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# tokenizer.push_to_hub(model_name)

README.md: 100%|██████████| 5.18k/5.18k [00:00<00:00, 2.77MB/s]
spm.model: 100%|██████████| 2.46M/2.46M [00:00<00:00, 4.45MB/s]


CommitInfo(commit_url='https://huggingface.co/theirislin/deberta-v3-large-peacock-knowledge-linking/commit/c854b1b45b1fb2eb3a442b0b672d584673009a5f', commit_message='Upload tokenizer', commit_description='', oid='c854b1b45b1fb2eb3a442b0b672d584673009a5f', pr_url=None, pr_revision=None, pr_num=None)

#### Publish datasets to the Hugging Face Hub

In [2]:
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import ClassLabel
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
import nltk
import evaluate
import time


# fix seeding for pytorch and huggingface
import torch
torch.manual_seed(0)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Dataset config
linking_data_path = "./dimiss_items/data/model_gpt-3.5-turbo-0125"

DS_TYPE = "relation" # "full" or "head" or "tail"
USE_TAG = True

LABEL_TO_ID = {"entailment": 0, "not_entailment": 1}
ID_TO_LABEL = {0: "entailment", 1: "not_entailment"}

COMFACT_LABEL_TO_ID = {True: 0, False: 1}

In [9]:
TRAIN_DATA_PATH = f'{linking_data_path}/processed/gpt_label_full_train_df.json'
VALID_DATA_PATH = f'{linking_data_path}/processed/gpt_label_full_valid_df.json'

train_df = pd.read_json(TRAIN_DATA_PATH)
valid_df = pd.read_json(VALID_DATA_PATH)

# Make train_df and valid_df into a dataset
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

raw_dataset = DatasetDict({"train": train_dataset, "validation": valid_dataset})

In [10]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['dialog_id', 'relation', 'head', 'tail', 'text', 'gpt_tagged_head_old_label', 'gpt_tagged_head_gpt_output', 'gpt_tagged_head_fact_text', 'gpt_tagged_head_gold_reference', 'gpt_tagged_tail_gpt_output', 'gpt_tagged_tail_old_label', 'gpt_tagged_tail_action', 'gpt_tagged_tail_fact_text', 'gpt_tagged_tail_gold_reference', '__index_level_0__'],
        num_rows: 35821
    })
    validation: Dataset({
        features: ['dialog_id', 'relation', 'head', 'tail', 'text', 'gpt_tagged_head_old_label', 'gpt_tagged_head_gpt_output', 'gpt_tagged_head_fact_text', 'gpt_tagged_head_gold_reference', 'gpt_tagged_tail_gpt_output', 'gpt_tagged_tail_old_label', 'gpt_tagged_tail_action', 'gpt_tagged_tail_fact_text', 'gpt_tagged_tail_gold_reference', '__index_level_0__'],
        num_rows: 3981
    })
})

In [11]:
my_dataset_name = 'SynCPKL'
raw_dataset.push_to_hub(my_dataset_name)

Creating parquet from Arrow format: 100%|██████████| 36/36 [00:00<00:00, 250.86ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 239.76ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/theirislin/SynCPKL/commit/c05fdec1104ba881a5f5b2560b6c5b6e1c86cae7', commit_message='Upload dataset', commit_description='', oid='c05fdec1104ba881a5f5b2560b6c5b6e1c86cae7', pr_url=None, pr_revision=None, pr_num=None)