In [29]:
import pandas as pd
data = pd.read_csv("data/summarization.csv")
# Split the data into training and validation sets
train_data = data.sample(frac=0.8, random_state=42)
val_data = data.drop(train_data.index)

In [30]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset


class MyDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the training data
train_inputs = tokenizer(train_data['resolution'].tolist(), padding='max_length', max_length=512, truncation=True)
train_labels = tokenizer(train_data['summary'].tolist(), padding='max_length', max_length=512, truncation=True)
train_inputs['labels'] = train_labels['input_ids']

# Tokenize the evaluation data
eval_inputs = tokenizer(val_data['resolution'].tolist(), padding='max_length', max_length=512, truncation=True)
eval_labels = tokenizer(val_data['summary'].tolist(), padding='max_length', max_length=512, truncation=True)
eval_inputs['labels'] = eval_labels['input_ids']

# Create the training dataset
train_dataset = MyDataset(train_inputs)

# Create the evaluation dataset
eval_dataset = MyDataset(eval_inputs)

loading file vocab.json from cache at C:\Users\Administrator/.cache\huggingface\hub\models--distilgpt2\snapshots\f241065e938b44ac52db2c5de82c8bd2fafc76d0\vocab.json
loading file merges.txt from cache at C:\Users\Administrator/.cache\huggingface\hub\models--distilgpt2\snapshots\f241065e938b44ac52db2c5de82c8bd2fafc76d0\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\Administrator/.cache\huggingface\hub\models--distilgpt2\snapshots\f241065e938b44ac52db2c5de82c8bd2fafc76d0\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "l

In [31]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    logging_dir='./logs',
    per_device_train_batch_size=1,
    report_to=[]
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

PyTorch: setting up devices
***** Running training *****
  Num examples = 727
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2181
  Number of trainable parameters = 81912576


  0%|          | 0/2181 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json


{'loss': 0.4849, 'learning_rate': 3.853736817973407e-05, 'epoch': 0.69}


Model weights saved in ./results\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 182
  Batch size = 8


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.36495301127433777, 'eval_runtime': 2.7424, 'eval_samples_per_second': 66.366, 'eval_steps_per_second': 8.387, 'epoch': 1.0}


Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json


{'loss': 0.3694, 'learning_rate': 2.7074736359468134e-05, 'epoch': 1.38}


Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 182
  Batch size = 8


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.3476416766643524, 'eval_runtime': 2.7546, 'eval_samples_per_second': 66.072, 'eval_steps_per_second': 8.35, 'epoch': 2.0}


Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json


{'loss': 0.3442, 'learning_rate': 1.56121045392022e-05, 'epoch': 2.06}


Model weights saved in ./results\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json


{'loss': 0.3379, 'learning_rate': 4.149472718936268e-06, 'epoch': 2.75}


Model weights saved in ./results\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 182
  Batch size = 8


  0%|          | 0/23 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.34726694226264954, 'eval_runtime': 2.7414, 'eval_samples_per_second': 66.389, 'eval_steps_per_second': 8.39, 'epoch': 3.0}
{'train_runtime': 168.1788, 'train_samples_per_second': 12.968, 'train_steps_per_second': 12.968, 'train_loss': 0.3781577577726494, 'epoch': 3.0}


TrainOutput(global_step=2181, training_loss=0.3781577577726494, metrics={'train_runtime': 168.1788, 'train_samples_per_second': 12.968, 'train_steps_per_second': 12.968, 'train_loss': 0.3781577577726494, 'epoch': 3.0})

In [32]:
model = GPT2LMHeadModel.from_pretrained("results/checkpoint-2000")

loading configuration file results/checkpoint-2000\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dty

In [33]:
input_text = "resolution of the miami city commission accepting the bids received on february 18, 2021 pursuant to invitation for bid (ifb) no. 1254386 from responsive and responsible bidders bioresponse, corp., florida profit corporation, jrp sons corporation fl cleanup, florida profit corporation, and muma sa lc, florida limited liability company, for the ifb group prequalified pool of vendors for the provision of bio hazardous waste, spills, and decontamination services, on an as needed basis, citywide for an initial term of two (2) years with the option to renew for two (2) additional two (2) year periods; allocating funds from the end user departments and such other sources of funds, subject to the availability of funds and budgetary approval at the time of need; authorizing the city manager to negotiate and execute any and all documents, including any amendments, renewals, and extensions, subject to all allocations, appropriations, prior budgetary approvals, compliance with all applicable provisions of the code of the city of miami, florida, as amended city code), including the citys procurement ordinance, anti deficiency act, and financial integrity principles, all as set forth in chapter 18 of the city code, in forms acceptable to the city attorney, and in compliance with all applicable laws, rules, and regulations, as may be deemed necessary for said purpose. city commission meeting agenda january 13, 2022 city of miami page printed on 2022 ca.2 11189 department of real estate and asset management resolution"                                      
input_text = input_text + "TL;DR:"
# Generate a summary
input_ids = tokenizer.encode(input_text, return_tensors='pt')
attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=512, pad_token_id=tokenizer.eos_token_id)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Print the summary
print(output_text)

resolution of the miami city commission accepting the bids received on february 18, 2021 pursuant to invitation for bid (ifb) no. 1254386 from responsive and responsible bidders bioresponse, corp., florida profit corporation, jrp sons corporation fl cleanup, florida profit corporation, and muma sa lc, florida limited liability company, for the ifb group prequalified pool of vendors for the provision of bio hazardous waste, spills, and decontamination services, on an as needed basis, citywide for an initial term of two (2) years with the option to renew for two (2) additional two (2) year periods; allocating funds from the end user departments and such other sources of funds, subject to the availability of funds and budgetary approval at the time of need; authorizing the city manager to negotiate and execute any and all documents, including any amendments, renewals, and extensions, subject to all allocations, appropriations, prior budgetary approvals, compliance with all applicable prov

In [34]:
from transformers import pipeline, set_seed

set_seed(42)

pipe = pipeline('text-generation', model = 'distilgpt2' )

gpt2_query = input_text

pipe_out = pipe(gpt2_query, max_length = 512, clean_up_tokenization_spaces = True)

loading configuration file config.json from cache at C:\Users\Administrator/.cache\huggingface\hub\models--distilgpt2\snapshots\f241065e938b44ac52db2c5de82c8bd2fafc76d0\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "tas