In [1]:
import os
import yaml
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_data(data_dir):
    examples = []

    # Iterate over files in the data directory
    for file_name in os.listdir(data_dir):
        if file_name.endswith('.txt'):
            # Read text example
            with open(os.path.join(data_dir, file_name), 'r') as file:
                text = file.read().strip()

            # Corresponding YAML file
            yaml_file = file_name.replace('.txt', '.yml')
            yaml_path = os.path.join(data_dir, yaml_file)

            # Read YAML configuration
            with open(yaml_path, 'r') as file:
                yaml_config = yaml.safe_load(file)

            # Append to examples list
            examples.append({'text': text, 'yaml_config': yaml_config})

    return examples

# Path to the dataset directory
data_dir = 'data'

# Read data from the dataset directory
examples = read_data(data_dir)

# Display the first example
print("Text Example:")
print(examples[0]['text'])
print("\nCorresponding YAML Configuration:")
print(yaml.dump(examples[0]['yaml_config']))

Text Example:
"I would like to configure the network with the following setup:

    eth1 should be up and running.
    eth2 should be up and running.
    bond0 should be up and running, with eth1 and eth2 configured for active-backup mode.
    br0 should be up and running, with bond0 as its port.
    vlan29 should be up and running, with bond0 as its base interface and VLAN ID 29.
    br29 should be up and running, with vlan29 as its port."

Corresponding YAML Configuration:
interfaces:
- name: vlan29
  state: up
  type: vlan
  vlan:
    base-iface: bond0
    id: 29
- bridge:
    port:
    - name: vlan29
  name: br29
  state: up
  type: linux-bridge
- bridge:
    port:
    - name: bond0
  name: br0
  state: up
  type: linux-bridge
- link-aggregation:
    mode: active-backup
    port:
    - eth1
    - eth2
  name: bond0
  state: up
  type: bond
- name: eth1
  state: up
  type: ethernet
- name: eth2
  state: up
  type: ethernet



In [7]:
# Load pre-trained GPT tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


ConnectionError: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/models/gpt2 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f316abb9150>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

In [1]:
# Set a padding token in the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Get the padding token ID
padding_token_id = tokenizer.pad_token_id

# If padding token ID is None, set it to a default value (e.g., 0)
if padding_token_id is None:
    padding_token_id = -100  # You can adjust this value as needed


# Define the maximum sequence length
max_seq_length = 128

# Define a custom collate function for padding
def custom_collate(batch):
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad the sequences to the maximum length
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=float(padding_token_id))
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=float(padding_token_id))

    return {
        'input_ids': input_ids_padded,
        'labels': labels_padded
    }

# Tokenize and encode the input text
tokenized_text = [tokenizer.encode(example['text'], return_tensors='pt', max_length=max_seq_length, padding='max_length', truncation=True) for example in examples]

# Encode the YAML configurations
encoded_yaml_configs = [tokenizer.encode(yaml.dump(example['yaml_config']), return_tensors='pt', max_length=max_seq_length, padding='max_length', truncation=True) for example in examples]


labels = encoded_yaml_configs

NameError: name 'tokenizer' is not defined

In [15]:
import torch
# Define the file path to save the preprocessed data
preprocessed_data_file = 'preprocessed_data.pt'

# Save the preprocessed data
torch.save({
    'tokenized_text': tokenized_text,
    'encoded_yaml_configs': encoded_yaml_configs,
    'labels': labels
}, preprocessed_data_file)

print("Preprocessed data saved to:", preprocessed_data_file)


Preprocessed data saved to: preprocessed_data.pt


In [19]:
# Load the preprocessed data
preprocessed_data = torch.load(preprocessed_data_file)


# Extract tokenized text, encoded YAML configurations, and labels from the preprocessed data
tokenized_text = preprocessed_data['tokenized_text']
encoded_yaml_configs = preprocessed_data['encoded_yaml_configs']
labels = preprocessed_data['labels']

# Define the GPT-2 tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Instantiate the GPT-2 model
model = GPT2LMHeadModel.from_pretrained(model_name)
# Define a custom dataset class to load the preprocessed data
class CustomDataset(Dataset):
    def __init__(self, tokenized_text, encoded_yaml_configs, labels):
        self.tokenized_text = tokenized_text
        self.encoded_yaml_configs = encoded_yaml_configs
        self.labels = labels
    
    def __len__(self):
        return len(self.tokenized_text)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_text[idx],
            'labels': self.encoded_yaml_configs[idx]
        }

# Instantiate the custom dataset
dataset = CustomDataset(tokenized_text, encoded_yaml_configs, labels)

# Define data loader for training
batch_size = 4
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define data loader for training with the custom collate function
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

# Define optimizer and learning rate scheduler
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Fine-tune the GPT-2 model
num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()

# Save the fine-tuned model
fine_tuned_model_file = 'fine_tuned_gpt2_model.pt'
torch.save(model.state_dict(), fine_tuned_model_file)

print("Fine-tuned GPT-2 model saved to:", fine_tuned_model_file)