In [35]:
from datasets import load_dataset
from torch.utils.data import DataLoader

In [36]:
data = load_dataset('flytech/python-codes-25k')

In [37]:
data

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})

In [38]:
data['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [39]:
split_dataset = data['train'].train_test_split(test_size=0.2, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 39700
    })
    test: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 9926
    })
})

In [40]:
def format_data(data):
    # Alpeca style 
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately complets the request."
        f"\n\n### Instruction:\n{data['instruction']}"
    )

    # Add input if it exists
    if data.get("input") and data["input"].strip():
        instruction_text += f"\n\n### Input:\n{data['input']}"
        
    # Add the Response header and the actual output
    # IMPORTANT: Add the EOS token at the very end so the model learns to STOP.
    instruction_text += f"\n\n### Response:\n{data['output']}<|endoftext|>"
    return {"text": instruction_text}

In [41]:
print(format_data(data['train'][2])['text'])

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Calculate how much time I spend on my phone per week!

### Input:
Calculating weekly phone usage...

### Response:
```python
total_time = 0
for i in range(1, 8):
    time = float(input(f'Enter phone usage in hours for day {i}: '))
    total_time += time
print(f'You spend approximately {total_time} hours per week on your phone.')
```<|endoftext|>


In [42]:
dataset = data.map(format_data, remove_columns=['instruction', 'input', 'output'])

In [43]:
dataset['train'][0]['text']

"Below is an instruction that describes a task. Write a response that appropriately complets the request.\n\n### Instruction:\nHelp me set up my daily to-do list!\n\n### Input:\nSetting up your daily to-do list...\n\n### Response:\n```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```<|endoftext|>"

In [44]:
from datasets import DatasetDict

train_data = dataset['train'].train_test_split(train_size=0.8, seed=42)
test_data = dataset['train'].train_test_split(test_size=0.1)
split_dataset = DatasetDict({
    "train": train_data['train'],
    "val": test_data['test'],
    'test': train_data['test']
})
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 39700
    })
    val: Dataset({
        features: ['text'],
        num_rows: 4963
    })
    test: Dataset({
        features: ['text'],
        num_rows: 9926
    })
})

In [69]:
split_dataset.save_to_disk('../final_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 39700/39700 [00:00<00:00, 168037.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4963/4963 [00:00<00:00, 163324.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9926/9926 [00:00<00:00, 178644.91 examples/s]


In [45]:
# data['train'].select(range(5)) # select dataset 

In [46]:
# shuffle dataset 
# shuffled_data = data['train'].shuffle(seed=42)

# # calculate indicies 
# train_end = int(len(shuffled_data) * 0.8)
# test_end = (train_end + int(len(shuffled_data) * 0.1))

# train_data = shuffled_data.select(range(train_end)) # 0 to 39700 
# test_data = shuffled_data.select(range(train_end, test_end)) # 39700 to 39700+4962
# val_data = shuffled_data.select(range(test_end, len(shuffled_data))) # rest 

In [47]:
# train_data, test_data, val_data

In [48]:
empty = 0 
for c in data['train']:
    if len(c['input']) <= 0: 
        empty += 1

print(empty)

42296


In [49]:
data['train']['input'][600]

'Setting the optimizer and learning rate scheduler...'

In [50]:
from dataset import InstructDataset, collate_fn
import tiktoken
from functools import partial

In [51]:
tokenizer = tiktoken.get_encoding('gpt2')
device='cpu'

In [52]:
customized_collate_fn = partial(
    collate_fn, 
    device=device, 
    allowed_mask_length=1024
)

In [53]:
train_dataset = InstructDataset(split_dataset['train'], tokenizer)
test_dataset = InstructDataset(split_dataset['test'], tokenizer)
val_dataset = InstructDataset(split_dataset['val'], tokenizer)

In [54]:
train_dataset.data

Dataset({
    features: ['text'],
    num_rows: 39700
})

In [55]:
batch_size = 5

train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size,
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0
)

In [56]:
train_loader.collate_fn, len(train_loader) # 7940 * 5 = 39700

(functools.partial(<function collate_fn at 0x7a5b3c5fd6c0>, device='cpu', allowed_mask_length=1024),
 7940)

In [57]:

test_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0 
)

In [58]:

val_loader = DataLoader(
    dataset=val_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn, 
    shuffle=True, 
    drop_last=True, 
    num_workers=0
)

In [59]:
train_data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 39700
    })
    test: Dataset({
        features: ['text'],
        num_rows: 9926
    })
})

In [60]:
len(test_loader), len(val_loader)

(1985, 992)

In [61]:
print('Train loader..')
for inputs, targets in train_loader:
    pass
print(inputs.shape, targets.shape)

Train loader..
torch.Size([5, 216]) torch.Size([5, 216])


In [62]:
print('vall loader..')
for inputs, targets in val_loader:
    pass
print(inputs.shape, targets.shape)

vall loader..
torch.Size([5, 319]) torch.Size([5, 319])


In [63]:
train_tokens = 0 
for inputs, targets in train_loader:
    train_tokens += inputs.numel()

train_tokens

12770460

In [64]:
len(inputs[0])

255

In [65]:
inputs[0]

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431,  1224,   912,   262,  2581,    13,   198,
          198, 21017, 46486,    25,   198, 16447,   257,  2163,   287, 11361,
          326,  2753,   734, 13042,   290, 21001,   262,  3435,   286,  1123,
         4731,  5291,   262,  1502,   286,   262,  3435, 22944,   198,  5657,
          198,   198, 21017, 18261,    25,   198, 15506,    63, 29412,   198,
         4299, 12082,     7,    82,    16,    11,   264,    17,  2599,   198,
          220,   220,   220,  1255,   796, 13538,   220,   198,   220,   220,
          220,   329,  1312,   287,  2837,     7,  9806,     7, 11925,     7,
           82,    16,   828, 18896,     7,    82,    17,  4008,  2599,   198,
          220,   220,   220,   220,   220,   220,   220,   611,  1312,  1279,
        18896,     7,    82,    16,  2599,   198,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,  1255, 

In [66]:
tokenizer.decode(inputs[0].tolist())

'Below is an instruction that describes a task. Write a response that appropriately complets the request.\n\n### Instruction:\nCreate a function in Python that takes two strings and combines the characters of each string keeping the order of the characters foo\nbar\n\n### Response:\n```python\ndef combine(s1, s2):\n    result = "" \n    for i in range(max(len(s1), len(s2))):\n        if i < len(s1):\n            result += s1[i]\n        if i < len(s2):\n            result += s2[i]\n    return result\n```<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftex

In [67]:
# as we can see the first token is shifted by 1 and exccesive tokens is converted into mask -100 which will be ignored by cross_entropy_loss 
targets[0]

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431,  1224,   912,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 16447,   257,  2163,   287, 11361,   326,
         2753,   734, 13042,   290, 21001,   262,  3435,   286,  1123,  4731,
         5291,   262,  1502,   286,   262,  3435, 22944,   198,  5657,   198,
          198, 21017, 18261,    25,   198, 15506,    63, 29412,   198,  4299,
        12082,     7,    82,    16,    11,   264,    17,  2599,   198,   220,
          220,   220,  1255,   796, 13538,   220,   198,   220,   220,   220,
          329,  1312,   287,  2837,     7,  9806,     7, 11925,     7,    82,
           16,   828, 18896,     7,    82,    17,  4008,  2599,   198,   220,
          220,   220,   220,   220,   220,   220,   611,  1312,  1279, 18896,
            7,    82,    16,  2599,   198,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,  1255, 15853, 