In [33]:
from datasets import load_dataset
from torch.utils.data import DataLoader

In [34]:
data = load_dataset('flytech/python-codes-25k')

In [35]:
data

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})

In [36]:
data['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [37]:
split_dataset = data['train'].train_test_split(test_size=0.2, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 39700
    })
    test: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 9926
    })
})

In [65]:
def format_data(data):
    # Alpeca style 
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately complets the request."
        f"\n\n### Instruction:\n{data['instruction']}"
    )

    # Add input if it exists
    if data.get("input") and data["input"].strip():
        instruction_text += f"\n\n### Input:\n{data['input']}"
        
    # Add the Response header and the actual output
    # IMPORTANT: Add the EOS token at the very end so the model learns to STOP.
    instruction_text += f"\n\n### Response:\n{data['output']}<|endoftext|>"
    return {"text": instruction_text}

In [67]:
print(format_data(data['train'][2])['text'])

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Calculate how much time I spend on my phone per week!

### Input:
Calculating weekly phone usage...

### Response:
```python
total_time = 0
for i in range(1, 8):
    time = float(input(f'Enter phone usage in hours for day {i}: '))
    total_time += time
print(f'You spend approximately {total_time} hours per week on your phone.')
```<|endoftext|>


In [69]:
dataset = data.map(format_data)

Map: 100%|██████████| 49626/49626 [00:02<00:00, 18750.66 examples/s]


In [73]:
dataset['train'][0]['text']

"Below is an instruction that describes a task. Write a response that appropriately complets the request.\n\n### Instruction:\nHelp me set up my daily to-do list!\n\n### Input:\nSetting up your daily to-do list...\n\n### Response:\n```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```<|endoftext|>"

In [76]:
from datasets import DatasetDict

train_data = dataset['train'].train_test_split(train_size=0.8, seed=42)
test_data = dataset['train'].train_test_split(test_size=0.1)
split_dataset = DatasetDict({
    "train": train_data['train'],
    "val": test_data['test'],
    'test': train_data['test']
})
split_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 39700
    })
    val: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 4963
    })
    test: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 9926
    })
})

In [None]:
# data['train'].select(range(5)) # select dataset 

Dataset({
    features: ['output', 'instruction', 'input', 'text'],
    num_rows: 5
})

In [None]:
# shuffle dataset 
# shuffled_data = data['train'].shuffle(seed=42)

# # calculate indicies 
# train_end = int(len(shuffled_data) * 0.8)
# test_end = (train_end + int(len(shuffled_data) * 0.1))

# train_data = shuffled_data.select(range(train_end)) # 0 to 39700 
# test_data = shuffled_data.select(range(train_end, test_end)) # 39700 to 39700+4962
# val_data = shuffled_data.select(range(test_end, len(shuffled_data))) # rest 

In [None]:
# train_data, test_data, val_data

(Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 39700
 }),
 Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 4962
 }),
 Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 4964
 }))

In [43]:
print(format_data(train_data[2]))

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Generate a function in Python that takes two parameters and returns their product


In [77]:
train_data[10]

KeyError: "Invalid key: 10. Please first select a split. For example: `my_dataset_dictionary['train'][10]`. Available splits: ['test', 'train']"

In [45]:
empty = 0 
for c in data['train']:
    if len(c['input']) <= 0: 
        empty += 1

print(empty)

42296


In [46]:
data['train']['input'][600]

'Setting the optimizer and learning rate scheduler...'

In [78]:
from dataset import InstructDataset, collate_fn
import tiktoken
from functools import partial

In [83]:
tokenizer = tiktoken.get_encoding('gpt2')
device='cpu'

In [80]:
customized_collate_fn = partial(
    collate_fn, 
    device=device, 
    allowed_mask_length=1024
)

In [84]:
train_dataset = InstructDataset(split_dataset['train'], tokenizer)
test_dataset = InstructDataset(split_dataset['test'], tokenizer)
val_dataset = InstructDataset(split_dataset['val'], tokenizer)

In [86]:
train_dataset.data

Dataset({
    features: ['output', 'instruction', 'input', 'text'],
    num_rows: 39700
})

In [88]:
batch_size = 5

train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size,
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0
)

In [94]:
train_loader.collate_fn, len(train_loader) # 7940 * 5 = 39700

(functools.partial(<function collate_fn at 0x7a9a8d621940>, device='cpu', allowed_mask_length=1024),
 7940)

In [96]:

test_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0 
)

In [97]:

val_loader = DataLoader(
    dataset=val_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn, 
    shuffle=True, 
    drop_last=True, 
    num_workers=0
)

In [98]:
len(test_loader), len(val_loader)

(1985, 992)

In [99]:
print('Train loader..')
for inputs, targets in train_loader:
    pass
print(inputs.shape, targets.shape)

Train loader..
torch.Size([5, 200]) torch.Size([5, 200])


In [100]:
print('vall loader..')
for inputs, targets in val_loader:
    pass
print(inputs.shape, targets.shape)

vall loader..
torch.Size([5, 234]) torch.Size([5, 234])


In [101]:
train_tokens = 0 
for inputs, targets in train_loader:
    train_tokens += inputs.numel()

train_tokens

12653170

In [102]:
len(inputs[0])

215

In [103]:
inputs[0]

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431,  1224,   912,   262,  2581,    13,   198,
          198, 21017, 46486,    25,   198, 16594,   257,  2163,   287, 11361,
          284, 15284,   262,   299,   400,  1988,   286,   262, 41566,   261,
        44456,  8379,   767,   628, 18261,    25,   198, 15506,    63, 29412,
          198,  4299, 12900,   261, 44456,     7,    77,  2599,   198,   220,
          611,   299,  6624,   657,   393,   299,  6624,   352,    25,   198,
          220,   220,   220,  1441,   299,   198,   220,  2073,    25,   198,
          220,   220,   220,  1441, 12900,   261, 44456,     7,    77,   532,
          352,     8,  1343, 12900,   261, 44456,     7,    77,   532,   362,
            8,   198,   220,   220,   198, 20274,   796, 12900,   261, 44456,
            7,    22,     8,   198,  4798,     7, 20274,     8,   198, 15506,
           63, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [106]:
tokenizer.decode(inputs[0].tolist())

'Below is an instruction that describes a task. Write a response that appropriately complets the request.\n\n### Instruction:\nWrite a function in Python to calculate the nth value of the Fibonacci sequence 7\n\n Response:\n```python\ndef fibonacci(n):\n  if n == 0 or n == 1:\n    return n\n  else:\n    return fibonacci(n - 1) + fibonacci(n - 2)\n  \nresult = fibonacci(7)\nprint(result)\n```<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

In [None]:
# as we can see the first token is shifted by 1 and exccesive tokens is converted into mask -100 which will be ignored by cross_entropy_loss 
targets[0]

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431,  1224,   912,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 16594,   257,  2163,   287, 11361,   284,
        15284,   262,   299,   400,  1988,   286,   262, 41566,   261, 44456,
         8379,   767,   628, 18261,    25,   198, 15506,    63, 29412,   198,
         4299, 12900,   261, 44456,     7,    77,  2599,   198,   220,   611,
          299,  6624,   657,   393,   299,  6624,   352,    25,   198,   220,
          220,   220,  1441,   299,   198,   220,  2073,    25,   198,   220,
          220,   220,  1441, 12900,   261, 44456,     7,    77,   532,   352,
            8,  1343, 12900,   261, 44456,     7,    77,   532,   362,     8,
          198,   220,   220,   198, 20274,   796, 12900,   261, 44456,     7,
           22,     8,   198,  4798,     7, 20274,     8,   198, 15506,    63,
        50256,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 