In [1]:
from datasets import load_dataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset('flytech/python-codes-25k')

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})

In [4]:
data['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [5]:
split_dataset = data['train'].train_test_split(test_size=0.2, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 39700
    })
    test: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 9926
    })
})

In [6]:
def format_data(data):
    # Alpeca style 
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately complets the request."
        f"\n\n### Instruction:\n{data['instruction']}"
    )

    # Add input if it exists
    if data.get("input") and data["input"].strip():
        instruction_text += f"\n\n### Input:\n{data['input']}"
        
    # Add the Response header and the actual output
    # IMPORTANT: Add the EOS token at the very end so the model learns to STOP.
    instruction_text += f"\n\n### Response:\n{data['output']}<|endoftext|>"
    return {"text": instruction_text}

In [7]:
print(format_data(data['train'][2])['text'])

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Calculate how much time I spend on my phone per week!

### Input:
Calculating weekly phone usage...

### Response:
```python
total_time = 0
for i in range(1, 8):
    time = float(input(f'Enter phone usage in hours for day {i}: '))
    total_time += time
print(f'You spend approximately {total_time} hours per week on your phone.')
```<|endoftext|>


In [8]:
dataset = data.map(format_data, remove_columns=['instruction', 'input', 'output'])

In [9]:
dataset['train'][0]['text']

"Below is an instruction that describes a task. Write a response that appropriately complets the request.\n\n### Instruction:\nHelp me set up my daily to-do list!\n\n### Input:\nSetting up your daily to-do list...\n\n### Response:\n```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```<|endoftext|>"

In [10]:
from datasets import DatasetDict

train_data = dataset['train'].train_test_split(train_size=0.8, seed=42)
test_data = dataset['train'].train_test_split(test_size=0.1)
split_dataset = DatasetDict({
    "train": train_data['train'],
    "val": test_data['test'],
    'test': train_data['test']
})
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 39700
    })
    val: Dataset({
        features: ['text'],
        num_rows: 4963
    })
    test: Dataset({
        features: ['text'],
        num_rows: 9926
    })
})

In [11]:
split_dataset.save_to_disk('../final_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 39700/39700 [00:00<00:00, 169897.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4963/4963 [00:00<00:00, 183854.12 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9926/9926 [00:00<00:00, 190723.55 examples/s]


In [12]:
# data['train'].select(range(5)) # select dataset 

In [13]:
# shuffle dataset 
# shuffled_data = data['train'].shuffle(seed=42)

# # calculate indicies 
# train_end = int(len(shuffled_data) * 0.8)
# test_end = (train_end + int(len(shuffled_data) * 0.1))

# train_data = shuffled_data.select(range(train_end)) # 0 to 39700 
# test_data = shuffled_data.select(range(train_end, test_end)) # 39700 to 39700+4962
# val_data = shuffled_data.select(range(test_end, len(shuffled_data))) # rest 

In [14]:
# train_data, test_data, val_data

In [15]:
empty = 0 
for c in data['train']:
    if len(c['input']) <= 0: 
        empty += 1

print(empty)

42296


In [16]:
data['train']['input'][600]

'Setting the optimizer and learning rate scheduler...'

In [17]:
from dataset import InstructDataset, collate_fn
import tiktoken
from functools import partial

In [18]:
tokenizer = tiktoken.get_encoding('gpt2')
device='cpu'

In [19]:
customized_collate_fn = partial(
    collate_fn, 
    device=device, 
    allowed_mask_length=1024
)

In [20]:
train_dataset = InstructDataset(split_dataset['train'], tokenizer)
test_dataset = InstructDataset(split_dataset['test'], tokenizer)
val_dataset = InstructDataset(split_dataset['val'], tokenizer)

In [21]:
train_dataset.data

Dataset({
    features: ['text'],
    num_rows: 39700
})

In [35]:
batch_size = 8

train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size,
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0
)

In [36]:
train_loader.collate_fn, len(train_loader) # 7940 * 5 = 39700

(functools.partial(<function collate_fn at 0x7a2a090e36a0>, device='cpu', allowed_mask_length=1024),
 4962)

In [37]:

test_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0 
)

In [38]:

val_loader = DataLoader(
    dataset=val_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn, 
    shuffle=True, 
    drop_last=True, 
    num_workers=0
)

In [39]:
train_data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 39700
    })
    test: Dataset({
        features: ['text'],
        num_rows: 9926
    })
})

In [40]:
len(test_loader), len(val_loader)

(1240, 620)

In [41]:
print('Train loader..')
for inputs, targets in train_loader:
    pass
print(inputs.shape, targets.shape)

Train loader..
torch.Size([8, 157]) torch.Size([8, 157])


In [42]:
print('vall loader..')
for inputs, targets in val_loader:
    pass
print(inputs.shape, targets.shape)

vall loader..
torch.Size([8, 241]) torch.Size([8, 241])


In [43]:
train_tokens = 0 
for inputs, targets in train_loader:
    train_tokens += inputs.numel()

train_tokens

14458752

In [44]:
len(inputs[0])

506

In [45]:
inputs[0]

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431,  1224,   912,   262,  2581,    13,   198,
          198, 21017, 46486,    25,   198, 16447,   257,  2163,   326,  6673,
          262,  1573,   705,  1069,  5666,     6,   355,   262,   938,  5002,
          287,   597,  1813,  1351,    13,   383,  1351,   481,  2291,  5701,
           12,  5363,  2456,    13,   383,  2163,   815,   664,  1834,  2280,
          751,   262,  1573,   705,  1069,  5666,     6,  1566,   262,  1351,
         4129, 21767,   838,    13,  4418,    11,   878,  4375,   262,  5002,
           11,   262,  2163,   815,  2198,   611,   705,  1069,  5666,     6,
         1541,  7160,   287,   262,  1351,    13,  1002,   705,  1069,  5666,
            6,  1541,  7160,    11,   262,  2163,   815,  2427,   751,   262,
         1573,   705, 11274,  4458, 27814,   262,  2457,  8341,   618,   262,
         5128,  1351,  1595,   470,   423,   705,  1069,  5666, 

In [46]:
tokenizer.decode(inputs[0].tolist())

"Below is an instruction that describes a task. Write a response that appropriately complets the request.\n\n### Instruction:\nCreate a function that adds the word 'excellent' as the last element in any given list. The list will include sports-related words. The function should recursively add the word 'excellent' until the list length equals 10. Also, before adding the element, the function should check if 'excellent' already exists in the list. If 'excellent' already exists, the function should instead add the word 'good'. Compare the final lists when the input list doesn't have 'excellent' and does have 'excellent'.\n\nOriginal list - ['soccer', 'rugby', 'hockey']\n\n### Response:\nHere is the Python code:\n\n\ndef add_word(lst):\n    if len(lst)<10:\n        if 'excellent' in lst:\n            lst.append('good')\n        else:\n            lst.append('excellent')\n        return add_word(lst)\n    else:\n        return lst\n\n# Original list without 'excellent'\nlst1 = ['soccer', '

In [47]:
# as we can see the first token is shifted by 1 and exccesive tokens is converted into mask -100 which will be ignored by cross_entropy_loss 
targets[0]

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431,  1224,   912,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 16447,   257,  2163,   326,  6673,   262,
         1573,   705,  1069,  5666,     6,   355,   262,   938,  5002,   287,
          597,  1813,  1351,    13,   383,  1351,   481,  2291,  5701,    12,
         5363,  2456,    13,   383,  2163,   815,   664,  1834,  2280,   751,
          262,  1573,   705,  1069,  5666,     6,  1566,   262,  1351,  4129,
        21767,   838,    13,  4418,    11,   878,  4375,   262,  5002,    11,
          262,  2163,   815,  2198,   611,   705,  1069,  5666,     6,  1541,
         7160,   287,   262,  1351,    13,  1002,   705,  1069,  5666,     6,
         1541,  7160,    11,   262,  2163,   815,  2427,   751,   262,  1573,
          705, 11274,  4458, 27814,   262,  2457,  8341,   618,   262,  5128,
         1351,  1595,   470,   423,   705,  1069,  5666,     6, 