In [1]:
from datasets import load_dataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset('flytech/python-codes-25k')

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})

In [4]:
data['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [5]:
def format_data(data):
    # Alpeca style 
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately complets the request."
        f"\n\n### Instruction:\n{data['instruction']}"
    )

    input_text = f"\n\n### Input:\n{data['input']}" if data['input'] else '' 
    combined_text = instruction_text + input_text
    return combined_text

In [6]:
print(format_data(data['train'][2]))

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Calculate how much time I spend on my phone per week!

### Input:
Calculating weekly phone usage...


In [7]:
data['train'].select(range(5)) # select dataset 

Dataset({
    features: ['output', 'instruction', 'input', 'text'],
    num_rows: 5
})

In [8]:
# shuffle dataset 
shuffled_data = data['train'].shuffle(seed=42)

# calculate indicies 
train_end = int(len(shuffled_data) * 0.8)
test_end = (train_end + int(len(shuffled_data) * 0.1))

train_data = shuffled_data.select(range(train_end)) # 0 to 39700 
test_data = shuffled_data.select(range(train_end, test_end)) # 39700 to 39700+4962
val_data = shuffled_data.select(range(test_end, len(shuffled_data))) # rest 

In [9]:
train_data, test_data, val_data

(Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 39700
 }),
 Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 4962
 }),
 Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 4964
 }))

In [10]:
print(format_data(train_data[2]))

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Generate a function in Python that takes two parameters and returns their product


In [11]:
train_data[10]

{'output': '```python\ndef gcf(a, b):\n    if (a == 0):\n        return b\n    if (b == 0):\n        return a\n\n    if (a == b):\n        return a\n\n    if (a > b):\n        return gcf(a-b, b)\n    return gcf(a, b-a)\n```',
 'instruction': 'Generate a function to calculate the greatest common factor (GCF) of two numbers in Python',
 'input': '',
 'text': "Generate a function to calculate the greatest common factor (GCF) of two numbers in Python Let's turn up the heat! It's getting hot in here! ```python\ndef gcf(a, b):\n    if (a == 0):\n        return b\n    if (b == 0):\n        return a\n\n    if (a == b):\n        return a\n\n    if (a > b):\n        return gcf(a-b, b)\n    return gcf(a, b-a)\n```"}

In [12]:
empty = 0 
for c in data['train']:
    if len(c['input']) <= 0: 
        empty += 1

print(empty)

42296


In [13]:
data['train']['input'][600]

'Setting the optimizer and learning rate scheduler...'

In [14]:
from dataset import InstructDataset, collate_fn
import tiktoken
from functools import partial

In [15]:
device='cpu'

In [16]:
customized_collate_fn = partial(
    collate_fn, 
    device=device, 
    allowed_mask_length=1024
)

In [17]:
train_data

Dataset({
    features: ['output', 'instruction', 'input', 'text'],
    num_rows: 39700
})

In [18]:
tokenizer = tiktoken.get_encoding('gpt2')

train_dataset = InstructDataset(data=train_data, tokenizer=tokenizer)

In [19]:
len(train_dataset), type(train_dataset)

(39700, dataset.InstructDataset)

In [20]:
batch_size = 5

train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size,
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0
)

In [21]:
len(train_loader)

7940

In [22]:
7940*5

39700

In [23]:
test_dataset = InstructDataset(data=test_data, tokenizer=tokenizer)
test_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=customized_collate_fn, 
    drop_last=True, 
    num_workers=0 
)

In [24]:
val_dataset = InstructDataset(data=val_data, tokenizer=tokenizer)
val_loader = DataLoader(
    dataset=val_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn, 
    shuffle=True, 
    drop_last=True, 
    num_workers=0
)

In [25]:
len(test_loader), len(val_loader)

(992, 992)

In [26]:
print('Train loader..')
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader..
torch.Size([5, 433]) torch.Size([5, 433])
torch.Size([5, 285]) torch.Size([5, 285])
torch.Size([5, 493]) torch.Size([5, 493])
torch.Size([5, 466]) torch.Size([5, 466])
torch.Size([5, 302]) torch.Size([5, 302])
torch.Size([5, 291]) torch.Size([5, 291])
torch.Size([5, 216]) torch.Size([5, 216])
torch.Size([5, 531]) torch.Size([5, 531])
torch.Size([5, 340]) torch.Size([5, 340])
torch.Size([5, 273]) torch.Size([5, 273])
torch.Size([5, 318]) torch.Size([5, 318])
torch.Size([5, 189]) torch.Size([5, 189])
torch.Size([5, 223]) torch.Size([5, 223])
torch.Size([5, 202]) torch.Size([5, 202])
torch.Size([5, 405]) torch.Size([5, 405])
torch.Size([5, 419]) torch.Size([5, 419])
torch.Size([5, 242]) torch.Size([5, 242])
torch.Size([5, 182]) torch.Size([5, 182])
torch.Size([5, 493]) torch.Size([5, 493])
torch.Size([5, 405]) torch.Size([5, 405])
torch.Size([5, 379]) torch.Size([5, 379])
torch.Size([5, 390]) torch.Size([5, 390])
torch.Size([5, 233]) torch.Size([5, 233])
torch.Size([5, 164]