In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'flytech/python-codes-25k' dataset via `datasets.load_dataset`.
data = load_dataset('flytech/python-codes-25k')

In [3]:
# Show the DatasetDict summary: available splits, feature names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})

In [4]:
# Look at one raw record to see what each field holds.
data['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [5]:
def format_data(data):
    """Build an Alpaca-style prompt from a single dataset record.

    Args:
        data: Mapping with an 'instruction' entry and an 'input' entry
            (the latter may be empty).

    Returns:
        str: The fixed preamble followed by an "### Instruction:" section,
        plus an "### Input:" section only when ``data['input']`` is truthy.
    """
    # Alpaca style
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{data['instruction']}"
    )

    # Records with an empty input get no Input section at all, rather than
    # an empty "### Input:" header.
    input_text = f"\n\n### Input:\n{data['input']}" if data['input'] else ''
    return instruction_text + input_text

In [6]:
# Render the prompt for a record that has a non-empty 'input' field.
print(format_data(data['train'][2]))

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Calculate how much time I spend on my phone per week!

### Input:
Calculating weekly phone usage...


In [7]:
data['train'].select(range(5)) # preview: `select` returns a new Dataset limited to the given row indices (here the first 5)

Dataset({
    features: ['output', 'instruction', 'input', 'text'],
    num_rows: 5
})

In [8]:
# Shuffle once with a fixed seed so the split below is reproducible.
shuffled_data = data['train'].shuffle(seed=42)

# Split boundaries: first 80% -> train, next 10% -> test, remainder -> validation.
total_rows = len(shuffled_data)
train_end = int(total_rows * 0.8)
test_end = train_end + int(total_rows * 0.1)

train_data = shuffled_data.select(range(0, train_end))
test_data = shuffled_data.select(range(train_end, test_end))
# Validation takes whatever is left, so rounding remainders land here.
val_data = shuffled_data.select(range(test_end, total_rows))

In [9]:
# Check the sizes of the three splits.
train_data, test_data, val_data

(Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 39700
 }),
 Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 4962
 }),
 Dataset({
     features: ['output', 'instruction', 'input', 'text'],
     num_rows: 4964
 }))

In [10]:
# Render the prompt for a training record (here the Input section is omitted).
print(format_data(train_data[2]))

Below is an instruction that describes a task. Write a response that appropriately complets the request.

### Instruction:
Generate a function in Python that takes two parameters and returns their product


In [11]:
# Inspect a training record whose 'input' field is empty.
train_data[10]

{'output': '```python\ndef gcf(a, b):\n    if (a == 0):\n        return b\n    if (b == 0):\n        return a\n\n    if (a == b):\n        return a\n\n    if (a > b):\n        return gcf(a-b, b)\n    return gcf(a, b-a)\n```',
 'instruction': 'Generate a function to calculate the greatest common factor (GCF) of two numbers in Python',
 'input': '',
 'text': "Generate a function to calculate the greatest common factor (GCF) of two numbers in Python Let's turn up the heat! It's getting hot in here! ```python\ndef gcf(a, b):\n    if (a == 0):\n        return b\n    if (b == 0):\n        return a\n\n    if (a == b):\n        return a\n\n    if (a > b):\n        return gcf(a-b, b)\n    return gcf(a, b-a)\n```"}

In [12]:
# Count how many records carry an empty 'input' field.
empty = sum(1 for row in data['train'] if len(row['input']) == 0)

print(empty)

42296


In [13]:
# Spot-check the 'input' column at row 600.
data['train']['input'][600]

'Setting the optimizer and learning rate scheduler...'

In [14]:
from dataset import InstructDataset
import tiktoken

In [15]:
# GPT-2 encoding from tiktoken, used as the tokenizer for the dataset wrapper.
tokenizer = tiktoken.get_encoding('gpt2')

# Wrap the records in the project-local InstructDataset.
# NOTE(review): this passes the full data['train'] rather than the train_data
# split created earlier, so test/validation rows are included — confirm intended.
dataset = InstructDataset(data=data['train'], tokenizer=tokenizer)