In [None]:
%%capture
!pip install datasets transformers

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [None]:
!huggingface-cli login

Alright, so here's the deal: I've got this piece of code that's like my little automation sidekick.

What's it doing?

It's pulling in a bunch of datasets from HuggingFace – you know, that datasets library.

I've set up this dictionary, datasets, where I've dropped in all the info about each dataset – like where to find it, which columns I want to keep, and if I want to rename some of those columns.

So, I hit run, and this code gets to work.

It goes through each dataset in that dictionary, grabs it, and checks if it's got a "test" or "train" part. Whichever it finds, it turns that into a pandas DataFrame.

If I've specified only certain columns, it filters out the rest.

If I've given it a new name for a column, it'll rename it – no questions asked.

Then, here's the cool part: it adds this 'source' column to each DataFrame.

Why?

So I always know where the data's coming from.

And once it's done all that, it neatly stores each DataFrame in a variable named after the dataset.


So, by the end, I've got all these DataFrames ready to play with.

Man, I love it when code just does its thing and saves me a ton of manual work.

Efficiency? Check.

Convenience? Double-check.

In [None]:
datasets = {
    "tiny_codes": {
        "path": "nampdn-ai/tiny-codes",
        "columns": ["prompt", "response"],
        "rename": {"prompt": "instruction", "response": "response"},
        "type":"code"
    },
    "sciphi_textbooks": {
        "path": "emrgnt-cmplxty/sciphi-textbooks-are-all-you-need",
        "columns": ["formatted_prompt", "completion"],
        "rename": {"formatted_prompt": "instruction", "completion": "response"},
        "type":"non-code"
    },
    # "sciphi_python_textbooks": {
    #     "path": "emrgnt-cmplxty/sciphi-python-textbook",
    #     "columns": ["formatted_prompt", "completion"],
    #     "rename": {"formatted_prompt": "instruction", "completion": "response"},
    #     "type":"code"
    # },
    "open_platypus": {
        "path": "garage-bAInd/Open-Platypus",
        "columns": ["instruction", "output"],
        "rename": {"output": "response"},
        "type":"non-code"
    },
    "oig": {
        "path": "0-hero/OIG-small-chip2",
        "columns": ["user", "chip2"],
        "rename": {"user": "instruction", "chip2": "response"},
        "type":"non-code"
    },
    "theorem_qa": {
        "path": "wenhu/TheoremQA",
        "columns": ["Question", "Answer", "theorem_def"],
        "rename": {"Question": "instruction", "Answer": "response", "theorem_def": "input"},
        "type":"non-code"

    },
    "code_instructions": {
        "path": "iamtarun/code_instructions_120k_alpaca",
        "columns": ["instruction", "input", "output"],
        "rename": {"output": "response"},
        "type":"code"
    },
    "code_search_python": {
        "path": "Nan-Do/code-search-net-python",
        "columns": ["summary", "docstring", "code"],
        "rename": {"summary": "instruction", "docstring": "input", "code": "response"},
        "type":"code"
    },
    "instructional_code_search": {
        "path": "Nan-Do/instructional_code-search-net-python",
        "columns": ["INSTRUCTION", "RESPONSE"],
        "rename": {"INSTRUCTION": "instruction", "RESPONSE": "response"},
        "type":"code"
    },
    "wizard_evol_instruct": {
        "path": "WizardLM/WizardLM_evol_instruct_70k",
        "columns": ["output", "instruction"],
        "rename": {"output": "response"},
        "type":"code"
    },
    "lighteval_logic": {
        "path": "lighteval/logiqa_harness",
        "type":"non-code",
        "rename": {"label": "response", "context": "input", "options": "input2", "question": "instruction"}
    },
    "databricks_dolly": {
        "path": "databricks/databricks-dolly-15k",
        "columns": ["instruction", "context", "response"],
        "rename": {"context": "input"},
        "type":"non-code"
    },
    "lighteval_bool": {
        "path": "lighteval/boolq_helm",
        "columns": ["passage", "question", "answer"],
        "rename": {"passage": "input", "question": "instruction", "answer": "response"},
        "type":"non-code"
    }
}

for name, info in tqdm(datasets.items(), desc="Downloading", dynamic_ncols=True):
    path = info["path"]

    # Update tqdm description to show the current dataset name
    tqdm.write(f"Downloading {name}...")

    dataset = load_dataset(path)

    # Check which key exists and convert to pandas dataframe
    if "test" in dataset:
        df = dataset["test"].to_pandas()
    elif "train" in dataset:
        df = dataset["train"].to_pandas()
    else:
        print(f"Neither 'test' nor 'train' key found for {name}. Skipping...")
        continue

    # If specific columns are provided, filter the dataframe
    if "columns" in info:
        df = df[info["columns"]]

    # Rename the columns if rename info is provided
    if "rename" in info:
        df.rename(columns=info["rename"], inplace=True)

    # Add a source column with the path value
    df['source'] = info["path"]

    # Add a type column from the dictionary
    df['type'] = info["type"]

    # Store the dataframe with the desired variable name
    globals()[f"{name}_df"] = df

Downloading:   0%|          | 0/12 [00:00<?, ?it/s]

Downloading tiny_codes...


Downloading readme:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=info["rename"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['source'] = info["path"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['type'] = info["type"]
Downloading:   8%|▊         | 1/12 [01:34<17:16, 94.21s/it]

Downloading sciphi_textbooks...


Downloading readme:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/185M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/185M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/681845 [00:00<?, ? examples/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=info["rename"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['source'] = info["path"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['type'] = info["type"]
Downloading:  17%|█▋        | 2/12 [04:34<24:09, 144.96s/it]

Downloading open_platypus...


Downloading readme:   0%|          | 0.00/5.34k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/24926 [00:00<?, ? examples/s]

Downloading:  25%|██▌       | 3/12 [04:39<12:08, 80.95s/it]

Downloading oig...


Downloading readme:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/210289 [00:00<?, ? examples/s]

Downloading:  33%|███▎      | 4/12 [04:48<07:00, 52.50s/it]

Downloading theorem_qa...


Downloading readme:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading:  42%|████▏     | 5/12 [04:52<04:04, 34.90s/it]

Downloading code_instructions...


Downloading readme:   0%|          | 0.00/753 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/72.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/121959 [00:00<?, ? examples/s]

Downloading:  50%|█████     | 6/12 [05:01<02:36, 26.08s/it]

Downloading code_search_python...


Downloading readme:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/155M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/139M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/455243 [00:00<?, ? examples/s]

Downloading:  58%|█████▊    | 7/12 [05:54<02:55, 35.00s/it]

Downloading instructional_code_search...


Downloading readme:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/173M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/418545 [00:00<?, ? examples/s]

Downloading:  67%|██████▋   | 8/12 [06:13<01:59, 29.81s/it]

Downloading wizard_evol_instruct...


Downloading readme:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/137M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading:  75%|███████▌  | 9/12 [06:45<01:32, 30.71s/it]

Downloading lighteval_logic...


Downloading builder script:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading:  83%|████████▎ | 10/12 [06:51<00:45, 22.85s/it]

Downloading databricks_dolly...


Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading:  92%|█████████▏| 11/12 [06:54<00:16, 16.88s/it]

Downloading lighteval_bool...


Downloading builder script:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.49M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading: 100%|██████████| 12/12 [06:59<00:00, 34.92s/it]


In [None]:
def get_response_value(row):
    # Mapping of response to index
    mapping = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    index = mapping[row['response']]
    return row['input2'][index]

# Assuming df is your dataframe
lighteval_logic_df['response'] = lighteval_logic_df.apply(get_response_value, axis=1)

In [None]:
lighteval_logic_df['input'] = lighteval_logic_df.apply(lambda row: row['input'] + ', ' + ', '.join(row['input2']), axis=1)
mask = lighteval_logic_df['instruction'].str.startswith("Based on the above statement")
lighteval_logic_df.loc[mask, 'instruction'] = lighteval_logic_df.loc[mask, 'instruction'].str.replace("Based on the above statement", "Based on the following statement", n=1)
lighteval_logic_df.drop(columns="input2", inplace=True)

In [None]:
# Retrieve dataframes using globals() and construct the list of dataframes
dfs = [globals()[key + '_df'] for key in list(datasets.keys())]

# Concatenate the dataframes
result_df = pd.concat(dfs, ignore_index=True, sort=False)

# Replace NaN values with empty strings
result_df.fillna("", inplace=True)

In [None]:
result_df.shape

(3641005, 5)

In [None]:
from pandas import Series

def format_example(row: Series) -> Series:
    """
    Formats the instruction, input, and response from a dataframe row by wrapping
    them in structured text. The format varies depending on whether the
    'input' field is present and whether the 'instruction' field already starts
    with a specific phrase.

    Parameters:
    - row (Series): A pandas Series containing 'instruction', 'input', and 'response' fields.

    Returns:
    - Series: A pandas Series with formatted 'instruction', 'input', and 'response' fields.
    """

    # Check if the instruction already starts with the desired phrase
    if not row['instruction'].startswith("Below is an instruction that describes a task"):
        if pd.notna(row['input']) and row['input'].strip() != "":
            row['instruction'] = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n" + row['instruction']
        else:
            row['instruction'] = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n" + row['instruction']

    row['instruction'] = "### Instruction:\n" + row['instruction']

    if pd.notna(row['input']) and row['input'].strip() != "":
        row['input'] = "### Input:\n" + row['input']

    row['response'] = "### Response:\n" + row['response']

    return row

In [None]:
result_df.head()

Unnamed: 0,instruction,response,source,type,input
0,Develop a Python program snippet to Determine ...,Here is python code which determines high snee...,nampdn-ai/tiny-codes,code,
1,Build a Rust module snippet that Displays Extr...,Here is a possible implementation of such a fu...,nampdn-ai/tiny-codes,code,
2,Write a Ruby code snippet to Check High Food h...,Here’s some sample ruby code which implements ...,nampdn-ai/tiny-codes,code,
3,Develop a C# program snippet to Update Low Onl...,Here's some sample code that demonstrates how ...,nampdn-ai/tiny-codes,code,
4,Develop a Julia program snippet to Display Hig...,Here is a possible implementation of this func...,nampdn-ai/tiny-codes,code,


In [None]:
formatted_df = result_df.apply(format_example, axis=1)

# Count words, then count tokens.

We have a huge dataset.

Counting tokens on this will take ages.

Let's first count the number of words and if a row exceeds more than 3000 words then I will assume it will be larger than our token limit.




In [None]:
def count_words(text: str) -> int:
    """Count the number of words in a text string."""
    if pd.isna(text) or text.strip() == "":
        return 0
    return len(text.split())

formatted_df['total_word_count'] = formatted_df[['instruction', 'response', 'input']].apply(lambda row: sum(count_words(text) for text in row), axis=1)

In [None]:
filter_df = formatted_df[formatted_df['total_word_count'] <=3000]

In [39]:
filter_df[['instruction', 'response', 'input','source', 'type']].to_csv('neurips_challenge_dataset.csv', escapechar='\\')

In [None]:
from datasets import load_dataset, Features, Value

# Assuming df is your DataFrame
schema = Features({
    'instruction': Value('string'),
    'response': Value('string'),
    'input': Value('string'),
    'source': Value('string')
})


In [None]:
from datasets import load_dataset

dataset = load_dataset('csv',
                       data_files='neurips_challenge_dataset.csv',
                       features=schema)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [40]:
dataset.push_to_hub("TeamDLD/neurips_challenge_dataset")

Pushing dataset shards to the dataset hub:   0%|          | 0/17 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/215 [00:00<?, ?ba/s]