# Explore public datasets

## Goal

This notebook is a tool to explore public datasets and see whether they can be useful for fine-tuning models.

I want to know:

- Text length: how big are the texts in the dataset, measured in tokens
- Unique prompts: How many unique prompts each dataset has
- Prompt sampling: Inspect how random prompts from the dataset look

## Imports

In [None]:
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tqdm.auto import tqdm

from transformers import AutoTokenizer

# Draw an empty plot and immediately close every figure before changing rcParams.
# NOTE(review): presumably a workaround so the notebook plotting backend is
# initialised before the defaults below are set — confirm it is still needed.
plt.plot()
plt.close('all')
# Defaults applied to every figure in the notebook: size, line width and font size.
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

# Show up to 200 characters per column when displaying DataFrames.
pd.set_option('display.max_colwidth', 200)

## Code

In [None]:
# Tokenizer loaded from a local checkpoint directory; it is used below to
# measure text length in tokens.
tokenizer = AutoTokenizer.from_pretrained('/home/gbarbadillo/data/Mistral-7B-Instruct-v0.2/')

In [None]:
def text_length_distribution(df):
    """Plot a histogram of the token length of every sample in ``df``.

    For each row the ``original_text``, ``rewritten_text`` and
    ``rewrite_prompt`` columns are concatenated and tokenized with the
    module-level ``tokenizer``; the number of tokens is the sample length.

    Side effects:
        * Adds (or overwrites) the ``all_text`` and ``text_length`` columns
          of ``df`` in place.
        * Draws on the current matplotlib figure. No title is set and
          ``plt.show()`` is not called; that is left to the caller.

    Args:
        df: DataFrame with the columns ``original_text``, ``rewritten_text``
            and ``rewrite_prompt``.
    """
    text_columns = ['original_text', 'rewritten_text', 'rewrite_prompt']
    # Missing values must be replaced before concatenating: str + NaN is NaN,
    # so a single empty field would turn the whole row into the literal text
    # 'nan' and report a length of only a couple of tokens.
    texts = df[text_columns].fillna('').astype(str)
    df['all_text'] = texts['original_text'] + texts['rewritten_text'] + texts['rewrite_prompt']
    df['text_length'] = df['all_text'].apply(lambda text: len(tokenizer.tokenize(text)))
    # Fixed bin edges from 0 to 4000 tokens; longer texts are not shown in the
    # histogram (they are still counted in the ``text_length`` column).
    plt.hist(df['text_length'].values, bins=np.linspace(0, 4000, 100))
    plt.grid()
    plt.xlabel('Number of tokens')

In [None]:
def get_dataset_name(filepath):
    """Return a short display name for the dataset stored at ``filepath``.

    The name is the file name without its extension. When the file lives in
    a folder other than ``data``, that folder is used as a prefix
    (``folder/name``).

    Args:
        filepath: Path to the dataset file.

    Returns:
        The display name as a string.
    """
    # splitext removes only the final extension, so dotted stems such as
    # 'foo_v1.5_curated.csv' are not truncated at the first dot.
    name, _ = os.path.splitext(os.path.basename(filepath))
    folder = os.path.basename(os.path.dirname(filepath))
    # A bare file name has no parent folder; do not prefix it with '/'.
    if folder and folder != 'data':
        name = f'{folder}/{name}'
    return name

In [None]:
def create_datasets_summary_table(datasets, dataset_names):
    """Build one summary row per dataset.

    Every dataset must provide the ``rewrite_prompt`` and ``text_length``
    columns.

    Args:
        datasets: Iterable of DataFrames to summarise.
        dataset_names: Labels used as the index of the result, in the same
            order as ``datasets``.

    Returns:
        DataFrame with the columns ``n`` (number of rows), ``n_prompts``
        (number of distinct prompts), ``ratio`` (``n_prompts / n`` rounded to
        two decimals) and ``median_tokens`` (median of ``text_length``
        truncated to an integer).
    """
    def summarize(frame):
        size = len(frame)
        distinct_prompts = len(frame['rewrite_prompt'].unique())
        return {
            'n': size,
            'n_prompts': distinct_prompts,
            'ratio': round(distinct_prompts / size, 2),
            'median_tokens': int(frame['text_length'].median()),
        }

    summaries = [summarize(frame) for frame in datasets]
    return pd.DataFrame(summaries, index=dataset_names)

In [None]:
def sample_prompts(dataset, n=5, random_seed=7):
    """Return up to ``n`` distinct prompts from ``dataset``, sorted.

    Prompts are read from the ``rewrite_prompt`` column, converted to
    strings and stripped of surrounding whitespace before duplicates are
    removed. Missing prompts are ignored.

    Args:
        dataset: DataFrame with a ``rewrite_prompt`` column.
        n: Maximum number of prompts to return.
        random_seed: Seed of the private random generator used for sampling;
            the same seed always yields the same selection.

    Returns:
        A sorted list of prompt strings. When the dataset has ``n`` or fewer
        distinct prompts, all of them are returned.
    """
    # Drop missing values first: str(NaN) would otherwise show up as the
    # fake prompt 'nan'.
    cleaned = dataset['rewrite_prompt'].dropna().astype(str).str.strip()
    unique_prompts = np.asarray(cleaned.unique(), dtype=object)
    if n >= len(unique_prompts):
        # Same type and ordering as the sampled branch below.
        return sorted(unique_prompts)
    # A private RandomState produces the same stream as np.random.seed()
    # followed by np.random.choice(), without reseeding NumPy's global
    # generator as a side effect.
    rng = np.random.RandomState(random_seed)
    return sorted(rng.choice(unique_prompts, n, replace=False))

## Load data

In [None]:
# Collect the curated CSV files: first the ones directly inside the data
# folder, then the ones one sub-folder deeper. Each group is sorted by path.
dataset_filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/*_curated.csv'))
dataset_filepaths += sorted(glob.glob('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/*/*_curated.csv'))
# Short display name for every file, in the same order as the paths.
dataset_names = [get_dataset_name(filepath) for filepath in dataset_filepaths]
# Read every CSV into its own DataFrame, showing a progress bar.
datasets = [pd.read_csv(filepath) for filepath in tqdm(dataset_filepaths)]

In [None]:
# List the dataset files that were found, one path per line.
print('\n'.join(dataset_filepaths))

In [None]:
# List the display names of the datasets, one per line.
print('\n'.join(dataset_names))

## Token length distribution

In [None]:
# One histogram per dataset. text_length_distribution also stores the token
# counts in the 'text_length' column of each DataFrame, which the summary
# table in the next section reads.
for dataset_name, dataset in zip(dataset_names, datasets):
    text_length_distribution(dataset)
    title = f'Token length distribution of {dataset_name} (n={len(dataset)})'
    plt.title(title)
    plt.show()

## Unique prompts

Unique prompts vs. dataset length: a table with that data and also the median token length.

In [None]:
# Needs the 'text_length' column, so the token length cells above must have
# been run first.
create_datasets_summary_table(datasets, dataset_names)

Around 9 trainings, 2 days of submissions.

## Sampling prompts

In [None]:
# Print up to n unique prompts per dataset, followed by a separator line.
# sample_prompts is called with its default seed, so the selection is the
# same on every run.
n = 20
for dataset_name, dataset in zip(dataset_names, datasets):
    prompts = sample_prompts(dataset, n=n)
    print(f'\n\n\t\t{dataset_name} ({len(dataset)} samples)\n')
    print('\n'.join(prompts))
    print('\n' + '*'*100)

In [None]:
# Load one specific dataset to inspect it by hand and display its first rows.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v2.csv')
df.head()

In [None]:
# Print the prompts of 40 randomly chosen rows. No random_state is passed to
# sample(), so the selection depends on the current global random state.
print('\n'.join(df.sample(40).rewrite_prompt.values))