1. **Extract** all data from the source JSON dataset.
2. **Transform** data into inputs/targets.
3. **Split** inputs/targets into train/test sets.
4. **Sample** a subset of the training inputs/targets to use as test data.
5. **Load** the resulting inputs/targets into parquet files.

# IMPORTS

## schemas
```python

class MetadataSchema(Schema):
    """Schema for metadata in outputs.

    Declares two string-typed columns, ``timestamp`` and ``model_version``,
    each using a default ``pa.Field()`` (no extra constraints are set here).
    """

    # NOTE(review): the timestamp is declared as a string, not a datetime;
    # the expected format is not visible here -- confirm with the producer.
    timestamp: papd.Series[padt.String] = pa.Field()
    # Model version identifier, stored as a string.
    model_version: papd.Series[padt.String] = pa.Field()


class InputsSchema(Schema):
    """Schema for validating large string inputs.

    Declares a single string-typed column named ``input``.
    """

    # The column name is part of the data contract, hence the builtin-like name.
    input: papd.Series[padt.String] = pa.Field()


class OutputsSchema(Schema):
    """Schema for structured JSON outputs.

    Declares a string-typed ``response`` column and an object-typed
    ``metadata`` column.
    """

    # Response text, stored as a string.
    response: papd.Series[padt.String] = pa.Field()
    # NOTE(review): object dtype, so the per-row structure is not enforced here;
    # presumably it follows MetadataSchema -- confirm against the producer.
    metadata: papd.Series[padt.Object] = pa.Field()


class TargetsSchema(Schema):
    """Schema for the project target.

    Declares two string-typed columns: ``input`` and ``response``.
    """

    # The column name is part of the data contract, hence the builtin-like name.
    input: papd.Series[padt.String] = pa.Field()
    # Expected response text for the row.
    response: papd.Series[padt.String] = pa.Field()

```

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the raw dataset on the Hugging Face hub (fetched over the network).
DATASET_URL = "hf://datasets/Vezora/Tested-143k-Python-Alpaca/143k-Tested-Python-Alpaca-Vezora.json"

# Read the whole JSON file into one frame and preview its first rows.
df_llm = pd.read_json(DATASET_URL)
df_llm.head()

In [13]:
# Inputs: discard the raw 'input' and 'output' columns, then expose
# 'instruction' under the column name 'input'.
df_input = (
    df_llm
    .drop(columns=['input', 'output'])
    .rename(columns={'instruction': 'input'})
)

# Targets: discard 'instruction', then expose 'output' as 'response'.
df_target = (
    df_llm
    .drop(columns=['instruction'])
    .rename(columns={'output': 'response'})
)

# CONFIGS

## Extract

## Transform

## Split

In [14]:
# Split settings, passed to train_test_split as `shuffle=` and `test_size=`.
# With shuffling disabled the rows are split in their existing order.
# NOTE(review): "time-sensitive" suggests row order is meaningful -- confirm
# that this holds for the dataset loaded in this notebook.
SHUFFLE = False # time-sensitive
# Fraction of the rows assigned to the test partition.
TEST_SIZE = 0.2

## Sample

In [15]:
# Sampling settings, passed to DataFrame.sample as `frac=` and `random_state=`.
# Fraction of the training rows drawn into the sample files.
SAMPLE_RATIO = 0.15
# Fixed seed so the drawn sample is reproducible between runs.
SAMPLE_RANDOM_STATE = 0

## Load

In [16]:
# Parquet destinations for the train/test partitions. The paths are relative,
# so they resolve against the directory the notebook is run from.
INPUTS_TRAIN_FILE = "../data/inputs_train.parquet"
INPUTS_TEST_FILE = "../data/inputs_test.parquet"
TARGETS_TRAIN_FILE = "../data/targets_train.parquet"
TARGETS_TEST_FILE = "../data/targets_test.parquet"
# Parquet destinations for the sampled subsets, kept under the tests folder.
INPUTS_SAMPLE_FILE = "../tests/data/inputs_sample.parquet"
TARGETS_SAMPLE_FILE = "../tests/data/targets_sample.parquet"

# SPLIT

In [None]:
# Split inputs and targets in a single call so both frames are partitioned
# with the same settings; the result is ordered as
# (inputs_train, inputs_test, targets_train, targets_test).
_partitions = train_test_split(df_input, df_target, shuffle=SHUFFLE, test_size=TEST_SIZE)
inputs_train, inputs_test, targets_train, targets_test = _partitions

# Show the shape of every partition as the cell output.
tuple(_part.shape for _part in _partitions)

# SAMPLE

In [None]:
# Both frames are sampled with the same fraction and seed.
# NOTE(review): presumably this keeps the sampled inputs and targets row-aligned;
# verify that both frames share the same length and index.
_sample_kwargs = {"frac": SAMPLE_RATIO, "random_state": SAMPLE_RANDOM_STATE}
inputs_train_sample = inputs_train.sample(**_sample_kwargs)
targets_train_sample = targets_train.sample(**_sample_kwargs)

# Show the shape of both samples as the cell output.
(inputs_train_sample.shape, targets_train_sample.shape)

# LOAD

In [19]:
# Write each frame to its configured parquet destination, in the same order
# as the individual writes: train/test partitions first, then the samples.
_parquet_outputs = [
    (inputs_train, INPUTS_TRAIN_FILE),
    (inputs_test, INPUTS_TEST_FILE),
    (targets_train, TARGETS_TRAIN_FILE),
    (targets_test, TARGETS_TEST_FILE),
    (inputs_train_sample, INPUTS_SAMPLE_FILE),
    (targets_train_sample, TARGETS_SAMPLE_FILE),
]
for _frame, _destination in _parquet_outputs:
    _frame.to_parquet(_destination)