# Data Preparation for ML Experiments

In [1]:
import os

from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

from graildient_descent.preprocessing import preprocess_text
from graildient_descent.utils import load_data, set_random_seed

In [2]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the credentials used by the S3 load
# further down — confirm against graildient_descent.utils.load_data.
load_dotenv()
# Seed via the project helper; its return value is reused as `random_state`
# for both train_test_split calls later in this notebook.
# NOTE(review): presumably the returned value is the seed itself — confirm.
random_state = set_random_seed()

In [3]:
# Number of de-duplicated rows kept for this experiment.
N_ROWS = 25_000

# Split fractions, both expressed relative to the full N_ROWS dataset.
# Whatever is left after train and eval becomes the test set.
TRAIN_SIZE = 0.6
EVAL_SIZE = 0.2

In [4]:
# Fetch the raw sold-listings CSV from the "grailed" S3 bucket.
df = load_data(
    filename="data/raw/sold_listings.csv",
    from_s3=True,
    bucket_name="grailed",
)

In [5]:
# Drop exact duplicate rows, then keep the first N_ROWS of what remains.
# A new frame is built (no in-place mutation, explicit copy) so the column
# assignments in later cells act on an independent object, not a slice.
df = df.drop_duplicates().head(N_ROWS).copy()

# Fail loudly if the source held fewer than N_ROWS unique rows. An explicit
# raise is used because `assert` statements are removed under `python -O`,
# and the message reports how many rows were actually available.
if len(df) != N_ROWS:
    raise ValueError(
        f"Expected {N_ROWS} rows after de-duplication, got {len(df)}"
    )

In [6]:
# Run the shared text preprocessing over every free-text column, replacing
# each column with its cleaned version.
for text_column in ("item_name", "description", "hashtags"):
    df[text_column] = df[text_column].apply(preprocess_text)

In [7]:
# Two-stage split: carve off the training portion first, then divide the
# remainder between evaluation and test (60/20/20 with the current constants).

# EVAL_SIZE is a fraction of the full dataset; rescale it to a fraction of
# the hold-out portion that remains once the training rows are removed.
eval_size_adjusted = EVAL_SIZE / (1 - TRAIN_SIZE)

# Stage 1: train vs. everything else.
train_data, temp_data = train_test_split(
    df,
    train_size=TRAIN_SIZE,
    random_state=random_state,
)

# Stage 2: split the hold-out rows into eval and test.
eval_data, test_data = train_test_split(
    temp_data,
    train_size=eval_size_adjusted,
    random_state=random_state,
)

In [8]:
# Save datasets locally
save_dir = "../data/splits/25k"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (no-op if it is already there).
os.makedirs(save_dir, exist_ok=True)

# Write each split to "<name>_25k.csv"; index=False keeps the row index out
# of the files.
for split_name, split_df in (
    ("train", train_data),
    ("eval", eval_data),
    ("test", test_data),
):
    split_df.to_csv(os.path.join(save_dir, f"{split_name}_25k.csv"), index=False)