In [6]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [7]:
# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# row index saved into the CSV — confirm whether index_col=0 is wanted here.
df = pd.read_csv('fairjob.csv')

In [8]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1072226, 56)

In [9]:
# Preview the first rows to check column names and value ranges.
df.head()

Unnamed: 0,click,protected_attribute,senior,displayrandom,rank,user_id,impression_id,product_id,cat0,cat1,...,num41,num42,num43,num44,num45,num46,num47,num48,num49,num50
0,0,0,1,1,1,31020,175316,45953,6,6,...,-0.333033,-0.542801,0.074884,-0.751637,-0.299223,-0.647541,-0.252768,-0.577173,-0.264359,-0.737299
1,0,0,1,0,3,515,129699,10569,8,8,...,-0.333033,-0.542801,-0.258659,0.503731,-0.523668,1.088225,0.935563,-0.577173,0.50779,0.610287
2,0,1,1,0,1,12859,134573,51550,6,6,...,-0.333033,-0.542801,-0.236209,-0.11151,0.084159,0.422154,-0.160055,-0.577173,-0.264359,-0.063506
3,0,1,1,0,1,18781,84169,17843,3,3,...,-0.333033,-0.542801,-0.393358,-0.987701,-0.523668,-0.778977,-0.34548,-0.577173,4.368537,-0.99645
4,0,0,1,0,9,19283,76506,11354,8,8,...,-0.333033,-0.542801,1.553377,-0.636441,-0.523668,-0.541088,-0.34548,-0.577173,-0.264359,-0.581808


In [15]:
def _write_stratified_splits(frame, labels, suffix, output_dir, rng):
    """Split ``frame`` into train/dev/test (90% / 5% / 5%) and write one CSV per split.

    Args:
        frame: DataFrame holding the rows to split.
        labels: 1-D array-like with one stratification label per row of
            ``frame``, in the same row order.
        suffix: File-name suffix; files are written as ``train_<suffix>.csv``,
            ``dev_<suffix>.csv`` and ``test_<suffix>.csv``.
        output_dir: Directory the three files are written to.
        rng: ``numpy.random.RandomState`` passed to ``train_test_split`` as
            ``random_state``.
    """
    labels = np.asarray(labels)
    # Split row *positions* rather than the frame itself: this keeps the labels
    # addressable for the second split without relying on the frame's index
    # being unique, and avoids materialising an intermediate hold-out frame.
    positions = np.arange(len(frame))
    train_pos, holdout_pos = train_test_split(
        positions, test_size=0.1, stratify=labels, random_state=rng
    )
    # The 10% hold-out is halved into dev and test, again stratified.
    dev_pos, test_pos = train_test_split(
        holdout_pos, test_size=0.5, stratify=labels[holdout_pos], random_state=rng
    )

    for split_name, split_pos in (("train", train_pos), ("dev", dev_pos), ("test", test_pos)):
        target = os.path.join(output_dir, f"{split_name}_{suffix}.csv")
        frame.iloc[split_pos].to_csv(target, index=False)


def stratified_split_and_save(df, output_dir="output", seed=42):
    """Write stratified train/dev/test CSVs for six subsets of ``df`` (18 files).

    The subsets and their stratification labels are:

    1. ``male`` / ``female`` (``protected_attribute > 0`` is 'male', anything
       else 'female'), stratified on ``click``.
    2. ``clicked`` / ``not_clicked`` (``click == 1`` / ``click == 0``),
       stratified on the derived gender label.
    3. ``rank_above_10`` / ``rank_below_10`` (``rank > 10`` / everything else,
       so a rank of exactly 10 lands in ``below_10``), stratified jointly on
       ``click`` and the derived gender label.

    The derived gender and rank labels are held in standalone arrays: ``df`` is
    never modified, and every output file contains exactly the columns of
    ``df``.

    Args:
        df: DataFrame with at least the columns ``protected_attribute``,
            ``click`` and ``rank``.
        output_dir: Directory to write into; created if it does not exist.
        seed: Seed for the local random state shared by all splits, so repeated
            calls on the same data produce the same files.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
        ValueError: Propagated from ``train_test_split`` when a subset cannot
            be split with the requested stratification.
    """
    os.makedirs(output_dir, exist_ok=True)
    # A local RandomState keeps the run reproducible without reseeding the
    # global NumPy generator as a side effect.
    rng = np.random.RandomState(seed)

    click = df['click'].to_numpy()
    gender = np.where(df['protected_attribute'].to_numpy() > 0, 'male', 'female')
    rank_category = np.where(df['rank'].to_numpy() > 10, 'above_10', 'below_10')
    # One composite label per row ("<click> <gender>") for joint stratification.
    click_and_gender = np.char.add(np.char.add(click.astype(str), ' '), gender)

    # 1. Gender-Based Split
    for gender_name in ('male', 'female'):
        mask = gender == gender_name
        _write_stratified_splits(df[mask], click[mask], gender_name, output_dir, rng)

    # 2. Click-Based Split
    for click_status, click_value in (('clicked', 1), ('not_clicked', 0)):
        mask = click == click_value
        _write_stratified_splits(df[mask], gender[mask], click_status, output_dir, rng)

    # 3. Rank-Based Split
    for rank_name in ('above_10', 'below_10'):
        mask = rank_category == rank_name
        _write_stratified_splits(
            df[mask], click_and_gender[mask], f"rank_{rank_name}", output_dir, rng
        )

    print("All 18 CSV files have been created and saved.")

In [16]:
# Folder that receives the split CSVs; create it up front so the writes in
# stratified_split_and_save cannot fail on a missing directory.
output_directory = "splits"
os.makedirs(output_directory, exist_ok=True)

stratified_split_and_save(df, output_directory)

All 18 CSV files have been created and saved.


In [20]:
def print_dataset_sizes(output_dir="output"):
    """Print the number of data rows in every ``.csv`` file directly inside ``output_dir``.

    Files are reported in alphabetical order so the listing is identical from
    run to run (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        output_dir: Directory to scan; sub-directories are not descended into.
    """
    csv_names = sorted(name for name in os.listdir(output_dir) if name.endswith(".csv"))

    # Each file is parsed in full, so this can be slow for very large CSVs.
    size_summary = [
        (name, len(pd.read_csv(os.path.join(output_dir, name))))
        for name in csv_names
    ]

    print("Dataset Sizes:")
    for name, size in size_summary:
        print(f"{name}: {size}")

In [21]:
# Report the row count of every CSV found in the splits/ folder.
print_dataset_sizes(output_dir="splits/")

Dataset Sizes:
test_male.csv: 26806
test_rank_above_10.csv: 9958
dev_female.csv: 26806
train_not_clicked.csv: 958263
dev_clicked.csv: 374
dev_not_clicked.csv: 53237
test_rank_below_10.csv: 43655
dev_rank_below_10.csv: 43654
train_female.csv: 482501
train_rank_below_10.csv: 785773
dev_male.csv: 26806
test_not_clicked.csv: 53237
dev_rank_above_10.csv: 9957
test_clicked.csv: 375
train_rank_above_10.csv: 179229
train_male.csv: 482501
train_clicked.csv: 6740
test_female.csv: 26806
