In [10]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------------
# Combined dataset to split; it is read with pd.read_csv and filtered on its "label" column.
INPUT_CSV = "/Users/hasancan/Desktop/irony_detection/datasets/additional_datasets_cleaned_combined.csv"
# Relative path, so it resolves against the kernel's current working directory.
OUTPUT_DIR = "splits_10"   # receives train/val/test CSVs for every fold: N_SPLITS * 3 = 30 files
N_SPLITS = 10   # number of folds; fold i uses chunk i of each class as its test set
VAL_RATIO_PER_CLASS = 0.10   # val rows per class = int(ratio * full per-class row count), not 10% of the remainder
RANDOM_SEED = 42   # base seed; the per-fold shuffles add offsets (i, 100 + i, 200 + i, 300 + i) to it

In [11]:
# Load the combined dataset into memory.
df = pd.read_csv(INPUT_CSV)

# 1) Partition the rows by class: label 1 -> df_irony, label 0 -> df_lit.
#    Rows carrying any other label value end up in neither frame.
#    Each subset gets a fresh 0..n-1 index.
df_irony = df.loc[df["label"].eq(1)].reset_index(drop=True)
df_lit = df.loc[df["label"].eq(0)].reset_index(drop=True)

print(
    "Original counts:",
    len(df_irony), "ironic;",
    len(df_lit), "literal",
)

Original counts: 2222 ironic; 2396 literal


In [12]:
# Balance the two classes by downsampling the larger one to the size of the
# smaller one.
#
# DataFrame.sample(n=...) without replacement raises ValueError when n exceeds
# the number of rows, so the target has to be the smaller class size rather
# than unconditionally len(df_irony).
target_n = min(len(df_irony), len(df_lit))

if len(df_irony) > target_n:
    # Ironic is the majority class: trim it down to the literal count.
    df_irony = df_irony.sample(n=target_n, random_state=RANDOM_SEED).reset_index(drop=True)

# Literal rows always go through .sample() (a pure shuffle when the sizes
# already match), so whenever literal is the larger-or-equal class the selected
# rows and their order are exactly what the unconditional downsample produced.
df_lit = df_lit.sample(n=target_n, random_state=RANDOM_SEED).reset_index(drop=True)

print("After balancing:", len(df_irony), "ironic;", len(df_lit), "literal")

After balancing: 2222 ironic; 2222 literal


In [13]:
import os

# 2) shuffle each class and split it into N_SPLITS chunks
# NOTE: df_irony / df_lit are overwritten with their shuffled versions, so
# re-running this cell without re-running the cells that build them shuffles
# the already-shuffled frames again and yields different folds.
df_irony = df_irony.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
df_lit   = df_lit.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Split the row *positions* and select them with .iloc instead of handing the
# DataFrame itself to np.array_split: that route calls DataFrame.swapaxes,
# which pandas has deprecated (it emits a FutureWarning on every call).
# The chunk boundaries are unchanged: sizes are nearly equal and the first
# len(frame) % N_SPLITS chunks carry one extra row.
irony_chunks = [df_irony.iloc[rows] for rows in np.array_split(np.arange(len(df_irony)), N_SPLITS)]
lit_chunks   = [df_lit.iloc[rows] for rows in np.array_split(np.arange(len(df_lit)), N_SPLITS)]

# Validation rows per class: a fraction of the FULL class size (not of the
# post-test remainder), truncated to an integer, e.g. 222 for 2222.
val_per_class = int(len(df_irony) * VAL_RATIO_PER_CLASS)

os.makedirs(OUTPUT_DIR, exist_ok=True)

for i in range(N_SPLITS):
    # -------------------
    # TEST = chunk i of each class
    # -------------------
    test_irony = irony_chunks[i]
    test_lit   = lit_chunks[i]
    test_df = pd.concat([test_irony, test_lit], ignore_index=True)

    # -------------------
    # REMAINDER = every chunk except chunk i
    # -------------------
    remaining_irony = pd.concat(
        [chunk for j, chunk in enumerate(irony_chunks) if j != i],
        ignore_index=True
    )
    remaining_lit = pd.concat(
        [chunk for j, chunk in enumerate(lit_chunks) if j != i],
        ignore_index=True
    )

    # Shuffle the remainder with a fold-specific seed so the validation rows
    # are not always taken from the same chunks.
    remaining_irony = remaining_irony.sample(frac=1, random_state=RANDOM_SEED + i).reset_index(drop=True)
    remaining_lit   = remaining_lit.sample(frac=1, random_state=RANDOM_SEED + i).reset_index(drop=True)

    # -------------------
    # VAL = first val_per_class rows of each class; TRAIN = the rest
    # -------------------
    val_irony = remaining_irony.iloc[:val_per_class]
    train_irony = remaining_irony.iloc[val_per_class:]

    val_lit = remaining_lit.iloc[:val_per_class]
    train_lit = remaining_lit.iloc[val_per_class:]

    val_df = pd.concat([val_irony, val_lit], ignore_index=True)
    train_df = pd.concat([train_irony, train_lit], ignore_index=True)

    # Final shuffle inside each split so the two classes are interleaved
    # (the concatenations above put all ironic rows before all literal rows).
    train_df = train_df.sample(frac=1, random_state=RANDOM_SEED + 100 + i).reset_index(drop=True)
    val_df   = val_df.sample(frac=1, random_state=RANDOM_SEED + 200 + i).reset_index(drop=True)
    test_df  = test_df.sample(frac=1, random_state=RANDOM_SEED + 300 + i).reset_index(drop=True)

    # Save the three files of this fold (existing files with these names are overwritten).
    train_df.to_csv(os.path.join(OUTPUT_DIR, f"train_{i}.csv"), index=False)
    val_df.to_csv(os.path.join(OUTPUT_DIR, f"val_{i}.csv"), index=False)
    test_df.to_csv(os.path.join(OUTPUT_DIR, f"test_{i}.csv"), index=False)

    # Per-fold summary: total rows and per-class counts of each split.
    print(f"Fold {i}:",
          f"train={len(train_df)} ({(train_df['label']==1).sum()} irony / {(train_df['label']==0).sum()} literal)",
          f"val={len(val_df)} ({(val_df['label']==1).sum()} irony / {(val_df['label']==0).sum()} literal)",
          f"test={len(test_df)} ({(test_df['label']==1).sum()} irony / {(test_df['label']==0).sum()} literal)",
          )

print("Done.")

Fold 0: train=3554 (1777 irony / 1777 literal) val=444 (222 irony / 222 literal) test=446 (223 irony / 223 literal)
Fold 1: train=3554 (1777 irony / 1777 literal) val=444 (222 irony / 222 literal) test=446 (223 irony / 223 literal)
Fold 2: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 literal) test=444 (222 irony / 222 literal)
Fold 3: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 literal) test=444 (222 irony / 222 literal)
Fold 4: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 literal) test=444 (222 irony / 222 literal)
Fold 5: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 literal) test=444 (222 irony / 222 literal)
Fold 6: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 literal) test=444 (222 irony / 222 literal)
Fold 7: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 literal) test=444 (222 irony / 222 literal)
Fold 8: train=3556 (1778 irony / 1778 literal) val=444 (222 irony / 222 

  return bound(*args, **kwds)


In [5]:
pwd

'/Users/hasancan'

In [16]:
import pandas as pd
import numpy as np
import os

# ---------------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------------
# Combined dataset to split; it is read with pd.read_csv and filtered on its "label" column.
INPUT_CSV = "/Users/hasancan/Desktop/irony_detection/datasets/all_datasets_combined_irony.csv" 
# NOTE(review): same relative directory and file names (train_i/val_i/test_i) as the
# earlier split cell, which used a different INPUT_CSV. If both run from the same
# working directory, this run overwrites those files - confirm this is intended.
OUTPUT_DIR = "splits_10"                         # receives N_SPLITS * 3 = 30 files (train/val/test per fold)
N_SPLITS = 10   # number of folds; fold i uses chunk i of each class as its test set
VAL_RATIO_PER_CLASS = 0.10   # val rows per class = int(ratio * full per-class row count), not 10% of the remainder
RANDOM_SEED = 42   # base seed; the per-fold shuffles add offsets (i, 100 + i, 200 + i, 300 + i) to it
# ---------------------------------------------------------------------

# Load the combined dataset into memory.
df = pd.read_csv(INPUT_CSV)

# 1) split by label: 1 -> ironic, 0 -> literal (rows with any other value are dropped)
df_irony = df[df["label"] == 1].reset_index(drop=True)   # 2549 rows in the recorded run
df_lit   = df[df["label"] == 0].reset_index(drop=True)   # 2723 rows in the recorded run

print("Original counts:",
      len(df_irony), "ironic;",
      len(df_lit), "literal")

# 2) balance the classes by downsampling the larger one to the smaller one's size.
# DataFrame.sample(n=...) without replacement raises ValueError when n exceeds
# the number of rows, so the target must be the smaller class size rather than
# unconditionally len(df_irony).
target_n = min(len(df_irony), len(df_lit))  # 2549 in the recorded run

if len(df_irony) > target_n:
    # Ironic is the majority class: trim it down to the literal count.
    df_irony = df_irony.sample(n=target_n, random_state=RANDOM_SEED).reset_index(drop=True)

# Literal rows always go through .sample() (a pure shuffle when the sizes
# already match), so whenever literal is the larger-or-equal class the result
# is exactly what the unconditional downsample produced.
df_lit = df_lit.sample(n=target_n, random_state=RANDOM_SEED).reset_index(drop=True)

print("After balancing:",
      len(df_irony), "ironic;",
      len(df_lit), "literal")

# total after balancing = 2 * target_n (2549 * 2 = 5098 in the recorded run)

# 3) shuffle each class
df_irony = df_irony.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
df_lit   = df_lit.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# 4) split each class into N_SPLITS chunks
# Split the row *positions* and select them with .iloc instead of handing the
# DataFrame itself to np.array_split: that route calls DataFrame.swapaxes,
# which pandas has deprecated (it emits a FutureWarning on every call).
# The chunk boundaries are unchanged: sizes are nearly equal and the first
# len(frame) % N_SPLITS chunks carry one extra row.
irony_chunks = [df_irony.iloc[rows] for rows in np.array_split(np.arange(len(df_irony)), N_SPLITS)]
lit_chunks   = [df_lit.iloc[rows] for rows in np.array_split(np.arange(len(df_lit)), N_SPLITS)]

# val size per class: a fraction of the FULL class size (not per-chunk, not of
# the post-test remainder), truncated to an integer; int(2549 * 0.1) = 254
val_per_class = int(len(df_irony) * VAL_RATIO_PER_CLASS)

os.makedirs(OUTPUT_DIR, exist_ok=True)

for i in range(N_SPLITS):
    # ---------- TEST = chunk i of each class (balanced) ----------
    test_irony = irony_chunks[i]
    test_lit   = lit_chunks[i]
    test_df = pd.concat([test_irony, test_lit], ignore_index=True)

    # ---------- REMAINDER = every chunk except chunk i ----------
    remaining_irony = pd.concat(
        [chunk for j, chunk in enumerate(irony_chunks) if j != i],
        ignore_index=True
    )
    remaining_lit = pd.concat(
        [chunk for j, chunk in enumerate(lit_chunks) if j != i],
        ignore_index=True
    )

    # shuffle the remainder with a fold-specific seed so val/train differ per fold
    remaining_irony = remaining_irony.sample(frac=1, random_state=RANDOM_SEED + i).reset_index(drop=True)
    remaining_lit   = remaining_lit.sample(frac=1, random_state=RANDOM_SEED + i).reset_index(drop=True)

    # ---------- VAL = first val_per_class rows of each class; TRAIN = the rest ----------
    val_irony = remaining_irony.iloc[:val_per_class]
    train_irony = remaining_irony.iloc[val_per_class:]

    val_lit = remaining_lit.iloc[:val_per_class]
    train_lit = remaining_lit.iloc[val_per_class:]

    val_df = pd.concat([val_irony, val_lit], ignore_index=True)
    train_df = pd.concat([train_irony, train_lit], ignore_index=True)

    # final shuffle inside each split so the two classes are interleaved
    # (the concatenations above put all ironic rows before all literal rows)
    train_df = train_df.sample(frac=1, random_state=RANDOM_SEED + 100 + i).reset_index(drop=True)
    val_df   = val_df.sample(frac=1, random_state=RANDOM_SEED + 200 + i).reset_index(drop=True)
    test_df  = test_df.sample(frac=1, random_state=RANDOM_SEED + 300 + i).reset_index(drop=True)

    # save the three files of this fold (existing files with these names are overwritten)
    train_df.to_csv(os.path.join(OUTPUT_DIR, f"train_{i}.csv"), index=False)
    val_df.to_csv(os.path.join(OUTPUT_DIR, f"val_{i}.csv"), index=False)
    test_df.to_csv(os.path.join(OUTPUT_DIR, f"test_{i}.csv"), index=False)

    # per-fold summary: total rows and per-class counts of each split
    print(
        f"Fold {i}: "
        f"train={len(train_df)} "
        f"({(train_df['label']==1).sum()} irony / {(train_df['label']==0).sum()} literal) | "
        f"val={len(val_df)} "
        f"({(val_df['label']==1).sum()} irony / {(val_df['label']==0).sum()} literal) | "
        f"test={len(test_df)} "
        f"({(test_df['label']==1).sum()} irony / {(test_df['label']==0).sum()} literal)"
    )

print("Done.")

Original counts: 2549 ironic; 2723 literal
After balancing: 2549 ironic; 2549 literal
Fold 0: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 1: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 2: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 3: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 4: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 5: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 6: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 literal) | test=510 (255 irony / 255 literal)
Fold 7: train=4080 (2040 irony / 2040 literal) | val=508 (254 irony / 254 

  return bound(*args, **kwds)
