In [None]:
#Kagggle installation and Semeval dataset download setup
!pip -q install kaggle

import os
import shutil
from google.colab import files

os.makedirs("/root/.kaggle", exist_ok=True)
print("Upload kaggle.json")
uploaded = files.upload()

fname = next(iter(uploaded.keys()))
shutil.move(fname, "/root/.kaggle/kaggle.json")
os.chmod("/root/.kaggle/kaggle.json", 0o600)

!kaggle --version
!kaggle datasets list -s "semeval task 13" | head -n 10

SLUG = "daniilor/semeval-2026-task13"
TARGET = "/content/semeval_task13"
!mkdir -p "$TARGET"
!kaggle datasets download -d "$SLUG" -p "$TARGET"
!unzip -o "$TARGET"/*.zip -d "$TARGET"


In [None]:
#set up paths for both subtask A and subtask B
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Folder the Kaggle archive was extracted into.
ROOT = Path(TARGET)

# The archive may unpack into an extra "SemEval-2026-Task13" folder;
# use it when present, otherwise fall back to ROOT itself.
BASE_DIR = ROOT / "SemEval-2026-Task13"
if not BASE_DIR.exists():
    BASE_DIR = ROOT

# Per-subtask data folders.
TASK_A_DIR, TASK_B_DIR = BASE_DIR / "task_a", BASE_DIR / "task_b"

print("BASE_DIR:", BASE_DIR)
print("TASK_A_DIR:", TASK_A_DIR)
print("TASK_B_DIR:", TASK_B_DIR)

# List what each subtask folder contains.
print("\nFiles in task_a:")
for entry in TASK_A_DIR.iterdir():
    print(" -", entry.name)

print("\nFiles in task_b:")
for entry in TASK_B_DIR.iterdir():
    print(" -", entry.name)


In [None]:
# Load the train / validation / test parquet files for both subtasks.

# Subtask A splits
train_a = pd.read_parquet(TASK_A_DIR / "task_a_training_set_1.parquet")
val_a = pd.read_parquet(TASK_A_DIR / "task_a_validation_set.parquet")
test_a = pd.read_parquet(TASK_A_DIR / "task_a_test_set_sample.parquet")

# Subtask B splits
train_b = pd.read_parquet(TASK_B_DIR / "task_b_training_set.parquet")
val_b = pd.read_parquet(TASK_B_DIR / "task_b_validation_set.parquet")
test_b = pd.read_parquet(TASK_B_DIR / "task_b_test_set_sample.parquet")

# Report (rows, columns) for every split, one task at a time.
for heading, splits in (
    ("Task A shapes:", (train_a, val_a, test_a)),
    ("\nTask B shapes:", (train_b, val_b, test_b)),
):
    print(heading)
    for tag, frame in zip(("Train:", "Val  :", "Test :"), splits):
        print(tag, frame.shape)


In [None]:
#data exploration function

def explore_dataset(df, name: str):
    """
    Print an overview of a DataFrame and hand the same DataFrame back.

    The overview covers shape, column names, per-column missing-value
    counts and dtypes. When a "label" column exists, the count and the
    percentage of rows for each label (ordered by label value) are printed.

    Args:
        df: DataFrame to describe.
        name: Title used in the banner (upper-cased when printed).

    Returns:
        The DataFrame that was passed in, unchanged.
    """
    rule = "=" * 30
    print("\n" + rule)
    print(f"{name.upper()} set analysis")
    print(rule)

    print(f"shape: {df.shape}")
    print(f"columns: {list(df.columns)}")

    print("\nmissing values:")
    print(df.isna().sum())

    print("\ndata types:")
    print(df.dtypes)

    # Nothing more to report when the frame carries no labels.
    if "label" not in df.columns:
        return df

    print("\nlabel distribution:")
    total_rows = len(df)
    per_label = df["label"].value_counts().sort_index()
    for label, count in per_label.items():
        percentage = (count / total_rows) * 100
        print(f"  {label}: {count:>6} samples ({percentage:5.1f}%)")

    return df


def analyze_column(df, column_name: str, dataset_name: str):
    """
    Print how often each value of a column occurs, with percentages.

    Does nothing when the column is absent. Columns with at most 10
    distinct values are listed in full; otherwise only the 5 most
    frequent values are shown. Percentages are relative to the total
    number of rows in ``df``.

    Args:
        df: DataFrame to inspect.
        column_name: Column whose values are counted.
        dataset_name: Label printed in the heading.
    """
    if column_name not in df.columns:
        return

    series = df[column_name]
    distinct = series.nunique()
    total_rows = len(df)

    print(f"\n{column_name.upper()} - {dataset_name}:")
    print(f"unique values: {distinct}")

    frequencies = series.value_counts()

    # High-cardinality column: only the most frequent entries are shown.
    if distinct > 10:
        print("top 5 values:")
        for value, count in frequencies.head(5).items():
            percentage = (count / total_rows) * 100
            print(f"      {value}: {count:>6} ({percentage:5.1f}%)")
        return

    for value, count in frequencies.items():
        percentage = (count / total_rows) * 100
        print(f"    {value}: {count:>6} ({percentage:5.1f}%)")


def add_code_stats(df):
    """
    Return a copy of ``df`` with two size columns derived from "code".

    ``code_length`` is the number of characters in each "code" string and
    ``line_count`` is its number of newline characters plus one. The
    input DataFrame is left untouched.
    """
    source = df["code"]
    return df.assign(
        code_length=source.str.len(),
        line_count=source.str.count("\n") + 1,
    )


def analyze_code_content(df, name: str):
    """
    Print min / max / mean / median of code length and line count.

    The statistics are computed on the columns produced by
    ``add_code_stats``.

    Args:
        df: DataFrame with a "code" column.
        name: Heading printed before the statistics.

    Returns:
        The copy of ``df`` returned by ``add_code_stats``, i.e. including
        the ``code_length`` and ``line_count`` columns.
    """
    print(f"\n{name}:")
    enriched = add_code_stats(df)

    sections = (
        ("code length (characters):", "code_length"),
        ("line count:", "line_count"),
    )
    for heading, column in sections:
        values = enriched[column]
        print(heading)
        print(f"Min   : {values.min():.0f}")
        print(f"Max   : {values.max():.0f}")
        print(f"Mean  : {values.mean():.1f}")
        print(f"Median: {values.median():.1f}")

    return enriched


def compare_human_machine(df, name: str):
    """
    For Task A (binary labels 0/1): compare human vs machine subsets.

    Rows with label 0 are treated as human-written and rows with label 1
    as machine-generated. Prints the size of each subset, the mean and
    median of code length and line count per subset and, when a
    "language" column exists, the per-language row counts of each subset.

    Args:
        df: DataFrame with "code" and "label" columns.
        name: Heading printed before the comparison.

    Returns:
        None. If ``df`` has no "label" column a notice is printed and the
        comparison is skipped.
    """
    # Unlabeled data cannot be split; skip instead of raising KeyError.
    if "label" not in df.columns:
        print(f"\n{name}: no 'label' column; skipping human vs machine comparison")
        return

    df = add_code_stats(df)

    human_df = df[df["label"] == 0]
    machine_df = df[df["label"] == 1]

    print(f"\n{name}:")
    print(f"human samples   : {len(human_df):,}")
    print(f"machine samples : {len(machine_df):,}")

    # Same report layout for both metrics; the group tags are padded to equal
    # width so every line reads "<group> - mean: ..., median: ...".
    metrics = (
        ("\ncode length comparison:", "code_length"),
        ("\nline count comparison:", "line_count"),
    )
    groups = (("human  ", human_df), ("machine", machine_df))
    for heading, column in metrics:
        print(heading)
        for tag, subset in groups:
            print(f"{tag} - mean: {subset[column].mean():.1f}, "
                  f"median: {subset[column].median():.1f}")

    if "language" in df.columns:
        print("\nlanguage distribution by label:")
        human_lang = human_df["language"].value_counts()
        machine_lang = machine_df["language"].value_counts()

        for lang in df["language"].unique():
            # .get(..., 0): a language may be absent from one of the subsets.
            human_count = human_lang.get(lang, 0)
            machine_count = machine_lang.get(lang, 0)
            print(f"{lang}: human={human_count}, machine={machine_count}")


def analyze_generators_task_b(df, name: str):
    """
    For Task B: print how rows are spread over generators and label IDs.

    Each section is printed only when its column ("generator" or
    "label") exists. Generators are listed in ``value_counts`` order,
    label IDs in ascending label order. Percentages are relative to the
    total number of rows in ``df``.

    Args:
        df: DataFrame to summarise.
        name: Heading printed before the distributions.
    """
    print(f"\n{name}: generator and label distribution")
    total_rows = len(df)

    def _report(header, counts):
        # One line per value: count right-aligned, share of all rows.
        print(header)
        for key, n in counts.items():
            share = n / total_rows * 100
            print(f"  {key}: {n:>6} ({share:5.1f}%)")

    if "generator" in df.columns:
        _report("\ngenerator value counts:", df["generator"].value_counts())

    if "label" in df.columns:
        _report("\nlabel ID value counts:", df["label"].value_counts().sort_index())


In [None]:
# Exploration of Task A: overview, categorical columns, code size, human vs machine.

print("TASK A: BASIC OVERVIEW")
train_a = explore_dataset(train_a, "Task A - Training")
val_a = explore_dataset(val_a, "Task A - Validation")
test_a = explore_dataset(test_a, "Task A - Test")

print("\nCOLUMN ANALYSIS")
# For each column, report the three splits in train / validation / test order.
for col in ["language", "generator", "domain"]:
    for frame, tag in (
        (train_a, "Task A - Training"),
        (val_a, "Task A - Validation"),
        (test_a, "Task A - Test"),
    ):
        analyze_column(frame, col, tag)

print("\nCODE CONTENT ANALYSIS")
# The returned frames carry code_length / line_count, which the plots below use.
train_a = analyze_code_content(train_a, "Task A - Training set")
val_a = analyze_code_content(val_a, "Task A - Validation set")
test_a = analyze_code_content(test_a, "Task A - Test set")

print("\nHUMAN VS MACHINE COMPARISON ")
# Only the training and validation splits are compared here.
for frame, tag in (
    (train_a, "Task A - Training set"),
    (val_a, "Task A - Validation set"),
):
    compare_human_machine(frame, tag)


In [None]:
# Visualization of Task A: label shares, language counts and code length.

plt.style.use("default")
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle("SemEval Task 13 - Subtask A Data Analysis", fontsize=16, fontweight="bold")

# Label distribution (Training and Validation).
# value_counts() orders by frequency, so the slice order would depend on which
# class is larger. Pin it to label 0 (human), label 1 (machine) so each slice
# matches its caption; fill_value=0 keeps both entries if a class is absent.
label_order = [0, 1]
label_data = [
    train_a["label"].value_counts().reindex(label_order, fill_value=0),
    val_a["label"].value_counts().reindex(label_order, fill_value=0),
]
titles = ["Training Set", "Validation Set"]

for i, (data, title) in enumerate(zip(label_data, titles)):
    axes[0, i].pie(
        data.values,
        labels=["Human", "Machine"],
        autopct="%1.1f%%",
        startangle=90,
    )
    axes[0, i].set_title(f"{title}\nlabel Distribution", fontweight="bold")

# The third panel of the top row is intentionally left empty.
axes[0, 2].axis("off")

# Language distribution
language_data_train = train_a["language"].value_counts()
language_data_val = val_a["language"].value_counts()

axes[1, 0].bar(language_data_train.index, language_data_train.values)
axes[1, 0].set_title("Training Set - Languages", fontweight="bold")
axes[1, 0].set_ylabel("Count")
axes[1, 0].tick_params(axis="x", rotation=45)

axes[1, 1].bar(language_data_val.index, language_data_val.values)
axes[1, 1].set_title("Validation Set - Languages", fontweight="bold")
axes[1, 1].set_ylabel("Count")
axes[1, 1].tick_params(axis="x", rotation=45)

# Code length distribution (requires the code_length column added by
# analyze_code_content in the exploration cell).
axes[1, 2].hist(train_a["code_length"], bins=50, alpha=0.7, label="Training")
axes[1, 2].hist(val_a["code_length"],   bins=50, alpha=0.7, label="Validation")
axes[1, 2].set_title("Code Length Distribution", fontweight="bold")
axes[1, 2].set_xlabel("Code Length (characters)")
axes[1, 2].set_ylabel("Frequency")
axes[1, 2].legend()

plt.tight_layout()
plt.show()


In [None]:
# Exploration of Task B: overview, categorical columns, code size, generators.

print("\n\nTASK B: BASIC OVERVIEW")
train_b = explore_dataset(train_b, "Task B - Training")
val_b = explore_dataset(val_b, "Task B - Validation")
test_b = explore_dataset(test_b, "Task B - Test")

print("\nCOLUMN ANALYSIS")
# For each column, report the three splits in train / validation / test order.
for col in ["language", "generator", "domain"]:
    for frame, tag in (
        (train_b, "Task B - Training"),
        (val_b, "Task B - Validation"),
        (test_b, "Task B - Test"),
    ):
        analyze_column(frame, col, tag)

print("\nCODE CONTENT ANALYSIS")
# The returned frames carry code_length / line_count, which the plots below use.
train_b = analyze_code_content(train_b, "Task B - Training set")
val_b = analyze_code_content(val_b, "Task B - Validation set")
test_b = analyze_code_content(test_b, "Task B - Test set")

print("\nGENERATOR AND LABEL ANALYSIS")
# Only the training and validation splits are analysed here.
for frame, tag in (
    (train_b, "Task B - Training set"),
    (val_b, "Task B - Validation set"),
):
    analyze_generators_task_b(frame, tag)


In [None]:
# Visualization of Task B: label IDs, generators and code length.

plt.style.use("default")
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("SemEval Task 13 - Subtask B Data Analysis", fontsize=16, fontweight="bold")

# One named handle per panel, left to right.
label_ax, generator_ax, length_ax = axes

# Panel 1: multi-class label distribution, ordered by label ID.
label_counts_b = train_b["label"].value_counts().sort_index()
label_ax.bar(label_counts_b.index.astype(str), label_counts_b.values)
label_ax.set_title("Task B - Training label distribution", fontweight="bold")
label_ax.set_xlabel("Label ID")
label_ax.set_ylabel("Count")
label_ax.tick_params(axis="x", rotation=45)

# Panel 2: generator distribution; the panel is hidden when the column is absent.
if "generator" not in train_b.columns:
    generator_ax.axis("off")
else:
    gen_counts_b = train_b["generator"].value_counts()
    generator_ax.bar(gen_counts_b.index, gen_counts_b.values)
    generator_ax.set_title("Task B - Training generators", fontweight="bold")
    generator_ax.set_xlabel("Generator")
    generator_ax.set_ylabel("Count")
    generator_ax.tick_params(axis="x", rotation=45)

# Panel 3: code length distribution (requires the code_length column added
# by analyze_code_content in the exploration cell).
length_ax.hist(train_b["code_length"], bins=50)
length_ax.set_title("Task B - Code length distribution", fontweight="bold")
length_ax.set_xlabel("Code Length (characters)")
length_ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()
