In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score
import os
from google.colab import files

In [None]:

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

import pandas as pd
import numpy as np
import joblib
# Data is expected to have been downloaded already by 01_data_overview_task_ab.ipynb.
ROOT = Path("/content/semeval_task13")

# Handle an extra "SemEval-2026-Task13" folder layer if it is present under ROOT.
if (ROOT / "SemEval-2026-Task13").exists():
    BASE_DIR = ROOT / "SemEval-2026-Task13"
else:
    BASE_DIR = ROOT

TASK_A_DIR = BASE_DIR / "task_a"

print("BASE_DIR:", BASE_DIR)
print("TASK_A_DIR:", TASK_A_DIR)

# Fail early with an actionable message instead of letting iterdir() raise a
# bare error when the data has not been downloaded yet.
if not TASK_A_DIR.exists():
    raise FileNotFoundError(
        f"Task A data directory not found: {TASK_A_DIR}. "
        "Run 01_data_overview_task_ab.ipynb first to download the data."
    )

print("Files in task_a:")
# Sorted so the listing is the same on every run (iterdir() order is arbitrary).
for p in sorted(TASK_A_DIR.iterdir()):
    print(" -", p.name)


In [None]:
# Load the Subtask A splits (train / validation / test sample) from TASK_A_DIR.
train_path = TASK_A_DIR.joinpath("task_a_training_set_1.parquet")
val_path = TASK_A_DIR.joinpath("task_a_validation_set.parquet")
test_path = TASK_A_DIR.joinpath("task_a_test_set_sample.parquet")

train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)
test_df = pd.read_parquet(test_path)

# Report (rows, columns) for every split, then the training column names.
for caption, frame in (
    ("Train shape:", train_df),
    ("Val shape  :", val_df),
    ("Test shape :", test_df),
):
    print(caption, frame.shape)
print("\nColumns:", list(train_df.columns))

# Last expression of the cell: preview the first training rows.
train_df.head()


In [None]:
#basic preprocessing(whitespace normalization)
def clean_code(text):
    """Normalize the whitespace of a single code snippet.

    Args:
        text: The snippet to clean, or a missing value.

    Returns:
        An empty string when ``pd.isna(text)`` is true; otherwise ``text`` with
        every run of whitespace collapsed to one space and leading/trailing
        whitespace removed.
    """
    if not pd.isna(text):
        # str.split() with no argument splits on any whitespace run and drops
        # empty pieces, so re-joining with a single space normalizes spacing.
        return ' '.join(text.split())
    return ""

# Add a whitespace-normalized 'code_clean' column to each split, derived from 'code'.
for split_df in (train_df, val_df, test_df):
    split_df['code_clean'] = split_df['code'].apply(clean_code)

print("Basic preprocessing completed")

In [None]:
# Separate each split into model inputs (cleaned code) and targets (label).
X_train, y_train = train_df['code_clean'], train_df['label']
X_val, y_val = val_df['code_clean'], val_df['label']

# The test frame may have no 'label' column; y_test is None in that case.
X_test = test_df['code_clean']
if 'label' in test_df.columns:
    y_test = test_df['label']
else:
    y_test = None

# Report how many samples ended up in each split.
print(f"Training: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")

In [None]:
# TF-IDF + Logistic Regression baseline, assembled from two named steps.

# Vectorizer: a token is either a whole word or one of the single characters
# { } ( ) ; , = + - * /  (see token_pattern). Uses unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r'\b\w+\b|\{|\}|\(|\)|;|,|=|\+|\-|\*|/',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    max_features=20000,
)

# Classifier: fixed seed for repeatable fits, class-balanced weights.
lr_classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

tfidf_lr_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', lr_classifier),
])

print("Training TF-IDF + Logistic Regression model")
tfidf_lr_pipeline.fit(X_train, y_train)
print("Model trained!")

In [None]:
# Score the fitted pipeline on the validation split.
val_predictions = tfidf_lr_pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions, average='macro')

# Count predictions per class; the report below labels 0 as Human and 1 as Machine.
n_human = (val_predictions == 0).sum()
n_machine = (val_predictions == 1).sum()

print("VALIDATION RESULTS:")
print(f"Macro F1: {val_f1:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Predictions - Human: {n_human}, Machine: {n_machine}")

In [None]:
# Load the sample submission; if it is not on disk, ask for it through the
# Colab upload dialog and use the first uploaded file.
sample_path = "sample_submission.csv"
if not os.path.exists(sample_path):
    print("Please upload sample_submission.csv")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("sample_submission.csv not provided.")
    # First key of the upload result is taken as the file name to read.
    sample_path = next(iter(uploaded.keys()))

sample_submission = pd.read_csv(sample_path)
print(f"Loaded sample submission: {sample_path}")
print(f"Columns: {sample_submission.columns.tolist()}")
print(f"Shape: {sample_submission.shape}")

In [None]:
# Create the submission dataframe from test predictions and write it to disk.
test_predictions = tfidf_lr_pipeline.predict(X_test)

# Predictions are written into the sample submission positionally, so both must
# have the same number of rows. Check this up front to give a clear message
# instead of the generic pandas length-mismatch error.
# NOTE(review): this also assumes the sample submission rows are in the same
# order as test_df — confirm against the task's ID column.
if len(test_predictions) != len(sample_submission):
    raise ValueError(
        f"Prediction count ({len(test_predictions)}) does not match the number "
        f"of rows in the sample submission ({len(sample_submission)}); make sure "
        "the sample submission corresponds to the loaded test set."
    )

submission_df = sample_submission.copy()
submission_df['label'] = test_predictions

# Output file holding this baseline's predictions (not the provided sample file).
submission_path = "submission_tfidf_baseline.csv"
submission_df.to_csv(submission_path, index=False)

print(f"Submission file created: {submission_path}")
print("Submission statistics:")
print(f"Total predictions: {len(submission_df)}")
print(f"Human (0): {(test_predictions == 0).sum()}")
print(f"Machine (1): {(test_predictions == 1).sum()}")

In [None]:
# Download the submission file through the Colab files helper.
files.download(submission_path)
# Report the actual path rather than a hard-coded file name, so the message
# stays correct if submission_path is changed.
print(f"Submission file downloaded: {submission_path}")
