In [1]:
import pandas as pd

# training data
train_path = "../data/raw/train_v2_drcat_02.csv"
df_train = pd.read_csv(train_path)

# test data
test_path = "../data/ood/ood_data.csv"
df_test_candidate = pd.read_csv(test_path)

print(f"Original Training Set Size: {len(df_train)}")
print(f"Sample Test Set Size: {len(df_test_candidate)}")


Original Training Set Size: 44868
Sample Test Set Size: 487235


In [2]:
# remove overlaps

clean_test_df = df_test_candidate[~df_test_candidate['text'].isin(df_train['text'])]

num_removed = len(df_test_candidate) - len(clean_test_df)

print(f"--- Data Analysis ---")
print(f"Overlapping essays removed: {num_removed}")
print(f"Clean, valid 'New Data' essays remaining: {len(clean_test_df)}")

if len(clean_test_df) > 0:
    print("SUCCESS: Enough unique data present.")
    clean_test_df.to_csv('../data/raw/test_set.csv', index=False)
    print("Saved clean test set to data/raw")
else:
    print("Error")

--- Data Analysis ---
Overlapping essays removed: 40890
Clean, valid 'New Data' essays remaining: 446345
SUCCESS: Enough unique data present.
Saved clean test set to data/raw


In [3]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [4]:
VOCAB_SIZE = 20000 + 1
EMBED_DIM = 100
NUM_FILTERS = 64
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 2
DROPOUT = 0.5
MAX_LEN = 400
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, output_dim, dropout):
        super(TextCNN, self).__init__()
        
        # embedding Layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
        # Conv Layers (parallel)
        self.convos = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_dim,
                out_channels=num_filters,
                kernel_size=kern_size
            ) for kern_size in filter_sizes
        ])
        
        # dropout and fc
        self.dropout = nn.Dropout(dropout)
        
        # input size num filters * len(filter_sizes)
        self.fc1 = nn.Linear(num_filters * len(filter_sizes), 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, text):
        # embedding Layer
        embedded = self.embedding(text)
        # reshape for conv [batch_size, embed_dim, max_len]
        embedded = embedded.permute(0, 2, 1)
        
        convos_res = [F.relu(conv(embedded)) for conv in self.convos]
        
        # global max pooling
        pool_res = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in convos_res]
        
        # concatenate results
        concat_res = torch.cat(pool_res, dim=1)
        
        # dropout and fc
        drop_res = self.dropout(concat_res)
        x1 = F.relu(self.fc1(drop_res))
        x2 = self.fc2(x1)
        
        return x2

Using device: cuda


In [5]:
data_path = '../data/raw/'

def normalize_whitespace(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

try:
    # load original training data
    df_train_orig = pd.read_csv(f'{data_path}train_v2_drcat_02.csv')
    
    # clean
    df_train_orig['text_cleaned'] = df_train_orig['text'].apply(normalize_whitespace)
    
    # fit tokenizer
    tokenizer = Tokenizer(num_words=VOCAB_SIZE - 1)
    tokenizer.fit_on_texts(df_train_orig['text_cleaned'])
    
    print(f"Tokenizer fitted. Vocab size: {len(tokenizer.word_index)}")
    
except FileNotFoundError:
    print("ERROR: Training data not found. Cannot rebuild tokenizer.")


Tokenizer fitted. Vocab size: 82222


In [6]:
print("--- Processing Test Set ---")
# load test set
df_test = pd.read_csv(f'{data_path}test_set.csv')

# clean
df_test['text_cleaned'] = df_test['text'].apply(normalize_whitespace)

# windowing
W_SIZE = 400
STRIDE = 200
test_windowed_data = []

for idx, row in df_test.iterrows():
    text = row['text_cleaned']
    label = row['generated'] if 'generated' in row else -1
    
    tokens = text.split()
    
    if len(tokens) <= W_SIZE:
        test_windowed_data.append({
            'essay_id': idx, 
            'text_window': text, 
            'generated': label
        })
    else:
        for i in range(0, len(tokens) - W_SIZE + 1, STRIDE):
            window_tokens = tokens[i : i + W_SIZE]
            window_text = " ".join(window_tokens)
            test_windowed_data.append({
                'essay_id': idx, 
                'text_window': window_text, 
                'generated': label
            })

df_test_windows = pd.DataFrame(test_windowed_data)
print(f"Created {len(df_test_windows)} windows from {len(df_test)} essays.")

# tokenize and pad
X_test_seq = tokenizer.texts_to_sequences(df_test_windows['text_window'])
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

X_test_tensor = torch.tensor(X_test_pad, dtype=torch.long)

--- Processing Test Set ---
Created 512472 windows from 446345 essays.


In [7]:
print("--- Running Prediction ---")

test_model = TextCNN(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT
)

try:
    test_model.load_state_dict(torch.load('best_cnn_model.pth', map_location=device))
    test_model.to(device)
    test_model.eval()
    print("Successfully loaded 'best_cnn_model.pth'")
except FileNotFoundError:
    print("Weight file not found...")

# inference
all_probs = []
with torch.no_grad():
    for i in range(0, len(X_test_tensor), BATCH_SIZE):
        batch = X_test_tensor[i : i + BATCH_SIZE].to(device)
        predictions = test_model(batch)
        
        # get probs
        probs = F.softmax(predictions, dim=1)[:, 1]
        all_probs.extend(probs.cpu().numpy())

df_test_windows['chunk_score'] = all_probs

# aggregate by essay
print("\n--- Final Aggregation ---")
final_results = df_test_windows.groupby('essay_id').agg({
    'chunk_score': 'mean',
    'generated': 'first'
}).reset_index()

# abstain setup
LOW_CONF = 0.20
HIGH_CONF = 0.80

def get_decision(score):
    if score < LOW_CONF: return 0
    elif score > HIGH_CONF: return 1
    else: return -1

final_results['prediction'] = final_results['chunk_score'].apply(get_decision)

# metrics
total = len(final_results)
abstained = len(final_results[final_results['prediction'] == -1])
decided = final_results[final_results['prediction'] != -1]

accuracy = 0
if len(decided) > 0:
    correct = len(decided[decided['prediction'] == decided['generated']])
    accuracy = correct / len(decided)

print(f"Total Essays: {total}")
print(f"Abstained: {abstained} ({abstained/total:.1%})")
print(f"Decided: {len(decided)}")
print(f"Accuracy (on decisions): {accuracy:.2%}")

print("\nSample Results:")
print(final_results[['chunk_score', 'prediction', 'generated']].head())

--- Running Prediction ---
Successfully loaded 'best_cnn_model.pth'

--- Final Aggregation ---
Total Essays: 446345
Abstained: 69387 (15.5%)
Decided: 376958
Accuracy (on decisions): 75.65%

Sample Results:
   chunk_score  prediction  generated
0     0.621636          -1        1.0
1     0.999188           1        1.0
2     0.949429           1        1.0
3     0.978913           1        1.0
4     0.952428           1        1.0


The 75.65% acccuracy on the new large test set is primarily due to the shift in distribution. The initial text-CNN model was trained on a relatively small DAIGT V2 dataset and seemed like it overfitted to dataset specific artifacts such as vocab, formats..etc.
Now, it requires broader training data for improvements. (I will use this large new dataset to re-train the model)