In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
data_path = '../data/raw/'
try:
    df_train = pd.read_csv(f'{data_path}test_set.csv')
    if 'generated' in df_train.columns:
        df_train['generated'] = df_train['generated'].astype(int)
    print("test_set.csv data loaded successfully.")
    print(f"Total essays: {len(df_train)}")
    print("\nFirst 5 rows:")
    print(df_train.head())
except FileNotFoundError:
    print("ERROR: train_essays.csv not found in data/raw/")
    print("Please make sure you have downloaded the data and placed it in the correct folder.")

test_set.csv data loaded successfully.
Total essays: 446345

First 5 rows:
                                                text  generated
0  Car-free cities have become a subject of incre...          1
1  Car Free Cities  Car-free cities, a concept ga...          1
2    A Sustainable Urban Future  Car-free cities ...          1
3    Pioneering Sustainable Urban Living  In an e...          1
4    The Path to Sustainable Urban Living  In an ...          1


In [5]:
def normalize_whitespace(text):
    """
    Function to normalize whitespace.
    Replaces multiple whitespace chars with a single space and strips leading and trailing whilespace.
    """
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def remove_near_duplicates(df, column_name='text_cleaned'):
    """
    Removes essays that are exact duplicates based on the specified column.
    """
    initial_count = len(df)
    # keep first
    df = df.drop_duplicates(subset=[column_name], keep='first')
    final_count = len(df)
    print(f"Removed {initial_count - final_count} exact duplicate essays.")
    return df

In [6]:
print("DATA CLEANING")
print("Whitespace Normalization")
df_train['text_cleaned'] = df_train['text'].apply(normalize_whitespace)
print("Remove Dupe")
df_train = remove_near_duplicates(df_train, column_name='text_cleaned')
if 'label' in df_train.columns:
    df_train = df_train.rename(columns={'label': 'generated'})

print("\nCleaned DF head:")
print(df_train[['text_cleaned', 'generated']].head())

DATA CLEANING
Whitespace Normalization
Remove Dupe
Removed 11830 exact duplicate essays.

Cleaned DF head:
                                        text_cleaned  generated
0  Car-free cities have become a subject of incre...          1
1  Car Free Cities Car-free cities, a concept gai...          1
2  A Sustainable Urban Future Car-free cities are...          1
3  Pioneering Sustainable Urban Living In an era ...          1
4  The Path to Sustainable Urban Living In an age...          1


In [7]:
print("Data Statistics (for Progress Report)")

# print out class distribution
class_counts = df_train['generated'].value_counts()
print("\nClass Distribution (0 = Human, 1 = LLM):")
print(class_counts)

# get lengths
df_train['word_count'] = df_train['text_cleaned'].apply(lambda x: len(x.split()))
print("\nEssay Length Statistics (in words):")
print(df_train['word_count'].describe())

# example
print("\nExample of a Cleaned Training Sample")
try:
    human_example = df_train[df_train['generated'] == 0].iloc[0]
    print(f"\n[Human Example (label = 0)]")
    print(human_example['text_cleaned'][:500] + "...")

    llm_example = df_train[df_train['generated'] == 1].iloc[0]
    print(f"\n[LLM Example (label = 1)]")
    print(llm_example['text_cleaned'][:500] + "...")
except IndexError:
    print("\nError: Could not find at least one example for both classes.")

Data Statistics (for Progress Report)

Class Distribution (0 = Human, 1 = LLM):
generated
0    266658
1    167857
Name: count, dtype: int64

Essay Length Statistics (in words):
count    434515.000000
mean        393.084796
std         168.716844
min           0.000000
25%         278.000000
50%         362.000000
75%         471.000000
max        1668.000000
Name: word_count, dtype: float64

Example of a Cleaned Training Sample

[Human Example (label = 0)]
Phones Modern humans today are always on their phone. They are always on their phone more than 5 hours a day no stop .All they do is text back and forward and just have group Chats on social media. They even do it while driving. They are some really bad consequences when stuff happens when it comes to a phone. Some certain areas in the United States ban phones from class rooms just because of it. When people have phones, they know about certain apps that they have .Apps like Facebook Twitter Ins...

[LLM Example (label = 1)]
Car-free

In [12]:
from sklearn.model_selection import train_test_split

print("400-Token Windowing (essay-level split first)")

W_SIZE = 400
STRIDE = 200

def make_windows(df_in):
    windowed_data = []
    for _, row in df_in.iterrows():
        text = row['text_cleaned']
        label = row['generated']

        tokens = text.split()

        # use text as it is if smaller than window size
        if len(tokens) <= W_SIZE:
            windowed_data.append({
                'text_window': text,
                'generated': label
            })
        else:
            for i in range(0, len(tokens) - W_SIZE + 1, STRIDE):
                window_tokens = tokens[i : i + W_SIZE]
                window_text = " ".join(window_tokens)
                windowed_data.append({
                    'text_window': window_text,
                    'generated': label
                })

    return pd.DataFrame(windowed_data)

df_train_essays, df_val_essays = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    stratify=df_train['generated']
)

print(f"Train essays: {len(df_train_essays)}")
print(f"Val essays:   {len(df_val_essays)}")

print("\nCreating windows for TRAIN essays...")
df_train_windows = make_windows(df_train_essays)
print(f"Train windows: {len(df_train_windows)}")
print("Train class distribution (windows):")
print(df_train_windows['generated'].value_counts())

print("\nCreating windows for VAL essays...")
df_val_windows = make_windows(df_val_essays)
print(f"Val windows: {len(df_val_windows)}")
print("Val class distribution (windows):")
print(df_val_windows['generated'].value_counts())

400-Token Windowing (essay-level split first)
Train essays: 347612
Val essays:   86903

Creating windows for TRAIN essays...
Train windows: 398938
Train class distribution (windows):
generated
0    259366
1    139572
Name: count, dtype: int64

Creating windows for VAL essays...
Val windows: 100057
Val class distribution (windows):
generated
0    65101
1    34956
Name: count, dtype: int64


In [13]:
# save the result
processed_path = '../data/processed'
df_train_windows.to_csv(f'{processed_path}/new_train_windows.csv', index=False)
df_val_windows.to_csv(f'{processed_path}/new_val_windows.csv', index=False)
print(f"Saved to {processed_path}")

Saved to ../data/processed
