In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data_path = '../data/raw/'
try:
    df_train = pd.read_csv(f'{data_path}train_v2_drcat_02.csv')
    print("train_essays.csv data loaded successfully.")
    print(f"Total essays: {len(df_train)}")
    print("\nFirst 5 rows:")
    print(df_train.head())
except FileNotFoundError:
    print("ERROR: train_essays.csv not found in data/raw/")
    print("Please make sure you have downloaded the data and placed it in the correct folder.")

train_essays.csv data loaded successfully.
Total essays: 44868

First 5 rows:
                                                text  label  \
0  Phones\n\nModern humans today are always on th...      0   
1  This essay will explain if drivers should or s...      0   
2  Driving while the use of cellular devices\n\nT...      0   
3  Phones & Driving\n\nDrivers should not be able...      0   
4  Cell Phone Operation While Driving\n\nThe abil...      0   

          prompt_name           source  RDizzl3_seven  
0  Phones and driving  persuade_corpus          False  
1  Phones and driving  persuade_corpus          False  
2  Phones and driving  persuade_corpus          False  
3  Phones and driving  persuade_corpus          False  
4  Phones and driving  persuade_corpus          False  


In [3]:
def normalize_whitespace(text):
    """
    Function to normalize whitespace.
    Replaces multiple whitespace chars with a single space and strips leading and trailing whilespace.
    """
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def remove_near_duplicates(df, column_name='text_cleaned'):
    """
    Removes essays that are exact duplicates based on the specified column.
    """
    initial_count = len(df)
    # keep first
    df = df.drop_duplicates(subset=[column_name], keep='first')
    final_count = len(df)
    print(f"Removed {initial_count - final_count} exact duplicate essays.")
    return df

In [None]:
print("DATA CLEANING")
print("Whitespace Normalization")
df_train['text_cleaned'] = df_train['text'].apply(normalize_whitespace)
print("Remove Dupe")
df_train = remove_near_duplicates(df_train, column_name='text_cleaned')
if 'label' in df_train.columns:
    df_train = df_train.rename(columns={'label': 'generated'})

print("\nCleaned DF head:")
print(df_train[['text_cleaned', 'generated']].head())

DATA CLEANING
Whitespace Normalization
Remove Dupe
Removed 0 exact duplicate essays.

Cleaned DF head:
                                        text_cleaned  generated
0  Phones Modern humans today are always on their...          0
1  This essay will explain if drivers should or s...          0
2  Driving while the use of cellular devices Toda...          0
3  Phones & Driving Drivers should not be able to...          0
4  Cell Phone Operation While Driving The ability...          0


In [9]:
print("Data Statistics (for Progress Report)")

# print out class distribution
class_counts = df_train['generated'].value_counts()
print("\nClass Distribution (0 = Human, 1 = LLM):")
print(class_counts)

# get lengths
df_train['word_count'] = df_train['text_cleaned'].apply(lambda x: len(x.split()))
print("\nEssay Length Statistics (in words):")
print(df_train['word_count'].describe())

# example
print("\nExample of a Cleaned Training Sample")
try:
    human_example = df_train[df_train['generated'] == 0].iloc[0]
    print(f"\n[Human Example (label = 0)]")
    print(human_example['text_cleaned'][:500] + "...")

    llm_example = df_train[df_train['generated'] == 1].iloc[0]
    print(f"\n[LLM Example (label = 1)]")
    print(llm_example['text_cleaned'][:500] + "...")
except IndexError:
    print("\nError: Could not find at least one example for both classes.")

Data Statistics (for Progress Report)

Class Distribution (0 = Human, 1 = LLM):
generated
0    27365
1    17497
Name: count, dtype: int64

Essay Length Statistics (in words):
count    44862.000000
mean       383.611966
std        164.934406
min          4.000000
25%        274.000000
50%        352.000000
75%        451.000000
max       1656.000000
Name: word_count, dtype: float64

Example of a Cleaned Training Sample

[Human Example (label = 0)]
Phones Modern humans today are always on their phone. They are always on their phone more than 5 hours a day no stop .All they do is text back and forward and just have group Chats on social media. They even do it while driving. They are some really bad consequences when stuff happens when it comes to a phone. Some certain areas in the United States ban phones from class rooms just because of it. When people have phones, they know about certain apps that they have .Apps like Facebook Twitter Ins...

[LLM Example (label = 1)]
In recent years, t

In [11]:
from sklearn.model_selection import train_test_split

In [17]:
# 400 token windowing (might need change)
print("400-Token Windowing")
W_SIZE = 400
# overlap by 50%
STRIDE = 200

windowed_data = []
for _, row in df_train.iterrows():
    text = row['text_cleaned']
    label = row['generated']
    # tokenize by whitespace
    tokens = text.split()
    # use text as it is if smaller than window size
    if len(tokens) <= W_SIZE:
        windowed_data.append({
            'text_window': text,
            'generated': label
        })
    else:
        for i in range(0, len(tokens) - W_SIZE + 1, STRIDE):
            window_tokens = tokens[i : i + W_SIZE]
            window_text = " ".join(window_tokens)
            windowed_data.append({
                'text_window': window_text,
                'generated': label
            })

df_windows = pd.DataFrame(windowed_data)
print(f"Original essays: {len(df_train)}")
print(f"Total 400-token windows: {len(df_windows)}")
print(f"New class distribution (windows):\n{df_windows['generated'].value_counts()}")

400-Token Windowing
Original essays: 44862
Total 400-token windows: 50996
New class distribution (windows):
generated
0    33269
1    17727
Name: count, dtype: int64


In [20]:
# Train/Validation Split
print("Split into train and validation sets")

# I will use 80 / 20 split (stratify for similar class balance)
df_train_windows, df_val_windows = train_test_split(
    df_windows,
    test_size=0.2,
    random_state=42,
    stratify=df_windows['generated']
)

print(f"total training windows: {len(df_train_windows)}")
print(f"total validation windows: {len(df_val_windows)}")

# save the result
processed_path = '../data/processed'
df_train_windows.to_csv(f'{processed_path}/train_windows.csv', index=False)
df_val_windows.to_csv(f'{processed_path}/val_windows.csv', index=False)
print(f"Saved to {processed_path}")

Split into train and validation sets
total training windows: 40796
total validation windows: 10200
Saved to ../data/processed
