In [1]:
import pandas as pd
import numpy as np

# load data
# Read the pre-windowed train/validation splits and separate the text column
# (features) from the 'generated' column (target).
processed_path = '../data/processed'
try:
    df_train = pd.read_csv(f'{processed_path}/train_windows.csv')
    df_val = pd.read_csv(f'{processed_path}/val_windows.csv')
except FileNotFoundError:
    print("ERROR: Processed data not found.")
    print(f"Please make sure 'train_windows.csv' and 'val_windows.csv' are in {processed_path}")
    # Re-raise so execution stops here; otherwise the following cells fail
    # with a NameError on X_train / X_val that hides the real cause.
    raise

# separate features (X) and target (y)
X_train = df_train['text_window']
y_train = df_train['generated']

X_val = df_val['text_window']
y_val = df_val['generated']

print(f"Training windows: {len(X_train)}")
print(f"Validation windows: {len(X_val)}")

Training windows: 40796
Validation windows: 10200


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Turn each text window into a fixed-length sequence of integer word ids.

VOCAB_SIZE = 20000
MAX_LEN = 400

# The word -> id vocabulary is learned from the training split only, so the
# validation split never influences it.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

# Map both splits through the same fitted tokenizer.
X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val)
)

# Bring every sequence to exactly MAX_LEN ids; padding is appended at the end.
X_train_pad, X_val_pad = (
    pad_sequences(id_lists, maxlen=MAX_LEN, padding='post')
    for id_lists in (X_train_seq, X_val_seq)
)

print(f"Shape of X_train_pad: {X_train_pad.shape}")
print(f"Shape of X_val_pad: {X_val_pad.shape}")


Shape of X_train_pad: (40796, 400)
Shape of X_val_pad: (10200, 400)


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [4]:
# dataset/dataloader
class TextDataset(Dataset):
    """In-memory dataset pairing padded token-id sequences with class labels.

    Args:
        seqeunces: Array-like of integer token ids, one row per sample
            (e.g. the 2-D array returned by ``pad_sequences``). The
            misspelled parameter name is kept so existing callers that pass
            it by keyword keep working.
        labels: Array-like of integer class labels, one per sample. May be a
            pandas Series, a NumPy array, or a plain list.

    Raises:
        ValueError: If the number of sequences and labels differ.

    Indexing returns a ``(sequence, label)`` pair of ``torch.long`` tensors.
    """

    def __init__(self, seqeunces, labels):
        # np.asarray normalises lists, arrays and pandas objects alike, so the
        # labels no longer have to expose a pandas-only ``.values`` attribute
        # (and a Series with a non-default index is handled positionally).
        self.sequences = torch.tensor(np.asarray(seqeunces), dtype=torch.long)
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.long)

        # Fail fast on misaligned inputs instead of raising an IndexError (or
        # silently dropping samples) somewhere in the middle of an epoch.
        if len(self.sequences) != len(self.labels):
            raise ValueError(
                f"Got {len(self.sequences)} sequences but {len(self.labels)} labels"
            )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [5]:
BATCH_SIZE = 64

# Report the configured value rather than a hard-coded number, so the log
# stays truthful when BATCH_SIZE is changed.
print(f"Creating dataloaders with batch size of {BATCH_SIZE}")

train_dataset = TextDataset(X_train_pad, y_train)
val_dataset = TextDataset(X_val_pad, y_val)

# dataloaders
# Only the training loader is shuffled; validation keeps a fixed order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

print("\nDataLoaders Created")

Creating dataloaders with batch size of 64

DataLoaders Created


In [None]:
# Text-CNN model

class TextCNN(nn.Module):
    """Text classifier built from parallel 1-D convolutions over word embeddings.

    Token ids are embedded, run through one Conv1d branch per entry of
    ``filter_sizes``, max-pooled over the whole sequence axis, concatenated,
    and passed through dropout and two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_filters: Output channels of every convolution branch.
        filter_sizes: Kernel widths, one convolution branch per width.
        output_dim: Number of output scores per sample.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # One convolution per kernel width; all branches see the same input.
        branches = [nn.Conv1d(embed_dim, num_filters, width) for width in filter_sizes]
        self.convos = nn.ModuleList(branches)

        self.dropout = nn.Dropout(dropout)

        # Each branch contributes num_filters pooled values to the classifier input.
        pooled_width = num_filters * len(filter_sizes)
        self.fc1 = nn.Linear(pooled_width, 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, text):
        """Map token ids [batch_size, max_len] to raw scores [batch_size, output_dim]."""
        # [batch_size, max_len, embed_dim] -> [batch_size, embed_dim, max_len],
        # because Conv1d expects the channel axis before the length axis.
        channels_first = self.embedding(text).permute(0, 2, 1)

        pooled = []
        for branch in self.convos:
            # [batch_size, num_filters, max_len - kernel_size + 1]
            activated = F.relu(branch(channels_first))
            # Pooling window spans the full length -> [batch_size, num_filters]
            pooled.append(F.max_pool1d(activated, activated.shape[2]).squeeze(2))

        # [batch_size, num_filters * len(filter_sizes)]
        features = self.dropout(torch.cat(pooled, dim=1))

        hidden = F.relu(self.fc1(features))
        return self.fc2(hidden)

In [14]:
# Build the model, optimizer and loss, then place them on the compute device.

# --- hyperparameters ---
# The embedding table gets one row more than the tokenizer's word cap
# (id 0 is the padding token).
VOCAB_SIZE = tokenizer.num_words + 1
EMBED_DIM = 100
NUM_FILTERS = 64
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 2 # (Human=0, LLM=1)
DROPOUT = 0.5

# --- model, optimizer, loss ---
model = TextCNN(
    VOCAB_SIZE,
    EMBED_DIM,
    NUM_FILTERS,
    FILTER_SIZES,
    OUTPUT_DIM,
    DROPOUT,
)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# --- device placement: GPU when one is available, CPU otherwise ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

print(f"Model initialized and moved to: {device}")
print(model)

Model initialized and moved to: cuda
TextCNN(
  (embedding): Embedding(20001, 100)
  (convos): ModuleList(
    (0): Conv1d(100, 64, kernel_size=(3,), stride=(1,))
    (1): Conv1d(100, 64, kernel_size=(4,), stride=(1,))
    (2): Conv1d(100, 64, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=192, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=2, bias=True)
)


In [15]:
# training and evaluation

from torchmetrics.classification import BinaryAccuracy, BinaryAUROC

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Args:
        model: Module to optimise; it is switched to training mode.
        dataloader: Iterable yielding ``(texts, labels)`` tensor batches.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(predictions, labels)``. It is
            expected to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Per-sample mean training loss over the epoch.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for texts, labels in dataloader:
        texts = texts.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size: the last batch is usually
        # smaller, so a plain average of batch means would over-weight it.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError("dataloader yielded no samples; cannot compute an average loss")

    return loss_sum / n_samples

def evaluate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable yielding ``(texts, labels)`` tensor batches.
        criterion: Loss called as ``criterion(predictions, labels)``. It is
            expected to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches and metric objects are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy, auroc)`` where ``avg_loss`` is the
        per-sample mean loss as a float, and ``accuracy`` / ``auroc`` are the
        values returned by the torchmetrics ``compute()`` calls.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    print("Running evaluation...")
    model.eval()
    loss_sum = 0.0
    n_samples = 0

    # Fresh metric objects on every call, so nothing accumulates across epochs.
    acc_metric = BinaryAccuracy().to(device)
    auroc_metric = BinaryAUROC().to(device)

    with torch.no_grad():
        for texts, labels in dataloader:
            texts = texts.to(device)
            labels = labels.to(device)

            # forward pass
            predictions = model(texts)

            # Weight each batch mean by its size so a smaller final batch
            # does not skew the epoch average.
            loss = criterion(predictions, labels)
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            n_samples += batch_size

            # Probability of class 1 (LLM). BinaryAccuracy thresholds these
            # probabilities itself (0.5 unless configured otherwise).
            p1 = F.softmax(predictions, dim=1)[:, 1]

            # update the metrics with the batch results
            acc_metric.update(p1, labels)
            auroc_metric.update(p1, labels)

    if n_samples == 0:
        raise ValueError("dataloader yielded no samples; cannot compute evaluation metrics")

    avg_loss = loss_sum / n_samples
    # .compute() aggregates everything seen in update() across all batches
    accuracy = acc_metric.compute()
    auroc = auroc_metric.compute()

    return avg_loss, accuracy, auroc

In [16]:
# training loop
# Trains for up to N_EPOCHS, checkpoints the weights whenever validation AUROC
# improves, and stops early after PATIENCE consecutive epochs without improvement.

import time

N_EPOCHS = 10
best_val_auroc = 0.0
PATIENCE = 2
epochs_no_improve = 0

print(f"--- Starting Training for {N_EPOCHS} epochs ---")
print(f"Using device: {device}")

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc, val_auroc = evaluate_epoch(model, val_loader, criterion, device)

    # Convert the metric results to plain floats once. This keeps
    # best_val_auroc a float for the whole run, so the final summary also
    # works when no epoch ever beats the initial 0.0.
    val_acc = float(val_acc)
    val_auroc = float(val_auroc)

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    print(f"\nEpoch: {epoch+1:02} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s")
    print(f"\tTrain Loss: {train_loss:.3f}")
    print(f"\t Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc*100:.2f}% | Val. AUROC: {val_auroc:.4f}")

    # early stopping
    if val_auroc > best_val_auroc:
        best_val_auroc = val_auroc
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_cnn_model.pth')
        print(f"\t^ New best model saved with AUROC: {best_val_auroc:.4f}")
    else:
        epochs_no_improve += 1
        print(f"\tNo improvement. Patience: {epochs_no_improve}/{PATIENCE}")

    # >= rather than == so the stop still triggers if PATIENCE is set to 0
    # or the counter ever moves past the limit.
    if epochs_no_improve >= PATIENCE:
        print(f"\n--- Early stopping triggered after {epoch+1} epochs ---")
        break

print("\n--- Training Complete ---")
print(f"Best Validation AUROC achieved: {best_val_auroc:.4f}")

--- Starting Training for 10 epochs ---
Using device: cuda
Running evaluation...

Epoch: 01 | Time: 0m 8s
	Train Loss: 0.290
	 Val. Loss: 0.056 | Val. Acc: 98.50% | Val. AUROC: 0.9971
	^ New best model saved with AUROC: 0.9971
Running evaluation...

Epoch: 02 | Time: 0m 5s
	Train Loss: 0.100
	 Val. Loss: 0.034 | Val. Acc: 99.20% | Val. AUROC: 0.9980
	^ New best model saved with AUROC: 0.9980
Running evaluation...

Epoch: 03 | Time: 0m 5s
	Train Loss: 0.066
	 Val. Loss: 0.029 | Val. Acc: 99.25% | Val. AUROC: 0.9985
	^ New best model saved with AUROC: 0.9985
Running evaluation...

Epoch: 04 | Time: 0m 5s
	Train Loss: 0.048
	 Val. Loss: 0.023 | Val. Acc: 99.42% | Val. AUROC: 0.9989
	^ New best model saved with AUROC: 0.9989
Running evaluation...

Epoch: 05 | Time: 0m 5s
	Train Loss: 0.033
	 Val. Loss: 0.027 | Val. Acc: 99.33% | Val. AUROC: 0.9991
	^ New best model saved with AUROC: 0.9991
Running evaluation...

Epoch: 06 | Time: 0m 5s
	Train Loss: 0.025
	 Val. Loss: 0.022 | Val. Acc: 99.4

In [24]:
# Test
print("--- Test: Manual Qualitative Test ---")

# load model
# Re-initialize the model architecture. These values must match the ones used
# when the checkpoint was trained, otherwise load_state_dict() rejects the
# weights because of mismatched shapes.
VOCAB_SIZE = tokenizer.num_words + 1 
EMBED_DIM = 100
NUM_FILTERS = 64
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 2
DROPOUT = 0.5

# create a new instance
test_model = TextCNN(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT
)

# load the saved weights
# map_location=device remaps the stored tensors to the current device, so a
# checkpoint written during a GPU run can still be loaded on a CPU-only machine.
test_model.load_state_dict(torch.load('best_cnn_model.pth', map_location=device))
test_model.to(device)
# eval mode turns dropout off so predictions are deterministic
test_model.eval()
print("Successfully loaded 'best_cnn_model.pth'")

# examples
# Hand-picked passages for a qualitative sanity check of the trained model.
# Each comment records the true source of the passage: "my writing" is
# human-written text (class 0), "llm writing" is LLM-generated text (class 1).
my_examples = [
    # Example 1: my writing (human, class 0)
    "In this lab, I had to use a technique called chromatography. We used paper chromatography to separate metal ions. Generally, chromatography is a simple way of identifying an unknown mixture of chemicals or compounds, which are in liquid or gas form, by dissolving mixtures in a fluid called the mobile phase that carries through the stationary phase. The mobile phase can be a proper liquid solvent or mixture of solvents, while the stationary phase is a solid or liquid phase that is fixed in a place in the experiment. In our lab, we used a specific chromatography called paper chromatography. What I learned during this lab, in general, is that paper chromatography works based on capillary action. Capillary action is the tendency of liquid to rise in thin tubes or to be brought into small openings; It happens because there are adhesive forces between the molecules of the solvent. In paper chromatography, the liquid rises up through the paper, which is the stationary medium, since there are small holes in between the paper fibres. The main point of chromatography, however, is that it uses the difference in solubility of substances in a solvent. Solubility basically means how much of a particular substance can dissolve in a specific solvent, and the difference of it creates the various substances to leave solution at varying points as the solvent rises up the stationary phase. In this case, I noticed that the substance will travel more if it is more soluble. Also, absorption takes part in creating separation. Higher absorption to the stationary phase will slow down the molecule that moves through the column.",
    
    # Example 2: llm writing (class 1)
    "In this lab, we used a technique called paper chromatography to separate and identify metal ions. Chromatography is a method used to analyze mixtures of chemicals by separating them based on how they move through two phases: a mobile phase (a liquid or gas that moves) and a stationary phase (a solid or liquid fixed in place). In our experiment, the stationary phase was the paper, and the mobile phase was the solvent. Paper chromatography works mainly due to capillary action, which is the ability of a liquid to flow through narrow spaces without external force. This happens because of adhesive forces between the liquid molecules and the paper fibers. As the solvent rises up the paper, it carries the dissolved substances with it. The key principle of chromatography is that different substances have different solubilities in the solvent and different levels of absorption (or attraction) to the stationary phase. Substances that are more soluble in the solvent travel farther up the paper, while substances that are more strongly absorbed by the paper move more slowly. From this lab, I learned that paper chromatography separates substances based on their solubility and their interaction with the stationary phase.",
    
    # Example 3: my writing (human, class 0)
    "In this lab, I learned about how the system shifts to alleviate stress which allows the system to reach a new equilibrium. When we add a chemical species, the equilibrium system will remove the added species by shifting to the other side while the equilibrium system will replace the species by shifting if we remove some of the chemical species by the process of neutralizing, complex ion formation, or producing precipitate. This is due to a change in concentration; when concentration increases, the reaction rate increases the frequency of collision while it does opposite when concentration decreases. In the addition of a chemical species, ions considered as spectator ions did not affect the shift. Furthermore, temperature change would affect frequency of collision and fraction of successful collision which would affect the reaction rate. When a system is heated, the system will shift toward the endothermic direction in order to remove added energy while the system will shift toward the exothermic side to replace lost energy when the system is cooled. In general, the endothermic side is more sensitive to the temperature change due to its high activation energy. When we were determining the effects of the stress, we had to depend on our observations. We basically kept track of the colour changes or formation of precipitation to determine the shifts in the system. The colours became visible when we diluted the solution. In addition, I learned that kinetics can go along with Le Chatelier’s Principle. When the system is stressed, the forward and reverse reaction rates would both increase or decrease. However, they would have different magnitudes. For example, when we removed the species, the concentration of the reactants had a lower concentration so that the frequency of collision decreased; this decreased the forward rate more than the reverse rate so that it caused a left shift to occur. ",

    # Example 4: llm writing (class 1)
    "In this lab, I learned how a chemical system responds to stress in order to re-establish equilibrium, according to Le Chatelier's Principle. When a chemical species is added, the system shifts to the opposite side to reduce the increase. Conversely, when a species is removed—through neutralization, complex ion formation, or precipitation—the system shifts to replace what was removed. This happens because changes in concentration affect the reaction rate by altering the frequency of collisions between particles. Spectator ions, however, do not influence the direction of the shift. I also learned how temperature affects equilibrium. A temperature increase causes the system to shift toward the endothermic direction to absorb excess energy, while cooling shifts the equilibrium toward the exothermic direction to release energy. The endothermic side is generally more sensitive to temperature changes because it has a higher activation energy. To determine how the equilibrium shifted, we relied on observations of color changes or the formation of precipitates. Diluting the solution often made these color changes more visible. Additionally, I learned that reaction kinetics work alongside equilibrium: when a system is stressed, both the forward and reverse reaction rates change, but not equally. For example, removing a reactant decreases its concentration and collision frequency, reducing the forward rate more than the reverse rate. This results in a shift toward the left, or the reactant side.",

    # Example 5: llm writing (class 1)
    "Probability is a branch of mathematics that measures how likely an event is to happen. It provides a way to quantify uncertainty by assigning a number between 0 and 1 to events, where 0 means the event is impossible and 1 means it is certain. For example, when flipping a fair coin, the probability of getting heads is 0.5 because it is equally likely to land on heads or tails. Probability helps us make predictions, analyze patterns, and understand random processes in everyday life, science, and engineering.",

    # Example 6: my writing (human, class 0)
    "But what makes our map truly functional are its important functionalities and features that have been carefully integrated to make your navigation experience smoother. For instance, when users are using the search bar, the search bar provides 4 different autofill options so that users can simply select without typing the whole word on the search bar. Also, when we place the cursor over a specific button, it shows a small message or description about what it will show.",

    # Example 7: llm writing (class 1)
    "What makes our map truly effective are the carefully designed features that enhance the overall navigation experience. For example, the search bar offers four smart autofill suggestions, allowing users to quickly select a result without typing the entire word. Additionally, when users hover over a button, a small tooltip appears with a brief description of its function, making the interface more intuitive and user-friendly."
]

# Run every example through the same preprocessing used for training
# (tokenize -> pad -> tensor) and report the predicted class with its probability.
print("Predicting on custom sentences...")
for text in my_examples:
    # Reuse the fitted tokenizer and the same fixed length as the training data.
    token_ids = tokenizer.texts_to_sequences([text])
    fixed_len_ids = pad_sequences(token_ids, maxlen=MAX_LEN, padding='post')

    # Single-sample batch on the model's device.
    batch = torch.tensor(fixed_len_ids, dtype=torch.long).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = test_model(batch)
        class_probs = F.softmax(logits, dim=1)

    # Most likely class and the probability assigned to it, as a percentage.
    top_class = torch.argmax(class_probs, dim=1).item()
    confidence = class_probs[0][top_class].item() * 100

    if top_class == 1:
        label = "LLM (1)"
    else:
        label = "Human (0)"

    print(f"\nPrediction: {label} (Confidence: {confidence:.2f}%)")
    print(f"Text: '{text}'")

--- Test: Manual Qualitative Test ---
Successfully loaded 'best_cnn_model.pth'
Predicting on custom sentences...

Prediction: Human (0) (Confidence: 99.30%)
Text: 'In this lab, I had to use a technique called chromatography. We used paper chromatography to separate metal ions. Generally, chromatography is a simple way of identifying an unknown mixture of chemicals or compounds, which are in liquid or gas form, by dissolving mixtures in a fluid called the mobile phase that carries through the stationary phase. The mobile phase can be a proper liquid solvent or mixture of solvents, while the stationary phase is a solid or liquid phase that is fixed in a place in the experiment. In our lab, we used a specific chromatography called paper chromatography. What I learned during this lab, in general, is that paper chromatography works based on capillary action. Capillary action is the tendency of liquid to rise in thin tubes or to be brought into small openings; It happens because there are ad