In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [2]:
# Load the windowed train / validation splits produced by the preprocessing step.
processed_path = '../data/processed'
try:
    # Only the file reads sit inside the try so that nothing else is masked.
    df_train = pd.read_csv(f'{processed_path}/train_windows.csv')
    df_val = pd.read_csv(f'{processed_path}/val_windows.csv')
except FileNotFoundError:
    print("ERROR: Processed data not found.")
    print(f"Please make sure 'train_windows.csv' and 'val_windows.csv' are in {processed_path}")
    # Re-raise instead of continuing: every later cell needs X_train / X_val,
    # and swallowing the error here would only surface later as a confusing NameError.
    raise

# separate features (X) and target (y)
X_train = df_train['text_window']
y_train = df_train['generated']

X_val = df_val['text_window']
y_val = df_val['generated']

print(f"Training windows: {len(X_train)}")
print(f"Validation windows: {len(X_val)}")

Training windows: 40796
Validation windows: 10200


In [3]:
# TF-IDF turns raw text into a sparse numeric matrix the classifier can consume.
#   TF  (term frequency)             - how often a character n-gram occurs in a window
#   IDF (inverse document frequency) - how rare that n-gram is across all windows
#   TF-IDF = TF * IDF, so frequent-but-distinctive patterns get the largest weights.
# Features here are character n-grams of length 3 to 5, with the vocabulary size capped.
tfidf_params = {
    'analyzer': 'char',
    'ngram_range': (3, 5),
    'max_features': 100000,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary / IDF weights from the training split only, then encode it.
print("Fit vectorizer on training data...")
X_train_tfidf = vectorizer.fit_transform(X_train)

# Validation text is encoded with the already-fitted vectorizer (no re-fitting).
print("Transform validation data...")
X_val_tfidf = vectorizer.transform(X_val)

vocab_size = len(vectorizer.vocabulary_)
print(f"Vocab size: {vocab_size}")
print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")

Fit vectorizer on training data...
Transform validation data...
Transform validation data...
Vocab size: 100000
Shape of X_train_tfidf: (40796, 100000)
Vocab size: 100000
Shape of X_train_tfidf: (40796, 100000)


In [4]:
# Baseline classifier: logistic regression on the TF-IDF features.
# Hyperparameters are collected in one place; random_state is fixed for repeatable runs.
lr_params = {
    'C': 1.0,
    'solver': 'liblinear',
    'random_state': 42,
}
baseline_model = LogisticRegression(**lr_params)

print("Fitting Logistic Regression model...")
baseline_model.fit(X_train_tfidf, y_train)
print("complete")

Fitting Logistic Regression model...
complete
complete


In [5]:
print("\n--- Baseline Model Validation Results ---")

# Hard label predictions for the validation windows.
y_pred = baseline_model.predict(X_val_tfidf)

# Probability scores: keep column 1 of predict_proba, which is used as the
# positive-class score for AUROC below.
val_proba = baseline_model.predict_proba(X_val_tfidf)
y_pred_proba = val_proba[:, 1]

# Metric 1: accuracy on hard labels.
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Metric 2: AUROC on the probability scores (threshold-independent).
auroc = roc_auc_score(y_val, y_pred_proba)
print(f"AUROC: {auroc:.4f}")

# Per-class precision / recall / F1, with readable names for the two classes.
class_display_names = ['Human (0)', 'LLM (1)']
report = classification_report(y_val, y_pred, target_names=class_display_names)
print("\nClassification Report:")
print(report)


--- Baseline Model Validation Results ---
Accuracy: 0.9939
AUROC: 0.9988

Classification Report:
              precision    recall  f1-score   support

   Human (0)       0.99      1.00      1.00      6654
     LLM (1)       1.00      0.98      0.99      3546

    accuracy                           0.99     10200
   macro avg       0.99      0.99      0.99     10200
weighted avg       0.99      0.99      0.99     10200



In [6]:
import numpy as np

print("--- Top Model Features (Qualitative Test) ---")

# n-gram strings from the vectorizer; indexed below with the same positions
# as the coefficient vector.
feature_names = np.asarray(vectorizer.get_feature_names_out())

# First (only) row of the fitted coefficients.
model_coef = baseline_model.coef_[0]

# Rank the coefficients once (ascending) and take both ends:
# largest 20 -> push towards LLM (positive), smallest 20 -> push towards Human (negative).
coef_order = model_coef.argsort()
top_llm_indices = coef_order[::-1][:20]
top_human_indices = coef_order[:20]

print("\nTop 20 features predicting LLM (Positive)")
print(list(feature_names[top_llm_indices]))

print("\nTop 20 features predicting Human (Negative)")
print(list(feature_names[top_human_indices]))

--- Top Model Features (Qualitative Test) ---

Top 20 features predicting LLM (Positive)
[', i ', ', i', "t's ", "t's", 'like,', 'ike, ', 'ike,', " it's", 'ke, ', " it'", 'ke,', "it's ", "it's", ' and ', 'y, ', "it'", ' and', ' imp', 'and', "'s "]

Top 20 features predicting Human (Negative)
[' th', 'the', ' the', 'he ', 'the ', ' the ', ' ve', 'caus', 'cau', 'cause', 'aus', 'ause', 'eca', 'beca', 'becau', 'ecau', 'ecaus', 'use', ' beca', 'en ']


In [7]:
# Error analysis: look at validation windows the model got wrong.

# Attach the predicted labels to the validation frame (adds / overwrites 'prediction').
df_val['prediction'] = y_pred

# False negatives: actually LLM-generated (generated == 1) but predicted human (0).
is_llm = df_val['generated'] == 1
predicted_human = df_val['prediction'] == 0
false_negatives = df_val[is_llm & predicted_human]

print("--- Model Failures: LLM text predicted as Human (False Negatives) ---")
print(f"Total failures: {len(false_negatives)}")

# Show the first five misses, each labelled with its row index in df_val.
for row_label, window_text in false_negatives['text_window'].head(5).items():
    print(f"\nExample {row_label}")
    print(window_text)

--- Model Failures: LLM text predicted as Human (False Negatives) ---
Total failures: 54

Example 26
I agree with Emerson that if you try to do something different of what you are doing, you will grow. However, if you cap doing that same thing, you will never grow. For example, in life, you do different things to know more about that world, you experiment with different things to know things that other people don't know, to become a strong person. Doing different things, you will grow. For example, when you become a father and a man, you change your experiment things that you never imagined how that works. Like some people, they still act like kids because they talk to young men like their age is 17 and 19, this you men they still don't know how to act like men, when their age is 40, that's how you will never grow. Because a man changes to a man is when he is 25, this is that age that you are changing things, and that's the way people get old and they still act like kids. Other people 

In [23]:
# Manual spot check on hand-supplied passages.
# The list alternates: examples 1 and 3 are marked as the author's own writing,
# examples 2 and 4 as LLM writing. Each LLM passage covers the same topic as the
# passage before it (1/2: paper chromatography, 3/4: equilibrium shifts under stress).
print("\n--- Manual Qualitative Test (My examples) ---")

my_examples = [
    # Example 1: my writing (human-written; topic: paper chromatography lab)
    "In this lab, I had to use a technique called chromatography. We used paper chromatography to separate metal ions. Generally, chromatography is a simple way of identifying an unknown mixture of chemicals or compounds, which are in liquid or gas form, by dissolving mixtures in a fluid called the mobile phase that carries through the stationary phase. The mobile phase can be a proper liquid solvent or mixture of solvents, while the stationary phase is a solid or liquid phase that is fixed in a place in the experiment. In our lab, we used a specific chromatography called paper chromatography. What I learned during this lab, in general, is that paper chromatography works based on capillary action. Capillary action is the tendency of liquid to rise in thin tubes or to be brought into small openings; It happens because there are adhesive forces between the molecules of the solvent. In paper chromatography, the liquid rises up through the paper, which is the stationary medium, since there are small holes in between the paper fibres. The main point of chromatography, however, is that it uses the difference in solubility of substances in a solvent. Solubility basically means how much of a particular substance can dissolve in a specific solvent, and the difference of it creates the various substances to leave solution at varying points as the solvent rises up the stationary phase. In this case, I noticed that the substance will travel more if it is more soluble. Also, absorption takes part in creating separation. Higher absorption to the stationary phase will slow down the molecule that moves through the column.",
    
    # Example 2: llm writing (LLM version of the same chromatography topic)
    "In this lab, we used a technique called paper chromatography to separate and identify metal ions. Chromatography is a method used to analyze mixtures of chemicals by separating them based on how they move through two phases: a mobile phase (a liquid or gas that moves) and a stationary phase (a solid or liquid fixed in place). In our experiment, the stationary phase was the paper, and the mobile phase was the solvent. Paper chromatography works mainly due to capillary action, which is the ability of a liquid to flow through narrow spaces without external force. This happens because of adhesive forces between the liquid molecules and the paper fibers. As the solvent rises up the paper, it carries the dissolved substances with it. The key principle of chromatography is that different substances have different solubilities in the solvent and different levels of absorption (or attraction) to the stationary phase. Substances that are more soluble in the solvent travel farther up the paper, while substances that are more strongly absorbed by the paper move more slowly. From this lab, I learned that paper chromatography separates substances based on their solubility and their interaction with the stationary phase.",
    
    # Example 3: my writing (human-written; topic: equilibrium shifts under stress)
    "In this lab, I learned about how the system shifts to alleviate stress which allows the system to reach a new equilibrium. When we add a chemical species, the equilibrium system will remove the added species by shifting to the other side while the equilibrium system will replace the species by shifting if we remove some of the chemical species by the process of neutralizing, complex ion formation, or producing precipitate. This is due to a change in concentration; when concentration increases, the reaction rate increases the frequency of collision while it does opposite when concentration decreases. In the addition of a chemical species, ions considered as spectator ions did not affect the shift. Furthermore, temperature change would affect frequency of collision and fraction of successful collision which would affect the reaction rate. When a system is heated, the system will shift toward the endothermic direction in order to remove added energy while the system will shift toward the exothermic side to replace lost energy when the system is cooled. In general, the endothermic side is more sensitive to the temperature change due to its high activation energy. When we were determining the effects of the stress, we had to depend on our observations. We basically kept track of the colour changes or formation of precipitation to determine the shifts in the system. The colours became visible when we diluted the solution. In addition, I learned that kinetics can go along with Le Chatelier’s Principle. When the system is stressed, the forward and reverse reaction rates would both increase or decrease. However, they would have different magnitudes. For example, when we removed the species, the concentration of the reactants had a lower concentration so that the frequency of collision decreased; this decreased the forward rate more than the reverse rate so that it caused a left shift to occur. ",

    # Example 4: llm writing (LLM version of the same equilibrium topic)
    "In this lab, I learned how a chemical system responds to stress in order to re-establish equilibrium, according to Le Chatelier's Principle. When a chemical species is added, the system shifts to the opposite side to reduce the increase. Conversely, when a species is removed—through neutralization, complex ion formation, or precipitation—the system shifts to replace what was removed. This happens because changes in concentration affect the reaction rate by altering the frequency of collisions between particles. Spectator ions, however, do not influence the direction of the shift. I also learned how temperature affects equilibrium. A temperature increase causes the system to shift toward the endothermic direction to absorb excess energy, while cooling shifts the equilibrium toward the exothermic direction to release energy. The endothermic side is generally more sensitive to temperature changes because it has a higher activation energy. To determine how the equilibrium shifted, we relied on observations of color changes or the formation of precipitates. Diluting the solution often made these color changes more visible. Additionally, I learned that reaction kinetics work alongside equilibrium: when a system is stressed, both the forward and reverse reaction rates change, but not equally. For example, removing a reactant decreases its concentration and collision frequency, reducing the forward rate more than the reverse rate. This results in a shift toward the left, or the reactant side."
]

print(f"Testing {len(my_examples)} custom examples...\n")

# 1. Encode the examples with the *fitted* vectorizer (transform only, no re-fit).
my_examples_tfidf = vectorizer.transform(my_examples)

# 2. Hard label predictions from the *fitted* model.
my_preds = baseline_model.predict(my_examples_tfidf)

# 3. Per-class probabilities (one row per example), used for the confidence score.
my_probs = baseline_model.predict_proba(my_examples_tfidf)

# predict_proba columns are ordered like baseline_model.classes_, so look the
# column up by label instead of assuming the label value equals the column index.
proba_column = {label: col for col, label in enumerate(baseline_model.classes_)}

# 4. Show each prediction with the probability assigned to the predicted class.
for text, pred, prob in zip(my_examples, my_preds, my_probs):
    label = "LLM (1)" if pred == 1 else "Human (0)"
    confidence = prob[proba_column[pred]] * 100

    print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")
    print(f"Text: '{text}'\n")


--- Manual Qualitative Test (My examples) ---
Testing 4 custom examples...

Prediction: Human (0) (Confidence: 57.59%)
Text: 'In this lab, I had to use a technique called chromatography. We used paper chromatography to separate metal ions. Generally, chromatography is a simple way of identifying an unknown mixture of chemicals or compounds, which are in liquid or gas form, by dissolving mixtures in a fluid called the mobile phase that carries through the stationary phase. The mobile phase can be a proper liquid solvent or mixture of solvents, while the stationary phase is a solid or liquid phase that is fixed in a place in the experiment. In our lab, we used a specific chromatography called paper chromatography. What I learned during this lab, in general, is that paper chromatography works based on capillary action. Capillary action is the tendency of liquid to rise in thin tubes or to be brought into small openings; It happens because there are adhesive forces between the molecules o

The TF-IDF baseline reached 99.39% accuracy (AUROC 0.9988) on the validation set. However, qualitative analysis suggests that this high score comes from overfitting to simple, dataset-specific character n-grams such as 'cause' (human) and 'like,' (LLM) rather than from robust signals of authorship. On new text from outside the dataset the model generalizes poorly, incorrectly classifying new, human-written passages as LLM-generated. This indicates that a simple lexical model is insufficient and motivates the proposed Text-CNN, which is intended to learn deeper, more robust semantic and structural patterns.

In [8]:
# Persist the fitted vectorizer together with the trained model in one file,
# so both can be reloaded for inference without re-fitting.
from joblib import dump
import os

artifact_path = '../baseline/baseline_tfidf_lr.joblib'

# Create the target directory if it is missing (no error when it already exists).
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

artifacts = {'vectorizer': vectorizer, 'model': baseline_model}
dump(artifacts, artifact_path)
print(f"Saved TF-IDF vectorizer and LogisticRegression to {artifact_path}")

Saved TF-IDF vectorizer and LogisticRegression to ../baseline/baseline_tfidf_lr.joblib
