In [1]:
# Mount Google Drive at /content/drive so the project files stored there can be
# accessed from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
# Use an absolute path so this cell is idempotent: the previous relative path
# ('drive/MyDrive/...') only resolved when the kernel's cwd was still /content,
# so re-running the cell after the first chdir raised FileNotFoundError.
DATA_DIR = '/content/drive/MyDrive/CIS 5300 - Final Project/Milestone 2/Data'
os.chdir(DATA_DIR)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content/drive/MyDrive/CIS 5300 - Final Project/Milestone 2/Data


In [3]:
# List the files in the current working directory (the data folder set above).
!ls

bert_test_predictions.csv	 svm_test_predictions.csv
dev_data.csv			 test_data.csv
dev_data_with_features.csv	 test_data_with_features.csv
evaluate.py			 test_labels.txt
lstm_test_predictions.csv	 train_data.csv
results				 train_data_head.csv
simple_baseline_predictions.txt  train_data_with_features.csv
svm_human_vs_ai.joblib


In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _load_split(csv_path):
    """Read one feature CSV and discard its ``content_category`` column."""
    return pd.read_csv(csv_path).drop(columns=['content_category'])


# Load the three pre-computed feature tables.
train_df = _load_split('train_data_with_features.csv')
dev_df = _load_split('dev_data_with_features.csv')
test_df = _load_split('test_data_with_features.csv')

# Report the size of each split.
for _split_name, _split_df in (("Train", train_df), ("Dev", dev_df), ("Test", test_df)):
    print(f"{_split_name} data shape:", _split_df.shape)

# Peek at the training rows and list the column names so the target column
# can be identified.
print("\nTrain DataFrame head:")
display(train_df.head())
print("\nTrain DataFrame columns:")
print(list(train_df.columns))

Train data shape: (389788, 16)
Dev data shape: (48723, 16)
Test data shape: (48724, 16)

Train DataFrame head:


Unnamed: 0,text,generated,text_length,flesch_reading_ease,automated_readability_index,num_words,num_unique_words,avg_word_length,num_sentences,avg_words_per_sentence,punctuation_count,flesch_kincaid_grade,type_token_ratio,stop_word_ratio,encoded_content_category,ml_content_category
0,"I think that FACS is very useful technology, t...",0,1171,73.798579,7.431909,231.0,118.0,4.538835,15.0,15.4,25.0,6.37765,0.572816,0.543689,3,3
1,Should students create their own summer projec...,0,3032,63.568636,11.473788,582.0,170.0,4.49631,24.0,24.25,39.0,10.003109,0.313653,0.512915,3,3
2,"As an average 8thgrade student, I have develop...",1,2123,56.503538,10.680003,422.0,147.0,4.465608,18.0,23.444444,42.0,10.622807,0.388889,0.555556,1,1
3,Holy Avocados! A new computer software has jus...,0,3023,51.379326,13.48161,603.0,223.0,4.617308,22.0,27.409091,82.0,12.29181,0.428846,0.503846,3,3
4,Title: A Cowboy Who Rode the Waves\n\nOnce upo...,1,2167,69.273559,8.871601,433.0,189.0,4.452442,22.0,19.681818,43.0,7.989714,0.485861,0.473008,0,0



Train DataFrame columns:
['text', 'generated', 'text_length', 'flesch_reading_ease', 'automated_readability_index', 'num_words', 'num_unique_words', 'avg_word_length', 'num_sentences', 'avg_words_per_sentence', 'punctuation_count', 'flesch_kincaid_grade', 'type_token_ratio', 'stop_word_ratio', 'encoded_content_category', 'ml_content_category']


In [5]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [6]:
# Column layout (see the column list printed above): 'text' is the raw essay,
# 'generated' is the target label, and every remaining column is a feature.
# Select by name rather than by position so a reordered or extended CSV cannot
# silently shift the label or leak it into the features; Index.drop raises
# KeyError if either expected column is missing. Column order is preserved.
label_col    = 'generated'
feature_cols = train_df.columns.drop(['text', label_col])

X_train = train_df[feature_cols].values
y_train = train_df[label_col].values

# Dev and test are indexed with the same column names, so they must expose the
# same feature columns as the training frame.
X_dev   = dev_df[feature_cols].values
y_dev   = dev_df[label_col].values

X_test = test_df[feature_cols].values
y_test = test_df[label_col].values

In [7]:
# Sanity check: feature-matrix and label-vector shapes for every split.
for _tag, _features, _labels in (
    ("Train:", X_train, y_train),
    ("Dev:", X_dev, y_dev),
    ("Test:", X_test, y_test),
):
    print(_tag, _features.shape, _labels.shape)


Train: (389788, 14) (389788,)
Dev: (48723, 14) (48723,)
Test: (48724, 14) (48724,)


In [8]:
# RBF-kernel support vector classifier; verbose=True makes the solver log
# progress, and random_state is fixed at 42.
rbf_svm = SVC(kernel="rbf", C=1.0, gamma="scale", probability=False, verbose=True, random_state=42)

# Two-step pipeline: a StandardScaler step followed by the SVM step.
svm_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", rbf_svm),
])


In [9]:
import os

import joblib

MODEL_PATH = "svm_human_vs_ai.joblib"

# Cache guard: reuse the saved model when it exists; otherwise fit the pipeline
# defined above and persist it, so the notebook still runs end-to-end on a
# fresh checkout instead of failing on a missing artifact.
if os.path.exists(MODEL_PATH):
    # NOTE(security): joblib.load is pickle-based and can execute arbitrary
    # code while loading; only load model files from a trusted source.
    print("Loading saved model from svm_human_vs_ai.joblib...")
    svm_clf = joblib.load(MODEL_PATH)
else:
    print(f"{MODEL_PATH} not found; training the SVM pipeline (this may take a long time)...")
    svm_clf.fit(X_train, y_train)
    joblib.dump(svm_clf, MODEL_PATH)

# Evaluate on the development split.
y_dev_pred = svm_clf.predict(X_dev)

dev_acc = accuracy_score(y_dev, y_dev_pred)
print(f"Dev accuracy: {dev_acc:.4f}\n")

print("Classification report (dev):")
print(classification_report(y_dev, y_dev_pred, digits=4))

print("Confusion matrix (dev):")
print(confusion_matrix(y_dev, y_dev_pred))

Loading saved model from svm_human_vs_ai.joblib...
Dev accuracy: 0.8996

Classification report (dev):
              precision    recall  f1-score   support

           0     0.9029    0.9414    0.9217     30608
           1     0.8932    0.8289    0.8599     18115

    accuracy                         0.8996     48723
   macro avg     0.8981    0.8851    0.8908     48723
weighted avg     0.8993    0.8996    0.8987     48723

Confusion matrix (dev):
[[28813  1795]
 [ 3099 15016]]


In [10]:
import pandas as pd

# Predict labels for the test split with the classifier in `svm_clf`.
y_test_pred = svm_clf.predict(X_test)

# Per-essay results table: essay text, true label, predicted label, and a
# flag that is True where the two labels match.
results_df = pd.DataFrame({
    'essay': test_df['text'],
    'actual_generated': y_test,
    'predicted_generated': y_test_pred,
}).assign(
    is_correct=lambda frame: frame['actual_generated'] == frame['predicted_generated']
)

# Write the table to CSV without the index column.
output_filename = "svm_test_predictions.csv"
results_df.to_csv(output_filename, index=False)
print(f"Detailed predictions saved to {output_filename}")

# Preview the first rows of the saved table.
display(results_df.head())

Detailed predictions saved to svm_test_predictions.csv


Unnamed: 0,essay,actual_generated,predicted_generated,is_correct
0,In American society today and other countries ...,0,0,True
1,Most people were raised on fairness; what ever...,0,0,True
2,"""Carefree"" Many rivers around the world are re...",0,0,True
3,"Places around the world such as Germany, Ameri...",0,0,True
4,"Dear TEACHER_NAME,\n\nI think community servic...",0,0,True


In [11]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels on the test split, printed
# beneath its heading.
print("Confusion Matrix (Test Set):", confusion_matrix(y_test, y_test_pred), sep="\n")

Confusion Matrix (Test Set):
[[28657  1847]
 [ 3074 15146]]


In [12]:
# Overall accuracy of the test-split predictions.
test_acc = accuracy_score(y_test, y_test_pred)
print("Test accuracy:", test_acc)

Test accuracy: 0.8990025449470487
