In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("Hello-SimpleAI/HC3", name='all')
dataset = dataset['train'].to_pandas()

In [5]:
len(dataset)

24322

In [6]:
num_prompts = 1000

In [7]:
reduced_df = dataset.head(num_prompts).drop(columns=['source'])

human_df = reduced_df[['human_answers']].rename(columns={'human_answers':'text'})
human_df['prompt_id'] = reduced_df["id"]
human_df['text'] = human_df['text'].apply(lambda x: ''.join(x))
human_df['label'] = 0

llm_df = reduced_df[['chatgpt_answers']].rename(columns={'chatgpt_answers':'text'})
llm_df['prompt_id'] = reduced_df["id"]
llm_df['text'] = llm_df['text'].apply(lambda x: ''.join(x))
llm_df['label'] = 1

full_df = pd.concat((human_df, llm_df), axis=0, ignore_index=True)
full_df = full_df[['prompt_id', 'text', 'label']]

In [8]:
X = full_df['text']
y = full_df['label']

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [23]:
lg = LogisticRegression(penalty='l1',solver='liblinear')
sv = SVC(kernel='sigmoid',gamma=1.0)
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
knn = KNeighborsClassifier()
rfc = RandomForestClassifier(n_estimators=50,random_state=2)
etc = ExtraTreesClassifier(n_estimators=50,random_state=2)
abc = AdaBoostClassifier(n_estimators=50,random_state=2)
bg = BaggingClassifier(n_estimators=50,random_state=2)
gbc = GradientBoostingClassifier(n_estimators=50,random_state=2)

In [16]:
def prediction(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    pr = model.predict(X_test)
    acc_score = metrics.accuracy_score(y_test, pr)
    f1 = metrics.f1_score(y_test, pr)
    return acc_score, f1

acc_score = {}
f1_score = {}
clfs = {
    'LR': lg,
    'SVM': sv,
    'DTC': dtc,
    'KNN': knn,
    'RFC': rfc,
    'ETC': etc,
    'ABC': abc,
    'BG': bg,
    'GBC': gbc,
}
for name, clf in clfs.items():
    acc_score[name], f1_score[name] = prediction(clf, X_train_tfidf, X_test_tfidf, y_train, y_test)

# View those scores
for name, acc in acc_score.items():
    print(f'Accuracy for {name}: {acc}')

for name, f1 in f1_score.items():
    print(f'F1 score for {name}: {f1}')


Accuracy for LR: 0.935
Accuracy for SVM: 0.945
Accuracy for DTC: 0.8175
Accuracy for KNN: 0.655
Accuracy for RFC: 0.9275
Accuracy for ETC: 0.9475
Accuracy for ABC: 0.91
Accuracy for BG: 0.93
Accuracy for GBC: 0.9225
F1 score for LR: 0.934010152284264
F1 score for SVM: 0.9438775510204082
F1 score for DTC: 0.8103896103896104
F1 score for KNN: 0.649746192893401
F1 score for RFC: 0.9265822784810127
F1 score for ETC: 0.9465648854961832
F1 score for ABC: 0.9095477386934674
F1 score for BG: 0.9285714285714286
F1 score for GBC: 0.9223057644110275


In [24]:

mnb_pred_acc, mnb_pred_f1 = prediction(mnb, X_train_tfidf, X_test_tfidf, y_train, y_test)

print(mnb_pred_acc, mnb_pred_f1)

0.875 0.861878453038674


In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define scoring metric
scorer = make_scorer(f1_score, average='weighted')

# Define hyperparameter grids for each model
param_grids = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    },
    'MultinomialNB': {
        'alpha': [0.1, 0.5, 1, 5]
    },
    'DecisionTreeClassifier': {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5, 10]
    },
    'GradientBoostingClassifier': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 10],  # Number of neighbors
        'weights': ['uniform', 'distance'],  # Weighting scheme
        'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metric
    },
    'ExtraTreesClassifier' : {
        'n_estimators': [50, 100, 200],  # Number of trees
        'max_features': ['sqrt', 'log2', None],  # Features to consider for splitting
        'max_depth': [5, 10, None],  # Depth of the tree
        'min_samples_split': [2, 5, 10],  # Minimum samples for a split
        'min_samples_leaf': [1, 2, 4]  # Minimum samples in a leaf node
    },
    'ADABoost' : {
        'n_estimators': [50, 100, 200],  # Number of boosting stages
        'learning_rate': [0.01, 0.1, 1, 2],  # Learning rate
        'algorithm': ['SAMME', 'SAMME.R']  # Boosting algorithm
    },
    'Bagging' : {
        'n_estimators': [10, 50, 100],  # Number of base estimators
        'max_samples': [0.5, 0.7, 1.0],  # Fraction of samples for each estimator
        'max_features': [0.5, 0.7, 1.0],  # Fraction of features for each estimator
        'bootstrap': [True, False],  # Sampling with replacement
        'bootstrap_features': [True, False]  # Sampling features with replacement
}

}

In [21]:
# Instantiate models
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'MultinomialNB': MultinomialNB(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'ADABoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier()
}

In [22]:
# Perform GridSearch for each model
best_models = {}
for model_name, model in models.items():
    print(f"Running GridSearch for {model_name}...")
    grid = GridSearchCV(model, param_grids[model_name], scoring=scorer, cv=5, n_jobs=-1)
    grid.fit(X_train_tfidf, y_train)
    best_models[model_name] = grid.best_params_
    print(f"Best parameters for {model_name}: {grid.best_params_}")
    print(f"Best F1 Score: {grid.best_score_}")

Running GridSearch for LogisticRegression...




Best parameters for LogisticRegression: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Best F1 Score: 0.9468714411363408
Running GridSearch for SVC...
Best parameters for SVC: {'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}
Best F1 Score: 0.9406235655754139
Running GridSearch for MultinomialNB...
Best parameters for MultinomialNB: {'alpha': 0.5}
Best F1 Score: 0.8674568498684898
Running GridSearch for DecisionTreeClassifier...
Best parameters for DecisionTreeClassifier: {'max_depth': None, 'min_samples_split': 10}
Best F1 Score: 0.8511802447913281
Running GridSearch for RandomForestClassifier...
Best parameters for RandomForestClassifier: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best F1 Score: 0.961243673685852
Running GridSearch for GradientBoostingClassifier...
Best parameters for GradientBoostingClassifier: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best F1 Score: 0.9562424408729229
Running GridSearch for KNN...
Best parameters for KNN: {'metri

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/liamoreilly/Desktop/CornellTech/AI-detection/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/liamoreilly/Desktop/CornellTech/AI-detection/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/liamoreilly/Desktop/CornellTech/AI-detection/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/liamoreil

Best parameters for ADABoost: {'algorithm': 'SAMME', 'learning_rate': 1, 'n_estimators': 200}
Best F1 Score: 0.9562404248935195
Running GridSearch for Bagging...
Best parameters for Bagging: {'bootstrap': False, 'bootstrap_features': True, 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
Best F1 Score: 0.956874310265956
