### Import Libraries and Dataset

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("Hello-SimpleAI/HC3", name='all')
dataset = dataset['train'].to_pandas()

In [4]:
dataset.head()

Unnamed: 0,id,question,human_answers,chatgpt_answers,source
0,0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,reddit_eli5
1,1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,reddit_eli5
2,2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,reddit_eli5
3,3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,reddit_eli5
4,4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,reddit_eli5


In [5]:
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24322 entries, 0 to 24321
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               24322 non-null  object
 1   question         24322 non-null  object
 2   human_answers    24322 non-null  object
 3   chatgpt_answers  24322 non-null  object
 4   source           24322 non-null  object
dtypes: object(5)
memory usage: 950.2+ KB


### Preprocessing
Notes/Steps:
1. Create df with columns Id, Prompt, Essay, Generated(0,1)
2. Use 1000 essays at first, 500 prompts x 2 essays/prompt (one human, one generated)
3. Split 80/10/10 Train/Test/Validation

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [7]:
reduced_df = dataset.head(500).drop(columns=['source'])

In [8]:
human_df = reduced_df[['human_answers']].rename(columns={'human_answers':'text'})
human_df['prompt_id'] = reduced_df["id"]
human_df['text'] = human_df['text'].apply(lambda x: ''.join(x))
human_df['label'] = 0

llm_df = reduced_df[['chatgpt_answers']].rename(columns={'chatgpt_answers':'text'})
llm_df['prompt_id'] = reduced_df["id"]
llm_df['text'] = llm_df['text'].apply(lambda x: ''.join(x))
llm_df['label'] = 1

full_df = pd.concat((human_df, llm_df), axis=0, ignore_index=True)
full_df = full_df[['prompt_id', 'text', 'label']]

In [9]:
full_df

Unnamed: 0,prompt_id,text,label
0,0,"Basically there are many categories of "" Best ...",0
1,1,salt is good for not dying in car crashes and ...,0
2,2,The way it works is that old TV stations got a...,0
3,3,You ca n't just go around assassinating the le...,0
4,4,Wanting to kill the shit out of Germans drives...,0
...,...,...,...
995,495,"When people look at a map of the Earth, they u...",1
996,496,Sigmund Freud was a psychologist who developed...,1
997,497,"In the United States, the Americans with Disab...",1
998,498,Genetic makeup can make a person more or less ...,1


In [10]:
train_df, temp_df = train_test_split(full_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

### Feature Extraction

In [12]:
vectorizer = TfidfVectorizer(max_features=10000)

In [14]:
X_train = vectorizer.fit_transform(train_df["text"])
X_valid = vectorizer.transform(val_df["text"])
X_test = vectorizer.transform(test_df["text"])

y_train = train_df["label"]
y_valid = val_df["label"]
y_test = test_df["label"]

### Model: Logistic Regression

In [16]:
C_list = [0.01, 0.1, 1, 10, 100]

best_acc = 0
best_C = 0

for C in C_list:
    model = LogisticRegression(C=C)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    
    acc = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {acc}, C: {C}")

    if acc > best_acc:
        best_acc = acc
        best_C = C

print(f"Best accuracy: {best_acc}, Best C: {best_C}")

Accuracy: 0.61, C: 0.01
Accuracy: 0.86, C: 0.1
Accuracy: 0.92, C: 1
Accuracy: 0.91, C: 10
Accuracy: 0.92, C: 100
Best accuracy: 0.92, Best C: 1


In [18]:
final_model = LogisticRegression(C=best_C)
final_model.fit(X_train, y_train)

In [19]:
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")

cm = confusion_matrix(y_test, y_pred)
print(cm)

Accuracy: 0.92
[[49  5]
 [ 3 43]]


### Model: BernoulliNB

In [109]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

M = [i for i in range(1, 21)]
best_acc, best_m = 0, 0

for m in M:
    vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True, min_df=m)
    X_train = vectorizer.fit_transform(train_df['text'])
    X_valid = vectorizer.transform(val_df['text'])
    X_test = vectorizer.transform(test_df["text"])
    
    y_train = train_df["label"]
    y_valid = val_df["label"]
    y_test = test_df["label"]
    
    model = BernoulliNB()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {accuracy}, M: {m}")

    if accuracy > best_acc:
        best_acc = accuracy
        best_m = m

print(f'Best M: {best_m}')



Accuracy: 0.64, M: 1
Accuracy: 0.83, M: 2
Accuracy: 0.89, M: 3
Accuracy: 0.89, M: 4
Accuracy: 0.89, M: 5
Accuracy: 0.89, M: 6
Accuracy: 0.9, M: 7
Accuracy: 0.9, M: 8
Accuracy: 0.89, M: 9
Accuracy: 0.88, M: 10
Accuracy: 0.88, M: 11
Accuracy: 0.87, M: 12
Accuracy: 0.9, M: 13
Accuracy: 0.88, M: 14
Accuracy: 0.9, M: 15
Accuracy: 0.88, M: 16
Accuracy: 0.89, M: 17
Accuracy: 0.88, M: 18
Accuracy: 0.86, M: 19
Accuracy: 0.87, M: 20
Best M: 7


In [110]:
final_vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True, min_df=m)
X_train = final_vectorizer.fit_transform(train_df['text'])
X_test = final_vectorizer.transform(test_df["text"])
y_train = train_df["label"]
y_test = test_df["label"]

model = BernoulliNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.96
