In [1]:
# from transformers import DistilBertTokenizerFast, DistilBertModel
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import evaluate
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
from typing import List
from abc import abstractmethod

np.random.seed(42)

In [2]:
input_data = pd.read_csv('HC3.csv')
input_data = input_data.dropna()
# Shuffle the DataFrame
input_data = input_data.sample(frac=0.2) 
input_data

Unnamed: 0,question,source,labels,answers
4388,Suggest treatment for colorectal cancerMy Fath...,medicine,0,"Hi, dearI have gone through your question. I c..."
17007,"Evolutionarily speaking , why did the first li...",reddit_eli5,0,It did n't decide to propagate . It somehow mu...
19805,what are the facial differences between men an...,reddit_eli5,0,Sexual Dimorphism - Some species of animals ex...
9975,Why does India do so badly in the Olympics des...,reddit_eli5,0,The Indian gov't does n't invest the same sort...
13756,How and why does this work ? ( X - post from /...,reddit_eli5,1,The rods and cones in your eye are light-sensi...
...,...,...,...,...
22985,how do linguist find dead language pronunciati...,reddit_eli5,0,* they extrapolate from similar languages that...
8817,How internet speeds work . Why are some faster...,reddit_eli5,1,Sure! Let's start with how the internet works....
231,Expecting to move in five years; how to lock m...,finance,1,It is generally not possible to lock in a mort...
8240,Marijuana 's actual effect on driving Have n't...,reddit_eli5,1,Marijuana is a drug that is often used for rec...


In [3]:
input_data = input_data.rename(columns={'answers': 'text', 'labels': 'label'})
input_data = input_data.loc[:, ['text', 'label']]
input_data = input_data.reset_index(drop=True)
input_data

Unnamed: 0,text,label
0,"Hi, dearI have gone through your question. I c...",0
1,It did n't decide to propagate . It somehow mu...,0
2,Sexual Dimorphism - Some species of animals ex...,0
3,The Indian gov't does n't invest the same sort...,0
4,The rods and cones in your eye are light-sensi...,1
...,...,...
4768,* they extrapolate from similar languages that...,0
4769,Sure! Let's start with how the internet works....,1
4770,It is generally not possible to lock in a mort...,1
4771,Marijuana is a drug that is often used for rec...,1


In [4]:
train_data, test_data = train_test_split(input_data, test_size=0.2, random_state=42)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3818
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 955
    })
})


In [5]:
train_data

Unnamed: 0,text,label
0,Mickie James is a professional wrestler who ha...,1
1,Well for starters dopamine is a neurotransmitt...,0
2,"Its not hate , its a strong dislike of their r...",0
3,"As an audio engineer , I 've broken this down ...",0
4,What the fuck are you snorting water for ?,0
...,...,...
3813,"Whole life insurance , in addition to having t...",0
3814,"Brandon Sanderson , fantasy writer , offers a ...",0
3815,Under the water is some kind of ground . If yo...,0
3816,Have you ever heard a really loud or high-pitc...,1


In [6]:
def truncate(example):
    # here we cut into a shorter text, since large text tends to be gpt-made
    """Truncate text to the first 50 words."""
    return {
        'text': " ".join(example['text'].split())[:50],
        'label': example['label']
    }

small_dataset_dict = DatasetDict({
    'train': dataset_dict['train'].map(truncate),
    'val': dataset_dict['test'].map(truncate),
})

Map:   0%|          | 0/3818 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

In [7]:
id2label = {
    0: "Human", 
    1: "Chatgpt"
}

In [8]:
def print_encoding(model_inputs, indent=4):
    indent_str = " " * indent
    print("{")
    for k, v in model_inputs.items():
        print(indent_str + k + ":")
        print(indent_str + indent_str + str(v))
    print("}")

In [9]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

tokenized_dataset = small_dataset_dict.map(tokenize, batched=True, batch_size=16)

# Remove the 'text' column and rename 'label' to 'labels'
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

# print the frist 3 processed samples
print_encoding(tokenized_dataset['train'][:3])

Map:   0%|          | 0/3818 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

{
    labels:
        tensor([1, 0, 0])
    input_ids:
        tensor([[  101, 10872,  2666,  2508,  2003,  1037,  2658, 10706,  2040,  2038,
         24185,   102,     0,     0,     0,     0,     0],
        [  101,  2092,  2005, 29400,  2079,  4502, 11233,  2003,  1037, 11265,
         10976,  6494,  3619, 22930,  3334,  1010,   102],
        [  101,  2049,  2025,  5223,  1010,  2049,  1037,  2844, 18959,  1997,
          2037, 10958, 22987,   102,     0,     0,     0]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
}


In [10]:
train_dl = DataLoader(tokenized_dataset['train'], batch_size=8)
val_dl = DataLoader(tokenized_dataset['val'], batch_size=8)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
num_epochs = 5
num_training_steps = num_epochs * len(train_dl)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

lr_scheduler= get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    
    batch_no = 0
    for batch_no, batch in enumerate(train_dl, 1):
        if batch_no % 100 == 0:
            print(f'Epoch {epoch} Batch {batch_no}')
        # run in device
        batch = {k: v.to(device) for k, v in batch.items()}
        t_output = model(**batch)
        optimizer.zero_grad()
        loss = t_output.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
    torch.save(model, f"bert_model_full_{epoch}.pt")
    t_metric = evaluate.load("accuracy")
    
    model.eval()
    with torch.no_grad():
        for batch in train_dl:
            batch = {k: v.to(device) for k, v in batch.items()}
            t_outputs = model(**batch)
            t_logits = t_outputs.logits
            t_predictions = torch.argmax(t_logits, dim=-1)
            t_metric.add_batch(predictions=t_predictions, references=batch["labels"])
    train_acc = [float(i) for i in t_metric.compute().values()]
    
    # validation
    metric = evaluate.load("accuracy")            
                
    model.eval()
    with torch.no_grad():
        for batch in val_dl:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
    validation_acc = [float(i) for i in metric.compute().values()]
    
    # print the training process
    print(f'Epoch {epoch + 1}: train acc = {train_acc[0]}, validation acc = {validation_acc[0]}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 Batch 100
Epoch 0 Batch 200
Epoch 0 Batch 300
Epoch 0 Batch 400
Epoch 1: train acc = 0.8881613410162389, validation acc = 0.8345549738219895
Epoch 1 Batch 100
Epoch 1 Batch 200
Epoch 1 Batch 300
Epoch 1 Batch 400
Epoch 2: train acc = 0.9565217391304348, validation acc = 0.8575916230366493
Epoch 2 Batch 100
Epoch 2 Batch 200
Epoch 2 Batch 300
Epoch 2 Batch 400
Epoch 3: train acc = 0.9580932425353589, validation acc = 0.8408376963350785
Epoch 3 Batch 100
Epoch 3 Batch 200
Epoch 3 Batch 300
Epoch 3 Batch 400
Epoch 4: train acc = 0.9863803038239917, validation acc = 0.8617801047120419
Epoch 4 Batch 100
Epoch 4 Batch 200
Epoch 4 Batch 300
Epoch 4 Batch 400
Epoch 5: train acc = 0.9895233106338397, validation acc = 0.86282722513089


In [11]:
# ! jupyter nbconvert --to html DistilBert_2025.ipynb

In [12]:
chatgpt_genreated_news = [
    "It is not appropriate to generalize and make assumptions about what kind of people will not succeed in life."
]
prediction_label = []

# test finetuned model on chatgpt_genreated_news
tokenizer = AutoTokenizer.from_pretrained(model_name)
tkn_chatgpt_genreated_news = tokenizer(
    chatgpt_genreated_news, 
    truncation=True, 
    padding=True, 
    return_tensors="pt"
).to(device)

with torch.no_grad():
    outputs = model(**tkn_chatgpt_genreated_news)
    prediction = torch.argmax(outputs.logits, dim=1)

prediction_label = [id2label[i.item()] for i in prediction]

print(prediction_label)

['Chatgpt']


In [13]:
for epoch in range(num_epochs):
    model.eval()
    metric = evaluate.load("accuracy")
    
    with torch.no_grad():
        pred_list=[]
        actual_list=[]
        for batch in val_dl:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            pred_list.append(predictions.cpu().numpy())
            actual_list.append(batch["labels"].cpu().numpy())
            metric.add_batch(predictions=predictions, references=batch["labels"])

    validation_acc = [float(i) for i in metric.compute().values()]

In [14]:
p_lst = [item for sublist in pred_list for item in sublist]
a_lst = [item for sublist in actual_list for item in sublist]

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(a_lst, p_lst)
precision, recall, f_score, _ = precision_recall_fscore_support(
    a_lst, p_lst, average="macro")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-score: {f_score:.4f}")

Accuracy: 0.8628
Precision: 0.8701
Recall: 0.8660
F-score: 0.8627


In [16]:
############################### extra test #################################

test_data = pd.read_csv('test_set.csv',encoding = 'unicode_escape')
test_data = test_data.rename(columns={'answers': 'text', 'labels': 'label'})
test_data = test_data.loc[:, ['text', 'label']]

In [17]:
testset_dict = DatasetDict({
    'test': test_data
})
test_dl = DataLoader(testset_dict['test'], batch_size=16)

In [18]:
test_pred_list=[]
with torch.no_grad():
    for i in test_data['text']:
        tkn_chatgpt_genreated_news = tokenizer(i, truncation=True, padding=True, return_tensors="pt")
        tkn_chatgpt_genreated_news = {k: v.to(device) for k, v in tkn_chatgpt_genreated_news.items()}
        outputs = model(**tkn_chatgpt_genreated_news)
        prediction = torch.argmax(outputs.logits, dim=1)
        test_pred_list.append(prediction.cpu().numpy())

# convert prediction and actual to list
test_data_pred = [item for sublist in test_pred_list for item in sublist]

test_data['pred'] = test_data_pred
test_data

Unnamed: 0,text,label,pred
0,"As an AI language model, I cannot provide fina...",1,1
1,IMO Tesla is overvalued. Not a bad company but...,0,0
2,"First, it's important to understand that finan...",1,1
3,Trade the wheel as this is a fully covered str...,0,0
4,"Hello. No.\n\nThe closest to options are ""Warr...",0,0
5,Options trading involves buying or selling con...,1,1
6,No traffic. My life didnt change at all during...,0,0
7,While lockdowns were implemented to slow the s...,1,1
8,What meds have they tried? Sounds like they sh...,0,0
9,Winter asthma cough can be triggered by a vari...,1,1


In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(test_data['label'], test_data_pred)
precision, recall, f_score, _ = precision_recall_fscore_support(
    test_data['label'], test_data_pred, average="macro")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-score: {f_score:.4f}")

Accuracy: 0.9250
Precision: 0.9348
Recall: 0.9250
F-score: 0.9246


In [20]:
wrong_idx = [i for i in range(40) if (test_data['label'] != test_data['pred']).tolist()[i]]

for idx in wrong_idx:
    print(f'[Index {idx}]')
    print(test_data.iloc[idx]["text"])
    print()

[Index 22]
A derivative is a financial term often used to refer to a general asset class; however, the actual value derives from the underlying assets. If you are considering diversifying your portfolio by trading derivatives, itâs a good idea to get a thorough understanding beforehand, as higher risk and more complex processes are involved. This guide will explain how they function, the most common derivative contract types, and the benefits and risks of trading derivatives.

[Index 26]
People who lack the capacity to be honest with themselves.

[Index 33]
Mayonnaise on food like burgers, sandwiches and sometimes fries.



In [1]:
! jupyter nbconvert --to html DistilBert_2025.ipynb

[NbConvertApp] Converting notebook DistilBert_2025.ipynb to html
[NbConvertApp] Writing 650413 bytes to DistilBert_2025.html
