In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset,Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Load the human-vs-AI text dataset from the Hugging Face Hub via `datasets.load_dataset`.
ds = load_dataset("ahmadreza13/human-vs-Ai-generated-dataset")

In [None]:
# Peek at the first record of the 'train' split to see the available fields.
ds['train'][0]

In [None]:
# Convert the 'train' split to a pandas DataFrame for exploration and filtering.
df = ds['train'].to_pandas()

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Summarize columns, dtypes and non-null counts.
df.info()

In [None]:
# Drop the 'model' column, which is not used by the classifier.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='model', errors='ignore')

In [None]:
# Class balance: number of rows per value of the 'generated' column.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=df, x="generated", ax=ax)

In [None]:
# from tqdm import tqdm
# MaxLength = 0
# for item in tqdm(ds['train']):
#     if len(item['data']) > MaxLength :
#         MaxLength = len(item['data'])
# MaxLength

In [None]:
# Row count before the length filter applied in the next cell.
len(df)

In [None]:
# Rename first so the cell is idempotent: `rename` ignores keys that are
# already gone, whereas filtering on 'data' after a previous run would raise.
df = df.rename(columns={"data": "text", "generated": "label"})
# Keep only texts of at most 512 characters (note: characters, not tokens;
# an earlier experiment used 2000). `.copy()` detaches the result from the
# original frame so later assignments do not hit chained-assignment issues.
df = df[df["text"].str.len() <= 512].copy()

In [None]:
def compute_metrics(pred):
    """Compute binary-classification metrics for a Trainer evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index layout of the sklearn result: (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [None]:
# Tokenizer and a 2-class sequence-classification head, both loaded from the
# same pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2, problem_type="single_label_classification"
)
# Collator that pads every batch with the "max_length" strategy.
data_collator = DataCollatorWithPadding(padding="max_length", tokenizer=tokenizer)

In [None]:
def chunked_data_generator(df, num_splits=32):
    """Lazily yield ``num_splits`` label-stratified chunks of ``df``.

    The frame is shuffled once (seed 42). Every chunk except the last holds
    ``len(df) // num_splits`` rows, carved off the remaining pool with a
    split stratified on the ``'label'`` column (seeded with the chunk's
    position). The last chunk is whatever is left in the pool.

    Args:
        df: DataFrame with a ``'label'`` column.
        num_splits: Number of chunks to produce.

    Yields:
        pandas.DataFrame: one chunk per iteration.
    """
    chunk_rows = len(df) // num_splits
    pool = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # All chunks but the last are split off the shrinking pool.
    for seed in range(num_splits - 1):
        chunk, pool = train_test_split(
            pool,
            train_size=chunk_rows,
            stratify=pool['label'],
            random_state=seed,
        )
        yield chunk

    # The final chunk is the leftover pool (nothing is yielded for num_splits < 1).
    if num_splits > 0:
        yield pool

In [None]:
def preprocess_function_df(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    Inputs are truncated and padded to exactly 512 tokens and the encoding is
    returned as PyTorch tensors.
    """
    encode_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

In [None]:
# Directory used both for Trainer checkpoints and for the final saved model.
model_save_path = '/kaggle/working/model'
# Training configuration: evaluation and checkpointing both run every 500 steps,
# and the checkpoint with the best 'accuracy' is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=model_save_path,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    logging_steps=100,
    # NOTE(review): newer transformers releases appear to name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    # Must match a key returned by compute_metrics.
    metric_for_best_model='accuracy',
    push_to_hub=False,  
    )

In [None]:
# Hold out 10% of the rows for evaluation. The label column has to be passed
# as `stratify=`; handing it over positionally after a keyword argument is a
# SyntaxError, so the original cell could not run.
Train_data, Test_data = train_test_split(
    df,
    test_size=0.1,
    stratify=df['label'],
    random_state=49,
)

In [None]:
# Build the tokenized evaluation set here: `Test_Dataset` was referenced but
# never defined anywhere in the notebook, so a fresh kernel hit a NameError.
# batched=True lets the tokenizer return one row of tensors per example.
Test_Dataset = Dataset.from_pandas(Test_data, preserve_index=False).map(
    lambda batch: preprocess_function_df(batch["text"]),
    batched=True,
    remove_columns=["text"],
)

# The training set is attached later, one chunk at a time, by the training loop.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=Test_Dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train on the stratified chunks one after another and record the evaluation
# metrics obtained after each chunk.
Metrics = []
generator = chunked_data_generator(Train_data)
for i, subset in enumerate(generator):
    try:
        # Tokenize through `Dataset.map`. The previous `subset.map(...)` was a
        # pandas DataFrame call (element-wise, or missing in older pandas), so
        # it always raised and the bare `except` hid the fact that no chunk
        # was ever trained.
        Train_dataset = Dataset.from_pandas(subset, preserve_index=False).map(
            lambda batch: preprocess_function_df(batch["text"]),
            batched=True,
            remove_columns=["text"],
        )
        trainer.train_dataset = Train_dataset
        trainer.train()
        result = trainer.evaluate()
        Metrics.append(result)
    except Exception as exc:
        # Still best-effort (move on to the next chunk), but report the cause
        # and let KeyboardInterrupt / SystemExit propagate.
        print(f'Error in the {i}_th subset: {exc!r}')

In [None]:
# Persist the model and the tokenizer side by side in `model_save_path`.
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)    

In [None]:
# One row per trained chunk, one column per metric reported by `trainer.evaluate()`.
Metrics = pd.DataFrame(Metrics)

# Draw a separate wide line chart for every metric column.
for metric_name in Metrics.columns:
    plt.figure(figsize=(20, 6))
    sns.lineplot(y=metric_name, data=Metrics)
    plt.show()

In [4]:
# def predict_text(text, model, tokenizer):
#     # Tokenize the text
#     inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    
#     # Get prediction
#     outputs = model(**inputs)
#     probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    
#     # Get prediction and confidence
#     prediction = torch.argmax(probs, dim=1).item()
#     confidence = probs[0][prediction].item()
    
#     return {
#         'prediction': 'AI-generated' if prediction == 1 else 'Human-written',
#         'confidence': confidence,
#         'probabilities': {
#             'human': probs[0][0].item(),
#             'ai': probs[0][1].item()
#         }
#     }

# if __name__ == "__main__":
#     # Train the model
#     model, tokenizer, trainer = train_ai_detector()
    
#     # Example predictions
#     sample_texts = [
#         "This is a human-written text about artificial intelligence.",
#         "The quantum mechanics principles state that particles can exist in multiple states simultaneously.",
#     ]
    
#     for text in sample_texts:
#         result = predict_text(text, model, tokenizer)
#         print(f"\nText: {text[:50]}...")
#         print(f"Prediction: {result['prediction']}")
#         print(f"Confidence: {result['confidence']:.2f}")
#         print(f"Probabilities: Human: {result['probabilities']['human']:.2f}, AI: {result['probabilities']['ai']:.2f}")

In [None]:
from huggingface_hub import login

# SECURITY: never commit an access token to a notebook. The token that was
# previously hardcoded here must be treated as leaked and revoked in the
# Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable; when it is unset,
# `login(token=None)` falls back to an interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
repo_name = 'Text_AI_Detection'
from huggingface_hub import HfApi
api = HfApi()
# exist_ok=True makes the cell re-runnable: without it a second execution
# fails because the repository already exists.
api.create_repo(repo_id=repo_name, private=True, exist_ok=True)

In [None]:
# Upload needs a fully qualified "<namespace>/<name>" repo id. `repo_name` is
# a bare name, so prefix it with the authenticated user's namespace unless it
# is already qualified.
target_repo_id = repo_name if "/" in repo_name else f"{api.whoami()['name']}/{repo_name}"

# Push everything saved under `model_save_path` to the root of the model repo.
api.upload_folder(
    folder_path=model_save_path,
    path_in_repo=".",
    repo_id=target_repo_id,
    repo_type="model",
    revision="main",
)