In [1]:
import torch, transformers, sklearn, os, re, random, time, sys
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset


from tqdm import tqdm
import optuna

pd.options.display.max_rows = 999
pd.options.display.max_colwidth = 99

print(f'Torch Version: {torch.__version__}')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)



Torch Version: 2.0.0
cuda


In [2]:
seed=1
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7a7a889c69f0>

In [3]:
df = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

In [4]:
df.prompt_name.unique()

array(['Phones and driving', 'Car-free cities', 'Summer projects',
       '"A Cowboy Who Rode the Waves"',
       'Mandatory extracurricular activities', 'Exploring Venus',
       'Facial action coding system', 'The Face on Mars',
       'Community service', 'Grades for extracurricular activities',
       'Driverless cars', 'Does the electoral college work?',
       'Cell phones at school', 'Distance learning',
       'Seeking multiple opinions'], dtype=object)

In [5]:
df_filtered =df[df.RDizzl3_seven]

In [6]:
df_filtered.prompt_name.unique()

array(['Car-free cities', '"A Cowboy Who Rode the Waves"',
       'Exploring Venus', 'Facial action coding system',
       'The Face on Mars', 'Driverless cars',
       'Does the electoral college work?'], dtype=object)

In [7]:
df_filtered[df_filtered.label==1].iloc[1000].text

'Hey, ya\'ll! 😃 So, I\'m sure we\'ve all noticed how much technology has taken over our lives, right? Like, I mean, we\'re basically glued to our phones and computers all day long. And while it\'s super convenient to be able to communicate with people from anywhere in the world, I think it\'s also having some pretty negative effects on our relationships and overall health. 🤔\n\nFirst of all, let\'s talk about how technology is making us all bad communicators. Like, I know I\'m guilty of it too, but I\'ll be chatting with my friends online and I\'ll be like, "lol" and "omg" and stuff, but I\'m not actually paying attention to what they\'re saying. It\'s like, I\'m not even listening! And then when we do finally meet up in person, I\'m like, "Uh, what were we talking about again?" 🙈 It\'s like, our faces are glued to our screens and we\'re not even present in the moment. 🙄\n\nAnd it\'s not just our communication skills that are suffering. Technology is also giving us some serious health 

In [8]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
model_name = '../input/huggingfacedebertav3variants/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../input/huggingfacedebertav3variants/deberta-v3-small and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def preprocess_function(examples):
    # Your existing code
    outputs = tokenizer(examples['text'], max_length=256, padding=True, truncation=True, return_tensors='pt')

    # Convert the Torch tensors to lists or numpy arrays
    outputs = {key: value.tolist() if isinstance(value, torch.Tensor) else np.array(value) if isinstance(value, np.ndarray) else value.item() if torch.is_tensor(value) else value for key, value in outputs.items()}

    return outputs

In [10]:
ds = Dataset.from_pandas(df_filtered[['text', 'label']].reset_index(drop=True)).train_test_split(test_size=0.2)
ds['test'] = ds['test'].map(preprocess_function, num_proc=1, batched=True,remove_columns=['text'])
ds['train'] = ds['train'].map(preprocess_function, num_proc=1, batched=True,remove_columns=['text'])
ds

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/17 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16360
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4090
    })
})

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
    auc = roc_auc_score(labels, probs[:,1], multi_class='ovr')
    return {"roc_auc": auc}

In [12]:
args = TrainingArguments(
    "deberta-finetuned",
    evaluation_strategy = "steps",
    save_strategy = "steps",
    save_steps = 10000,
    eval_steps = 100,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='roc_auc',
    report_to='none',
)

trainer = Trainer(
    model,
    args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Roc Auc
100,No log,0.056801,0.999356
200,No log,0.025369,0.999522
300,No log,0.235515,0.999558
400,No log,0.020408,0.999831
500,0.080700,0.035318,0.999786
600,0.080700,0.025049,0.999836
700,0.080700,0.090261,0.999874
800,0.080700,0.07934,0.999861
900,0.080700,0.017936,0.999929
1000,0.018100,0.03477,0.999902




TrainOutput(global_step=1023, training_loss=0.048323523510841036, metrics={'train_runtime': 706.3601, 'train_samples_per_second': 23.161, 'train_steps_per_second': 1.448, 'total_flos': 1083621919088640.0, 'train_loss': 0.048323523510841036, 'epoch': 1.0})

In [14]:
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
test_preds = trainer.predict(test_ds_enc)
test_preds

PredictionOutput(predictions=array([[-0.16550678,  0.08815041],
       [-0.1835741 ,  0.10907069],
       [-0.14762944,  0.06472559]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.095, 'test_samples_per_second': 31.588, 'test_steps_per_second': 10.529})

In [16]:
test_preds

PredictionOutput(predictions=array([[-0.16550678,  0.08815041],
       [-0.1835741 ,  0.10907069],
       [-0.14762944,  0.06472559]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.095, 'test_samples_per_second': 31.588, 'test_steps_per_second': 10.529})

In [17]:
logits = test_preds.predictions
probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs[:,1]
predictions = test_preds.predictions
tensor_predictions = torch.from_numpy(predictions)
sub['generated'] = tensor_predictions[:,1]
sub.head()
sub.to_csv('submission.csv', index=False)