## Define model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

2025-04-13 14:49:43.279892: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744555783.302908    3378 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744555783.309799    3378 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data Preprocessing

In [2]:
import pandas as pd
# Load the training data
train_path = '/kaggle/input/game-sentiment-analysis/train.csv'
valid_path = '/kaggle/input/game-sentiment-analysis/val.csv'
test_path = '/kaggle/input/game-sentiment-analysis/test.csv'

train_df = pd.read_csv(train_path)
valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_review,user_suggestion
0,19015,fine play whatever way want however im genuine...,1
1,18268,true happened friend played pubg,1
2,24906,comic satirical gender role reversed comment w...,1
3,15910,sally: astro boy!? yesss!!! we're saved! what ...,1
4,26012,husband love think steam stopped making couple...,0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22979 entries, 0 to 22978
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       22979 non-null  int64 
 1   user_review      22979 non-null  object
 2   user_suggestion  22979 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 538.7+ KB


In [5]:
train_df = train_df.loc[:, ['user_review', 'user_suggestion']]
valid_df = valid_df.loc[:, ['user_review', 'user_suggestion']]

In [6]:
from datasets import Dataset, DatasetDict

raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'valid': Dataset.from_pandas(valid_df),
    'test': Dataset.from_pandas(test_df),
})

In [7]:
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", raw_datasets["train"].features)
print("\n\nFirst row of Train:\n", raw_datasets["train"][0])

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['user_review', 'user_suggestion'],
        num_rows: 22979
    })
    valid: Dataset({
        features: ['user_review', 'user_suggestion'],
        num_rows: 2872
    })
    test: Dataset({
        features: ['Unnamed: 0', 'user_review', 'user_suggestion'],
        num_rows: 2873
    })
})


Train's features:
 {'user_review': Value(dtype='string', id=None), 'user_suggestion': Value(dtype='int64', id=None)}


First row of Train:
 {'user_review': 'fine play whatever way want however im genuinely curious shb like hour pressing escape button', 'user_suggestion': 1}


## Tokenize

In [8]:
tokenized_datasets = raw_datasets.map(
    lambda dataset: tokenizer(dataset['user_review'], truncation=True), 
    batched=True
)

print(tokenized_datasets)

Map:   0%|          | 0/22979 [00:00<?, ? examples/s]

Map:   0%|          | 0/2872 [00:00<?, ? examples/s]

Map:   0%|          | 0/2873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
        num_rows: 22979
    })
    valid: Dataset({
        features: ['user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
        num_rows: 2872
    })
    test: Dataset({
        features: ['Unnamed: 0', 'user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
        num_rows: 2873
    })
})


In [9]:
tokenized_datasets = tokenized_datasets.remove_columns(['user_review'])
tokenized_datasets = tokenized_datasets.rename_column('user_suggestion', 'labels')

## Fine-tuning

In [10]:
!pip -q install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="fine-tuned model", 
    num_train_epochs=3, 
    eval_strategy="epoch", 
    weight_decay=5e-4, 
    save_strategy="no", 
    fp16=True, 
    push_to_hub=False,
    report_to="none"
)

def compute_metrics(eval_pred):
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

trainer = Trainer(
    model, 
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3258,0.266673,0.888928,0.887543
2,0.2443,0.559646,0.888231,0.888639
3,0.2841,1.472124,0.899025,0.898835




TrainOutput(global_step=4311, training_loss=0.27738628418421807, metrics={'train_runtime': 1721.4681, 'train_samples_per_second': 40.045, 'train_steps_per_second': 2.504, 'total_flos': 7248091359369456.0, 'train_loss': 0.27738628418421807, 'epoch': 3.0})

## Evaluation

In [13]:
from sklearn.metrics import classification_report

y_pred = trainer.predict(tokenized_datasets['test']).predictions
y_pred = np.argmax(y_pred, axis=-1)

y_true = tokenized_datasets["test"]["labels"]
y_true = np.array(y_true)

print(classification_report(y_true, y_pred, digits=3))



              precision    recall  f1-score   support

           0      0.865     0.860     0.863      1084
           1      0.915     0.919     0.917      1789

    accuracy                          0.897      2873
   macro avg      0.890     0.889     0.890      2873
weighted avg      0.897     0.897     0.897      2873



## Save model

In [14]:
model.save_pretrained('fine-tuned model')
tokenizer.save_pretrained('fine-tuned model')

('fine-tuned model/tokenizer_config.json',
 'fine-tuned model/special_tokens_map.json',
 'fine-tuned model/vocab.txt',
 'fine-tuned model/added_tokens.json',
 'fine-tuned model/tokenizer.json')

In [15]:
import shutil
import os

shutil.make_archive("/kaggle/working/fine-tuned model", "zip", "/kaggle/working/fine-tuned model")

'/kaggle/working/fine-tuned model.zip'