## Define model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

  from .autonotebook import tqdm as notebook_tqdm


## Data Preprocessing

In [2]:
import pandas as pd
# Load the training data
path = '../data/'

train_df = pd.read_csv(path + 'train.csv')
valid_df = pd.read_csv(path + 'val.csv')
test_df = pd.read_csv(path + 'test.csv')

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_review,user_suggestion
0,15004,freejam scammed 10 ing euros from me. i will f...,0
1,23876,mic issues can be real lol,1
2,29142,"this may not be readily apparent, but south pa...",1
3,10724,i've been spending less time playing magic rec...,1
4,6521,never playing again... i would recommend this ...,0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42604 entries, 0 to 42603
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       42604 non-null  int64 
 1   user_review      42604 non-null  object
 2   user_suggestion  42604 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 998.7+ KB


In [5]:
train_df = train_df.loc[:, ['user_review', 'user_suggestion']]
valid_df = valid_df.loc[:, ['user_review', 'user_suggestion']]

In [6]:
from datasets import Dataset, DatasetDict

raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'valid': Dataset.from_pandas(valid_df),
    'test': Dataset.from_pandas(test_df),
})

In [7]:
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", raw_datasets["train"].features)
print("\n\nFirst row of Train:\n", raw_datasets["train"][0])

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['user_review', 'user_suggestion'],
        num_rows: 42604
    })
    valid: Dataset({
        features: ['user_review', 'user_suggestion'],
        num_rows: 5325
    })
    test: Dataset({
        features: ['Unnamed: 0', 'user_review', 'user_suggestion'],
        num_rows: 5326
    })
})


Train's features:
 {'user_review': Value(dtype='string', id=None), 'user_suggestion': Value(dtype='int64', id=None)}


First row of Train:
 {'user_review': 'freejam scammed 10 ing euros from me. i will find fj and destroy their s25 ing protonium crates and all was uningcommon.so if fj read this they will die quickly', 'user_suggestion': 0}


## Tokenize

In [8]:
tokenized_datasets = raw_datasets.map(
    lambda dataset: tokenizer(dataset['user_review'], truncation=True), 
    batched=True
)

print(tokenized_datasets)

Map: 100%|██████████| 42604/42604 [00:01<00:00, 22075.15 examples/s]
Map: 100%|██████████| 5325/5325 [00:00<00:00, 25223.86 examples/s]
Map: 100%|██████████| 5326/5326 [00:00<00:00, 24763.01 examples/s]

DatasetDict({
    train: Dataset({
        features: ['user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
        num_rows: 42604
    })
    valid: Dataset({
        features: ['user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
        num_rows: 5325
    })
    test: Dataset({
        features: ['Unnamed: 0', 'user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
        num_rows: 5326
    })
})





In [9]:
tokenized_datasets = tokenized_datasets.remove_columns(['user_review'])
tokenized_datasets = tokenized_datasets.rename_column('user_suggestion', 'labels')

## Fine-tuning

In [10]:
!pip -q install evaluate

In [11]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="fine-tuned model", 
    num_train_epochs=3, 
    eval_strategy="epoch", 
    weight_decay=5e-4, 
    save_strategy="no", 
    fp16=True, 
    push_to_hub=False,
    report_to="none"
)

def compute_metrics(eval_pred):
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

trainer = Trainer(
    model, 
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3349,0.330533,0.879249,0.877794
2,0.2357,0.42113,0.888263,0.888043
3,0.1062,0.554432,0.892582,0.89178


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 16.1MB/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<00:00, 11.2MB/s]


TrainOutput(global_step=15978, training_loss=0.2458274773026993, metrics={'train_runtime': 1948.0798, 'train_samples_per_second': 65.609, 'train_steps_per_second': 8.202, 'total_flos': 8132289512465664.0, 'train_loss': 0.2458274773026993, 'epoch': 3.0})

## Evaluation

In [13]:
from sklearn.metrics import classification_report

y_pred = trainer.predict(tokenized_datasets['test']).predictions
y_pred = np.argmax(y_pred, axis=-1)

y_true = tokenized_datasets["test"]["labels"]
y_true = np.array(y_true)

print(classification_report(y_true, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.894     0.926     0.910      3348
           1      0.867     0.815     0.840      1978

    accuracy                          0.885      5326
   macro avg      0.881     0.870     0.875      5326
weighted avg      0.884     0.885     0.884      5326

