In [1]:
import os
import json
import math
import numpy as np 
import pandas as pd

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torch.optim as optim

In [2]:
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, http

In [3]:
# Attach Google Drive so that files stored there are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [4]:
# Select the compute device. With CUDA available this is identical to the
# previous behaviour (GPU 0 made current, `device` pointing at CUDA).
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device("cuda")
else:
    # No GPU attached to the runtime: fall back to CPU instead of raising
    # inside torch.cuda.set_device.
    device = torch.device("cpu")

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [5]:
# Extract the project archive from Drive only when train_data.csv is not already
# present under /content, so re-running the cell does not unzip again.
# NOTE(review): the archive path is relative and unzip extracts into the current
# directory, so this presumably relies on the working directory being /content — confirm.
if not os.path.exists("/content/train_data.csv"):
  !unzip gdrive/MyDrive/SSNE/proj6/p6.zip

Archive:  gdrive/MyDrive/SSNE/proj6/p6.zip
  inflating: test_data.csv           
  inflating: train_data.csv          
  inflating: tresc_zadania.txt       


In [6]:
import transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm

In [9]:
# The tokenizer always comes from the base checkpoint. The classifier is loaded
# from the fine-tuned copy on Drive when that directory exists; otherwise it is
# initialised from the base checkpoint with a 5-label head, so the notebook also
# runs from scratch (before any fine-tuned model has been saved).
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

finetuned_dir = "gdrive/My Drive/SSNE/proj6/distilbert_classification"
if os.path.isdir(finetuned_dir):
    # Reuse the previously saved weights without contacting the Hub.
    model = AutoModelForSequenceClassification.from_pretrained(finetuned_dir, local_files_only=True)
else:
    # No saved model yet: start from the pretrained base model.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

In [10]:
# Read the labelled reviews from the extracted CSV; the result is indexed by
# split name, with the rows available under "train".
train_csv_files = ["train_data.csv"]
train_dataset = load_dataset("csv", data_files=train_csv_files)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-cf58e8f46ae15258/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-cf58e8f46ae15258/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Peek at the first training record to see the raw fields before tokenization.
train_dataset["train"][0]

{'review': 'location not palace excellent hotel booke dthe hotel nh hotel site attractive rate, room spacious quiet, clean, hotel bit unpersonal big, breakfast tremendous, teh breakfast room art nouveau.. real experience, location ok altough busy neighbourhood night, rooms sound proof, stayed hotels amsterdam, favorite,  ',
 'rating': 4}

In [12]:
def tokenize_function(examples):
    """Tokenize the ``review`` field of a batch of examples.

    The texts are passed to the notebook-level ``tokenizer`` with truncation
    enabled and ``padding="max_length"``.

    Args:
        examples: Mapping that provides the review texts under ``"review"``.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    reviews = examples["review"]
    encoded = tokenizer(reviews, truncation=True, padding="max_length")
    return encoded

In [13]:
# Apply tokenize_function to the training data in batches.
train_tokenized_dataset = train_dataset.map(
    function=tokenize_function,
    batched=True,
)

  0%|          | 0/17 [00:00<?, ?ba/s]

In [14]:
# Show the structure of the tokenized training data (columns and row count).
train_tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'rating', 'input_ids', 'attention_mask'],
        num_rows: 16392
    })
})

In [15]:
# Rename the target column from "rating" to "label". The rename is guarded so
# that re-running this cell does not fail once the column has already been renamed.
if "rating" in train_tokenized_dataset["train"].column_names:
    train_tokenized_dataset["train"] = train_tokenized_dataset["train"].rename_column("rating", "label")

# Hold out 20% of the labelled data for evaluation; the fixed generator seed
# makes the split reproducible.
tokenized_dataset = train_tokenized_dataset["train"].train_test_split(
    test_size=0.2,
    generator=np.random.default_rng(12345),
)

In [23]:
# Show the train/test split produced above (columns and row counts per split).
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 13113
    })
    test: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3279
    })
})

In [24]:
# Accuracy scorer; compute_metrics calls it on predictions and reference labels.
METRIC_NAME = "accuracy"
metric = evaluate.load(METRIC_NAME)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [25]:
def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the highest score along the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

In [26]:
from transformers import TrainingArguments, Trainer

# Evaluate once per epoch and write checkpoints under "test_trainer".
# The option controlling this was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases; since the install is not
# pinned, use whichever field the installed TrainingArguments defines.
_eval_option = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(output_dir="test_trainer", **{_eval_option: "epoch"})

In [27]:
# Wire the model, training arguments, both splits and the metric callback into a Trainer.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [28]:
# Re-display the train/test split right before training (same object as shown earlier).
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 13113
    })
    test: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3279
    })
})

In [None]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: review. If review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13113
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4920
  Number of trainable parameters = 66957317


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8258,0.807563,0.650198
2,0.6646,0.752798,0.68161
3,0.4477,0.90784,0.680695


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: review. If review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3279
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_train

TrainOutput(global_step=4920, training_loss=0.6648206168073949, metrics={'train_runtime': 1981.7157, 'train_samples_per_second': 19.851, 'train_steps_per_second': 2.483, 'total_flos': 5211413795312640.0, 'train_loss': 0.6648206168073949, 'epoch': 3.0})

In [17]:
from datasets import Dataset
import pandas as pd

# Load the unlabeled test reviews. Passing `names` makes pandas treat every row
# of the file as data and call the single column "review", which is the column
# tokenize_function reads.
test_df = pd.read_csv("/content/test_data.csv", names=["review"])
# read_csv already returns a DataFrame, so it is handed to `datasets` directly.
test_dataset = Dataset.from_pandas(test_df, split="test")

In [18]:
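# One-off step, left disabled: uncomment after training to save the fine-tuned
# model to Drive so that a later session can load it instead of retraining.
# NOTE(review): this path spells the Drive folder "MyDrive", while the model is
# loaded from "gdrive/My Drive/..." — confirm both resolve to the same directory.
# trainer.save_model("gdrive/MyDrive/SSNE/proj6/distilbert_classification")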

In [19]:
# Tokenize the test reviews with the same function used for the training data.
test_tokenized_dataset = test_dataset.map(
    function=tokenize_function,
    batched=True,
)

  0%|          | 0/5 [00:00<?, ?ba/s]

In [58]:
# Display the labelled train/test split once more (unchanged by the test-set cells above).
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 13113
    })
    test: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3279
    })
})

In [75]:
# Run the model over the tokenized, unlabeled test reviews.
predictions = trainer.predict(test_dataset=test_tokenized_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: review. If review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4099
  Batch size = 8


In [76]:
# Reduce the per-class scores to one predicted class index per test review.
test_scores = predictions.predictions
preds = np.argmax(test_scores, axis=-1)

In [78]:
# Store the predicted classes on Drive as a single column, without header or index.
predictions_csv = "gdrive/My Drive/SSNE/proj6/preds.csv"
out = pd.DataFrame(preds)
out.to_csv(predictions_csv, index=False, header=None)

In [79]:
# Display the frame of predicted classes that was just written to CSV.
out

Unnamed: 0,0
0,3
1,1
2,2
3,4
4,4
...,...
4094,0
4095,3
4096,4
4097,3


In [81]:
from transformers import pipeline

# Quick manual check: wrap the model and tokenizer in a text-classification
# pipeline on GPU 0 and classify one hand-written sentence.
pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0,
)
sample_review = "I've loved it!"
pipe(sample_review)

[{'label': 'LABEL_4', 'score': 0.9506745934486389}]