In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import transformers
import evaluate
import numpy as np
from datasets import *

In [2]:
# !pip install transformers evaluate datasets

In [3]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda:0


In [4]:
torch.cuda.empty_cache()

In [5]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizerFast

In [6]:
dataset = load_dataset("csv", data_files="train_data.csv")
train_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': train_dataset["train"],
    'test': train_dataset["test"]})
dataset



  0%|          | 0/1 [00:00<?, ?it/s]



DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 13113
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 3279
    })
})

In [7]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=5)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.

In [8]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_dataset, test_dataset = dataset["train"], dataset["test"]
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Set format for pytorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])




Map:   0%|          | 0/3279 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create the Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


In [11]:
model.save_pretrained("hotel_rating_model")

In [24]:
trainer.evaluate()

{'eval_loss': 0.8352432250976562,
 'eval_runtime': 61.23,
 'eval_samples_per_second': 53.552,
 'eval_steps_per_second': 0.849,
 'epoch': 3.0}

In [26]:
with open("test_data.csv", "r") as file:
    test_data = file.read().splitlines()

# Function to predict
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs.argmax().item()

# Make predictions
predictions = [predict(opinion) for opinion in test_data]


In [37]:
import csv

# Write predictions to CSV
file_name = f"{model_name}_predictions.csv"
with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)
    for prediction in predictions:
        writer.writerow([prediction])