# 7

## Import

In [1]:
# import logging
from pathlib import Path
import torch
import os

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PromptTuningConfig

## Initialization

In [2]:
def find_project_root() -> Path:

    start_path = Path.cwd()
    for parent in start_path.parents:
        if (parent / ".git").exists() or (parent / "pyproject.toml").exists() or (parent / "setup.py").exists():
            return parent
    return start_path  # Fallback: if no marker is found, return the original path


# Get the project root automatically
PROJECT_ROOT = find_project_root()
print(Path.cwd())

d:\Projects\finam_hotel_reviews\notebooks


In [3]:
# logging.basicConfig(
#     level=logging.INFO,
#     format="%(asctime)s - %(name)s -%(levelname)s - %(funcName)s -  %(message)s",
#     handlers=[logging.StreamHandler(), logging.FileHandler(PROJECT_ROOT / "logs" / "logfile.log")],
# )

device = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")

## Load model

In [4]:
model_name = "lightblue/DeepSeek-R1-Distill-Qwen-7B-Multilingual"

# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ 4-–±–∏—Ç–Ω–æ–π –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏—è
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16  # Use float16 for faster computation
)

# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ –∏ –º–æ–¥–µ–ª–∏
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ Prompt Tuning
prompt_tuning_config = PromptTuningConfig(
    task_type="CAUSAL_LM", num_virtual_tokens=20  # –¢–∏–ø –∑–∞–¥–∞—á–∏  # –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–∏—Ä—Ç—É–∞–ª—å–Ω—ã—Ö —Ç–æ–∫–µ–Ω–æ–≤ –¥–ª—è –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–∏
)

# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ Prompt Tuning
model = get_peft_model(model, prompt_tuning_config)
# logging.info(model.print_trainable_parameters())
print(model.print_trainable_parameters())

# logging.info(" DeepSeek LLM –∑–∞–≥—Ä—É–∂–µ–Ω —Å Prompt Tuning –∏ 4-–±–∏—Ç–Ω—ã–º –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏–µ–º!")
print(" DeepSeek LLM –∑–∞–≥—Ä—É–∂–µ–Ω —Å Prompt Tuning –∏ 4-–±–∏—Ç–Ω—ã–º –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏–µ–º!")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 71,680 || all params: 7,615,688,192 || trainable%: 0.0009
None
 DeepSeek LLM –∑–∞–≥—Ä—É–∂–µ–Ω —Å Prompt Tuning –∏ 4-–±–∏—Ç–Ω—ã–º –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏–µ–º!


## Load and Preprocess Dataset

In [5]:
import pandas as pd
import json

df_initial = pd.read_csv(PROJECT_ROOT / "data" / "interim" / "200_labeled_gpt_4.csv")
df_initial.head(2)

ls_labels_raw = df_initial["labels"].to_list()

ls_aspect = []
ls_sentiment = []
ls_text = df_initial["text"].to_list()
ls_labels = []

for label in ls_labels_raw:
    label_json = json.loads(label.replace("'", '"'))
    ls_labels.append(label_json)

In [6]:
from datasets import Dataset, DatasetDict
import random

data = {"text": ls_text, "labels_raw": ls_labels_raw, "labels_json": ls_labels}

# –°–æ–∑–¥–∞–µ–º Dataset
dataset = Dataset.from_dict(data)

# –†–∞–∑–¥–µ–ª—è–µ–º –Ω–∞ train/test
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

# –°–æ–∑–¥–∞–µ–º DatasetDict
dataset_dict = DatasetDict({"train": dataset["train"], "test": dataset["test"]})

# –ü—Ä–æ–≤–µ—Ä—è–µ–º —Å—Ç—Ä—É–∫—Ç—É—Ä—É
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels_raw', 'labels_json'],
        num_rows: 160
    })
    test: Dataset({
        features: ['text', 'labels_raw', 'labels_json'],
        num_rows: 40
    })
})


## Tokenize dataset

In [7]:
def tokenize_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = inputs["input_ids"].copy()  # Use input_ids as labels for causal LM
    return inputs


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Verify tokenized sample
print("Tokenized Sample with Labels:")
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Tokenized Sample with Labels:
{'text': '–ù–µ–≤–µ—Ä–æ—è—Ç–Ω—ã–π –æ—Ç–µ–ª—å —Å –ª—É—á—à–∏–º–∏ –Ω–æ–º–µ—Ä–∞–º–∏. –ß–µ–≥–æ —Å—Ç–æ—è—Ç —Ç–æ–ª—å–∫–æ —ç—Ç–∏ –º–∞—Ç—Ä–∞—Å—ã, –Ω–∞ –∫–æ—Ç–æ—Ä—ã–µ –ª–æ–∂–∏—à—å—Å—è –∏ –Ω–µ –º–æ–∂–µ—à—å –≤—Å—Ç–∞—Ç—å. –í —Å–∞–º–æ–º —Ü–µ–Ω—Ç—Ä–µ –≥–æ—Ä–æ–¥–∞. –®–æ–ø–ø–∏–Ω–≥, –¥–æ—Å—Ç–æ–ø—Ä–∏–º–µ—á–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏ - –≤—Å–µ —Ä—è–¥–æ–º. –ó–∞–≤—Ç—Ä–∞–∫–∏ –Ω–∞–∏–≤–∫—É—Å–Ω–µ–π—à–∏–µ! –û—á–µ–Ω—å —Å–æ–≤–µ—Ç—É—é –ª—é–±–∏—Ç–µ–ª—è–º —à–∏–∫–∞—Ä–Ω–æ –æ—Ç–¥–æ—Ö–Ω—É—Ç—å!', 'labels_raw': "[{'aspect': '–æ–±—â–µ–µ –≤–ø–µ—á–∞—Ç–ª–µ–Ω–∏–µ', 'sentiment': 'positive'}, {'aspect': '–Ω–æ–º–µ—Ä', 'sentiment': 'positive'}, {'aspect': '–º–∞—Ç—Ä–∞—Å—ã', 'sentiment': 'positive'}, {'aspect': '—Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∏–µ', 'sentiment': 'positive'}, {'aspect': '–¥–æ—Å—Ç–æ–ø—Ä–∏–º–µ—á–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏', 'sentiment': 'positive'}, {'aspect': '–∑–∞–≤—Ç—Ä–∞–∫', 'sentiment': 'positive'}]", 'labels_json': [{'aspect': '–æ–±—â–µ–µ –≤–ø–µ—á–∞—Ç–ª–µ–Ω–∏–µ', 'sentiment': 'positive'}, {'aspect': '–Ω–æ–º–µ—Ä'

## Set training parameters

In [8]:
import os

os.environ["WANDB_DISABLED"] = "true"
os.environ["CLEARML_DISABLED"] = "true"
os.environ["DVCLIVE_DISABLED"] = "true"


from transformers import TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir=PROJECT_ROOT / "results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir=PROJECT_ROOT / "logs",
    fp16=True,
    report_to=[],  # –û—Ç–∫–ª—é—á–∞–µ—Ç –∏–Ω—Ç–µ–≥—Ä–∞—Ü–∏–∏ (DVC, ClearML, WandB –∏ —Ç.–¥.)
)

# logging.info("WandB Disabled!")
print("WandB Disabled!")

WandB Disabled!




## Get sample Data

In [9]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10))

## Initialize trainer and train

In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset.remove_columns(["text"]),  # Remove raw text column
    eval_dataset=small_test_dataset.remove_columns(["text"]),
)

print("Trainer Initialized!")


Trainer Initialized!


In [11]:
torch.cuda.empty_cache()
print("Cleared CUDA Cache")

Cleared CUDA Cache


## Fine-Tune DeepSeek LLM

In [12]:
print("üöÄ Starting Fine-Tuning...")
trainer.train()

üöÄ Starting Fine-Tuning...


Epoch,Training Loss,Validation Loss
0,No log,7.588213


TrainOutput(global_step=12, training_loss=7.874982833862305, metrics={'train_runtime': 21.4747, 'train_samples_per_second': 4.657, 'train_steps_per_second': 0.559, 'total_flos': 2085210430636032.0, 'train_loss': 7.874982833862305, 'epoch': 0.96})

## Run predictions

In [16]:
def generate_prediction(review_text):
    inputs = tokenizer(review_text, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Example reviews
reviews = [ls_text[0], ls_text[1], ls_text[2]]

# Run predictions
for review in reviews:
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {generate_prediction(review)}")
    print("-" * 80)

Review: –≠—Ç–æ –±—ã–ª —Å–∞–º—ã–π –º–∞–ª–µ–Ω—å–∫–∏–π –ø–æ –ø–ª–æ—â–∞–¥–∏ –Ω–æ–º–µ—Ä –∏–∑ –≤—Å–µ—Ö –æ—Ç–µ–ª–µ–π, –≥–¥–µ —è –æ—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–ª–∞—Å—å. –¢–µ–º –Ω–µ –º–µ–Ω–µ–µ, –æ—á–µ–Ω—å —É—é—Ç–Ω–µ–Ω—å–∫–∏–π –∏ –∫–æ–º—Ñ–æ—Ä—Ç–∞–±–µ–ª—å–Ω—ã–π –º–∏–Ω–∏ –æ—Ç–µ–ª—å—á–∏–∫. –ú–µ–±–µ–ª—å –Ω–æ–≤–∞—è, –≤—Å–µ —Ñ—É–Ω–∫—Ü–∏–æ–Ω–∏—Ä—É–µ—Ç, —É–¥–æ–±–Ω–æ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω. –£—á–∏—Ç—ã–≤–∞—è –¥–æ—Ä–æ–≥–æ–≤–∏–∑–Ω—É –æ—Ç–µ–ª–µ–π –ü–∞—Ä–∏–∂–∞, –¥–ª—è —ç–∫–æ–Ω–æ–º –≤–∞—Ä–∏–∞–Ω—Ç–∞ –≤–ø–æ–ª–Ω–µ –¥–æ—Å—Ç–æ–π–Ω–æ.
Predicted Sentiment: –≠—Ç–æ –±—ã–ª —Å–∞–º—ã–π –º–∞–ª–µ–Ω—å–∫–∏–π –ø–æ –ø–ª–æ—â–∞–¥–∏ –Ω–æ–º–µ—Ä –∏–∑ –≤—Å–µ—Ö –æ—Ç–µ–ª–µ–π, –≥–¥–µ —è –æ—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–ª–∞—Å—å. –¢–µ–º –Ω–µ –º–µ–Ω–µ–µ, –æ—á–µ–Ω—å —É—é—Ç–Ω–µ–Ω—å–∫–∏–π –∏ –∫–æ–º—Ñ–æ—Ä—Ç–∞–±–µ–ª—å–Ω—ã–π –º–∏–Ω–∏ –æ—Ç–µ–ª—å—á–∏–∫. –ú–µ–±–µ–ª—å –Ω–æ–≤–∞—è, –≤—Å–µ —Ñ—É–Ω–∫—Ü–∏–æ–Ω–∏—Ä—É–µ—Ç, —É–¥–æ–±–Ω–æ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω. –£—á–∏—Ç—ã–≤–∞—è –¥–æ—Ä–æ–≥–æ–≤–∏–∑–Ω—É –æ—Ç–µ–ª–µ–π –ü–∞—Ä–∏–∂–∞, –¥–ª—è —ç–∫–æ–Ω–æ–º –≤–∞—Ä–∏–∞–Ω—Ç–∞ –≤–ø–æ–ª–Ω–µ –¥–æ—

In [13]:
ls_text[1]

'–ü–æ–Ω—Ä–∞–≤–∏–ª–æ—Å—å –≤—Å—ë!!! –û—Ç –ø–µ—Ä—Å–æ–Ω–∞–ª–∞ –¥–æ –æ—Å–Ω–∞—â–µ–Ω–∏—è –Ω–æ–º–µ—Ä–∞!!! –ú–æ–¥–Ω–æ,—Å—Ç–∏–ª—å–Ω–æ,—É—é—Ç–Ω–æ. –û–≥—Ä–æ–º–Ω—ã–π –ø–ª–∞–∑–º–µ–Ω–Ω—ã–π —Ç–µ–ª–µ–≤–∏–∑–æ—Ä –≤ –Ω–æ–º–µ—Ä–µ, –Ω–∞–ª–∏—á–∏–µ –±–∞–Ω–Ω—ã—Ö –ø—Ä–∏–Ω–∞–¥–ª–µ–∂–Ω–æ—Å—Ç–µ–π (–¥–ª—è –º–∏–Ω–∏-–æ—Ç–µ–ª—è —ç—Ç–æ —Ä–µ–¥–∫–æ—Å—Ç—å), –ª—é–±–µ–∑–Ω–æ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω–∞—è –ø–æ—Å—É–¥–∞,–∫–æ—Ç–æ—Ä–∞—è –ø–æ–Ω–∞–¥–æ–±–∏–ª–∞—Å—å –≤ —Ö–æ–¥–µ –ø—Ä–æ–∂–∏–≤–∞–Ω–∏—è, –≤–∞–Ω–Ω–∞—è –∫–æ–º–Ω–∞—Ç–∞ —Å –ø—Ä–æ—Å—Ç–æ –±–ª–µ—Å—Ç—è—â–µ–π —Å–∞–Ω—Ç–µ—Ö–Ω–∏–∫–æ–π - –≤—Å—ë —ç—Ç–æ —Å–ª–æ–∂–∏–ª–æ –Ω–µ–≤–µ—Ä–æ—è—Ç–Ω–æ –ø—Ä–∏—è—Ç–Ω–æ–µ –≤–ø–µ—á–∞—Ç–ª–µ–Ω–∏–µ –æ –≥–æ—Å—Ç–∏–Ω–∏—Ü–µ!!! –î–µ–≤—É—à–∫–∞ –Ω–∞ —Ä–µ—Å–µ–ø—à–µ–Ω–µ –≤—Å—Ç—Ä–µ—Ç–∏–ª–∞ —Å —É–ª—ã–±–∫–æ–π, –≤—Å—ë –ø–æ–¥—Ä–æ–±–Ω–æ –æ–±—ä—è—Å–Ω–∏–ª–∞, –ø–æ—Å–æ–≤–µ—Ç–æ–≤–∞–ª–∞. –ñ–µ–ª–∞—é –æ—Ç–µ–ª—é –¥–∞–ª—å–Ω–µ–π—à–µ–≥–æ –ø—Ä–æ—Ü–≤–µ—Ç–∞–Ω–∏—è!–û–≥—Ä–æ–º–Ω–æ–µ —Å–ø–∞—Å–∏–±–æ –∑–∞ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –ø—Ä–µ–∫—Ä–∞—Å–Ω–æ –ø—Ä–æ–≤–µ—Å—Ç–∏ –≤—Ä–µ–º—è!'