In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentencesfold/sentences.csv
/kaggle/input/proposedworkk/proposedworktrainingdata.csv
/kaggle/input/3000dataset/sentences2.csv
/kaggle/input/3000dataset/proposedworktrainingdata.csv


In [2]:
# =========================
# 1. Install Dependencies
# =========================
# If you're in a fresh environment (e.g., a Kaggle notebook), install:
!pip install torch==2.0.1  # or a compatible PyTorch version
!pip install transformers==4.30.2  # or a compatible Transformers version
!pip install datasets==2.12.0
!pip install peft==0.3.0

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cufft-cu11==10.9.0.58 (from torch==2.0.1)
  Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-man

In [3]:
import os
import gc
import random
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

# ---------------------------------------------------------------------
# 1. Basic GPU Check
# ---------------------------------------------------------------------
print("Number of GPUs available:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected")

# ---------------------------------------------------------------------
# 2. Load Base Model & Tokenizer
# ---------------------------------------------------------------------
model_name = "Orkhan/llama-2-7b-absa"  # Replace with your model name
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16
)
base_model.to("cuda:0")
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ---------------------------------------------------------------------
# 3. Load CSV (New Format) and Create 80/20 Train/Eval Split
# ---------------------------------------------------------------------
# The CSV is expected to have columns: text, span, opinion, sentiment
import pandas as pd
from datasets import Dataset

# Load dataset
df = pd.read_csv("/kaggle/input/3000dataset/proposedworktrainingdata.csv")  # Update with your CSV path
print("Dataset columns:", df.columns.tolist())

# Apply case folding
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Manually split without shuffling
split_index = int(0.8 * len(df))  # First 80% for training
train_df = df.iloc[:split_index]  # First 80% rows
eval_df = df.iloc[split_index:]   # Last 20% rows

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# ---------------------------------------------------------------------
# 4. Preprocessing Function
# ---------------------------------------------------------------------
def preprocess_function(example):
    # Build the prompt using all annotations from the CSV file.
    target = (
        f"Aspect detected: {example['span']} ## "
        f"Opinion detected: {example['opinion']} ## "
        f"Sentiment detected: {example['sentiment']}"
    )
    input_text = f"### Human: {example['text']} ### Assistant: {target}"
    
    tokenized = tokenizer(
        input_text,
        truncation=True,
        max_length=256,  # Adjust as necessary
        padding="max_length"
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Map the preprocessing function over the datasets.
train_dataset = train_dataset.map(preprocess_function, batched=False, remove_columns=train_dataset.column_names)
eval_dataset  = eval_dataset.map(preprocess_function, batched=False, remove_columns=eval_dataset.column_names)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ---------------------------------------------------------------------
# 5. Apply LoRA
# ---------------------------------------------------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# ---------------------------------------------------------------------
# 6. Training Arguments: 5 Epochs, Logging/Eval Once Per Epoch
# ---------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # Increased accumulation steps to 16
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    disable_tqdm=False,
    log_level="error",
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    report_to="none"
)


def custom_data_collator(features):
    batch = {}
    for key in features[0].keys():
        collated = []
        for f in features:
            value = f[key]
            if not torch.is_tensor(value):
                value = torch.tensor(value)
            if value.ndim == 0:
                value = value.unsqueeze(0)
            if value.ndim == 1:
                value = value.unsqueeze(0)
            collated.append(value)
        batch[key] = torch.cat(collated, dim=0)
    return batch

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=custom_data_collator,
)

# ---------------------------------------------------------------------
# 7. Train the Model (5 Epochs)
# ---------------------------------------------------------------------
print("Starting training for 5 epochs on a single GPU...")
trainer.train()
print("Training complete.")

# ---------------------------------------------------------------------
# 8. Merge LoRA + Base Weights, Save Full Model
# ---------------------------------------------------------------------
print("Merging LoRA into base weights...")
model = model.merge_and_unload()

save_dir = "finetuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"✅ Full model saved to '{save_dir}' with config.json, etc.")

# ---------------------------------------------------------------------
# 9. Free GPU Memory and Reload for Inference
# ---------------------------------------------------------------------
del model, base_model, trainer
torch.cuda.empty_cache()
gc.collect()

from transformers import AutoModelForCausalLM
print(f"Loading merged model from '{save_dir}'...")
inference_model = AutoModelForCausalLM.from_pretrained(save_dir, torch_dtype=torch.float16)
inference_model.to("cuda:0")
inference_model.eval()
print("✅ Merged model loaded successfully! Ready for inference.")

# ---------------------------------------------------------------------
# 10. Example Inference
# ---------------------------------------------------------------------
from transformers import pipeline
def process_prompt(user_prompt, model):
    text_input = f"### Human: {user_prompt} ###"
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=int(len(tokenizer.encode(user_prompt)) * 3.5),
        device=0
    )
    return pipe(text_input)

test_sentence = "The food is fresh at a good price, and the place is clean and hygienic.My stay at Hotel Tranquil Haven was pleasant thanks to its serene location and friendly staff.Overall, this hotel is suitable for a peaceful retreat, but overall experience could be way better."
result = process_prompt(test_sentence, inference_model)
print("\nInference Result:")
print(result[0]["generated_text"])

##################################
import pandas as pd
from transformers import pipeline

# Define a function to process a prompt using your fine-tuned model.
def process_prompt(user_prompt, model):
    # Construct the input with the expected prompt format.
    text_input = f"### Human: {user_prompt} ###"
    # Create a pipeline for text generation.
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=int(len(tokenizer.encode(user_prompt)) * 3.5),
        device=0
    )
    return pipe(text_input)

# Path to the CSV file containing sentences (only a "text" column)
input_file = "/kaggle/input/sentencesfold/sentences.csv"  # Update with your file path
output_file = "predicted_annotations.csv"  # The file where predictions will be saved

# Load the CSV file of sentences.
df = pd.read_csv(input_file)

# Apply case folding (convert to lowercase)
df["text"] = df["text"].str.lower()

# Prepare a list to store predictions.
predictions = []

# Iterate over each sentence in the CSV file.
for idx, row in df.iterrows():
    sentence = row["text"]
    # Generate prediction using the fine-tuned model.
    result = process_prompt(sentence, inference_model)
    # Extract the generated text (assuming the output format is similar to your training example).
    generated_text = result[0]["generated_text"]
    predictions.append(generated_text)

# Create a new DataFrame column for the predictions.
df["prediction"] = predictions

# Save the DataFrame with predictions to a new CSV file.
df.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")

##################################
##################################
import pandas as pd
import numpy as np
import re
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, hamming_loss

# -----------------------------
# 1. Define functions to extract A-O-S triples
# -----------------------------

def extract_aos_from_actual(row):
    """
    Convert actual row into a set of A-O-S triples.
    """
    aspects = [x.strip() for x in str(row['span']).split(",") if x.strip()]
    opinions = [x.strip() for x in str(row['opinion']).split(",") if x.strip()]
    sentiments = [x.strip() for x in str(row['sentiment']).split(",") if x.strip()]
    aos_set = set(sorted(zip(aspects, opinions, sentiments)))
    return aos_set

def extract_aos_from_pred(pred_str):
    """
    Extract A-O-S triples from the predicted output.
    """
    a_match = re.search(r"aspect detected:\s*(.*?)\s*##", pred_str)
    o_match = re.search(r"opinion detected:\s*(.*?)\s*##", pred_str)
    s_match = re.search(r"sentiment detected:\s*(.*)", pred_str)

    if a_match and o_match and s_match:
        aspects = [x.strip() for x in a_match.group(1).split(",") if x.strip()]
        opinions = [x.strip() for x in o_match.group(1).split(",") if x.strip()]
        sentiments = [x.strip() for x in s_match.group(1).split(",") if x.strip()]
        aos_set = set(sorted(zip(aspects, opinions, sentiments)))
        return aos_set
    else:
        return set()

# -----------------------------
# 2. Load and preprocess actual and predicted data
# -----------------------------

df_actual = pd.read_csv("/kaggle/input/proposedworkk/proposedworktrainingdata.csv")
df_actual = df_actual.apply(lambda col: col.map(lambda x: x.lower().strip() if isinstance(x, str) else x))
df_actual['aos'] = df_actual.apply(extract_aos_from_actual, axis=1)

df_pred = pd.read_csv("/kaggle/working/predicted_annotations.csv")
df_pred = df_pred.apply(lambda col: col.map(lambda x: x.lower().strip() if isinstance(x, str) else x))
df_pred['aos'] = df_pred['prediction'].apply(extract_aos_from_pred)

df_merged = df_actual[['text', 'aos']].merge(df_pred[['text', 'aos']], on='text', suffixes=('_actual', '_pred'))

# -----------------------------
# 3. Build a global universe of unique A-O-S triples
# -----------------------------

global_triples = sorted(set().union(*df_merged['aos_actual']).union(*df_merged['aos_pred']))
triple_to_idx = {triple: i for i, triple in enumerate(global_triples)}

def aos_to_vector(aos_set):
    vec = [0] * len(global_triples)
    for triple in aos_set:
        if triple in triple_to_idx:
            vec[triple_to_idx[triple]] = 1
    return np.array(vec)

df_merged['vector_actual'] = df_merged['aos_actual'].apply(aos_to_vector)
df_merged['vector_pred'] = df_merged['aos_pred'].apply(aos_to_vector)

actual_vectors = np.stack(df_merged['vector_actual'].values)
pred_vectors = np.stack(df_merged['vector_pred'].values)

# -----------------------------
# 4. Compute TP, TN, FP, FN and Metrics
# -----------------------------

TP = np.sum(np.logical_and(actual_vectors == 1, pred_vectors == 1))
TN = np.sum(np.logical_and(actual_vectors == 0, pred_vectors == 0))
FP = np.sum(np.logical_and(actual_vectors == 0, pred_vectors == 1))
FN = np.sum(np.logical_and(actual_vectors == 1, pred_vectors == 0))

precision = TP / (TP + FP) if (TP + FP) > 0 else 0  # Precision = TP / (TP + FP)
recall = TP / (TP + FN) if (TP + FN) > 0 else 0  # Recall = TP / (TP + FN)
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0  # F1 = 2 * (P * R) / (P + R)
mcc = ((TP * TN) - (FP * FN)) / (np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) > 0 else 0  # MCC formula
hamming = hamming_loss(actual_vectors, pred_vectors)  # Hamming Loss formula: (FP + FN) / total samples
fdr = FP / (FP + TP) if (FP + TP) > 0 else 0  # False Discovery Rate (FDR) = FP / (FP + TP)

print("\n=== Evaluation Metrics ===")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"Precision: {precision:.4f} (TP / (TP + FP))")
print(f"Recall: {recall:.4f} (TP / (TP + FN))")
print(f"F1 Score: {f1:.4f} (2 * (Precision * Recall) / (Precision + Recall))")
print(f"MCC: {mcc:.4f} (((TP * TN) - (FP * FN)) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)))")
print(f"Hamming Loss: {hamming:.4f} ((FP + FN) / total samples)")
print(f"False Discovery Rate: {fdr:.4f} (FP / (FP + TP))")

# -----------------------------
# 5. Check for Empty Predictions
# -----------------------------
empty_preds = df_merged[df_merged['aos_pred'].apply(lambda x: len(x) == 0)]
if not empty_preds.empty:
    print("\n⚠️ Warning: Some predictions are empty!")
    print(empty_preds[['text', 'aos_actual', 'aos_pred']].head())
# Save actual A-O-S triples
df_actual[['text', 'aos']].to_csv("/kaggle/working/actual_aos.csv", index=False)

# Save predicted A-O-S triples
df_pred[['text', 'aos']].to_csv("/kaggle/working/predicted_aos.csv", index=False)

print("\n✅ A-O-S files saved: 'actual_aos.csv' and 'predicted_aos.csv'")


##################################

Number of GPUs available: 1
GPU Name: Tesla P100-PCIE-16GB




config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Dataset columns: ['id', 'text', 'span', 'opinion', 'sentiment']


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Map:   0%|          | 0/2564 [00:00<?, ? examples/s]

Map:   0%|          | 0/641 [00:00<?, ? examples/s]

trainable params: 8388608 || all params: 6746804224 || trainable%: 0.12433454005023165
Starting training for 5 epochs on a single GPU...




Epoch,Training Loss,Validation Loss
1,0.262,0.226798
2,0.1944,0.225772
3,0.1431,0.239426
4,0.0974,0.272965
5,0.0676,0.303588


Training complete.
Merging LoRA into base weights...
✅ Full model saved to 'finetuned_model' with config.json, etc.
Loading merged model from 'finetuned_model'...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Merged model loaded successfully! Ready for inference.





Inference Result:
### Human: The food is fresh at a good price, and the place is clean and hygienic.My stay at Hotel Tranquil Haven was pleasant thanks to its serene location and friendly staff.Overall, this hotel is suitable for a peaceful retreat, but overall experience could be way better. ### Assistant: Aspect detected: food, place, staff, hotel, experience ## Opinion detected: good, clean, friendly, suitable, better ## Sentiment detected: positive, positive, positive, neutral, negative
Predictions saved to predicted_annotations.csv

=== Evaluation Metrics ===
True Positives (TP): 1578
True Negatives (TN): 1393112
False Positives (FP): 229
False Negatives (FN): 396
Precision: 0.8733 (TP / (TP + FP))
Recall: 0.7994 (TP / (TP + FN))
F1 Score: 0.8347 (2 * (Precision * Recall) / (Precision + Recall))
MCC: 0.8353 (((TP * TN) - (FP * FN)) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)))
Hamming Loss: 0.0004 ((FP + FN) / total samples)
False Discovery Rate: 0.1267 (FP / (FP + TP))

                

In [4]:
import os
from IPython.display import FileLink, display

model_dir = "finetuned_model"

# Iterate over each file in the directory
for filename in os.listdir(model_dir):
    file_path = os.path.join(model_dir, filename)
    # Create and display a download link for each file
    display(FileLink(file_path, result_html_prefix=f"👉 Download {filename}: "))
