In [50]:
import pandas as pd
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.model_selection import train_test_split

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
cce_df = pd.read_csv("./data/finecite/full_data.csv")

In [10]:
def label_mapping(label):
    if label == 1: return 'information'
    if label == 2: return 'perception'
    if label == 3: return 'background'

# Process the DataFrame
results = []
for index, row in cce_df.iterrows():
    # Clean the paragraph by replacing <ref> tags with '[TREF]'
    clean_paragraph = re.sub(r'<ref.*?>.*?</ref>', '[TREF]', row["paragraph"])

    # Split the cleaned paragraph into words using ';' as the delimiter
    words = clean_paragraph.split(';')

    # Process the context_location1 list
    context_location1 = eval(row["context_location1"])

    # Check if the lengths match, and map the context_location1 to the words
    if len(context_location1) == len(words):
        # Aggregate the mapped results for the current row
        mapped_result = list(zip(context_location1, words))
        
        # Separate the numbers and words into separate lists
        numbers = [item[0] for item in mapped_result]  # Convert numbers to strings
        mapped_words = [item[1].strip() for item in mapped_result]  # Strip extra spaces from words

        sem_structured_context = []
        staged_context = []
        prev_label = 0
        for label, context in mapped_result:
            if label != prev_label and prev_label != 0:
                sem_structured_context.append({
                    "label": label_mapping(prev_label),
                    "context": ' '.join(staged_context)
                })
                staged_context = []
            if label != 0: staged_context.append(context)
            prev_label = label
                
            
        
        
        results.append({
            "Paragraph": ' '.join(mapped_words),
            "Scope": numbers,
            "Sem_struc_context": str(sem_structured_context)
        })
    else:
        results.append({
            "Paragraph": "Length of context_location1 and words don't match",
            "Scope": "Mismatch",
            "Sem_struc_context": 'Mismatch'
        })

# Convert results to DataFrame
df = pd.DataFrame(results)
df

Unnamed: 0,Paragraph,Scope,Sem_struc_context
0,Neural Machine Translation (NMT) has opened se...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[{'label': 'background', 'context': 'Neural Ma..."
1,"As shown in Table 1, the size of the 'in-domai...","[0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[{'label': 'background', 'context': ""the size ..."
2,Automatic extraction of events has gained siza...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[{'label': 'perception', 'context': 'Automatic..."
3,The subject NP 'Bill' is coindexed with the tr...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[{'label': 'background', 'context': ""The subje..."
4,Self-training [TREF] ) uses a source-to-target...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[{'label': 'information', 'context': 'Self-tra..."
...,...,...,...
1050,"For the final-stage neural reranker, we experi...","[0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, ...","[{'label': 'background', 'context': 'BERT-larg..."
1051,Trained on 20GB texts of both Vietnamese news ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[{'label': 'background', 'context': 'ViBERT wa..."
1052,Pretraining Corpus: Following the E2E pretrain...,"[0, 0, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, ...","[{'label': 'perception', 'context': 'Following..."
1053,The nouns are organized as an inheritance syst...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 3, 3, ...","[{'label': 'information', 'context': 'The noun..."


In [29]:
model_id = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

LMmodel = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map = 'auto'
)

peft_config = LoraConfig(target_modules=[ "v_proj", "q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj" ], inference_mode=False, r=4, lora_alpha=32, lora_dropout=0.1)

LMmodel = get_peft_model(LMmodel, peft_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [48]:
# Tokenize and prepare dataset
def tokenize_function(examples):
    # Tokenize the paragraphs
    tokens = tokenizer(examples["Paragraph"], padding="max_length", truncation=True, max_length=512)
    
    # Convert to tensor
    tokens["labels"] = torch.tensor(tokenizer(examples['Sem_struc_context'], padding="max_length", truncation=True, max_length=512).input_ids, dtype=torch.long)
    
    return tokens

train_df, test_df = train_test_split(df, test_size=0.2, random_state=96, shuffle=True)

# Convert the DataFrame to a Dataset
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# Apply the tokenization function to the dataset
train_ds = train_ds.map(
    tokenize_function, 
    batched=True, 
    remove_columns=train_ds.column_names  # Remove all original columns
)

test_ds = test_ds.map(
    tokenize_function, 
    batched=True, 
    remove_columns=test_ds.column_names  # Remove all original columns
)

Map:   0%|          | 0/844 [00:00<?, ? examples/s]

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

In [52]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=LMmodel,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

trainer.train()


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU  has a total capacity of 11.76 GiB of which 33.81 MiB is free. Process 401495 has 2.86 GiB memory in use. Including non-PyTorch memory, this process has 8.84 GiB memory in use. Of the allocated memory 8.46 GiB is allocated by PyTorch, and 80.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)