In [1]:
import os
# Read the Hugging Face access token from the environment (None when the variable is
# unset) so that it does not have to be written into the notebook.
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

import pandas as pd
from unsloth import FastLanguageModel
import torch
from datasets import Dataset
from transformers import EarlyStoppingCallback
from trl import SFTTrainer
from transformers import TrainingArguments

### Data Prep

In [2]:
# Alpaca-style prompt template with three positional `{}` slots, filled in order with
# `.format(instruction, input_sentence, response)`. At inference time the response slot
# is filled with an empty string so that the model completes it.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [3]:
# Instruction text for the binary Scope 3 task (expected answers: yes / no). It fills the
# first slot of `alpaca_prompt` for both fine-tuning and inference, so its exact wording
# and whitespace are part of the prompt the model sees.
instruction_scope3 = '''
    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. 
    
    Scope 3 categories are:
    Purchased goods and services
    Capital goods
    Fuel and energy-related activities (not included in scope 1 or 2)
    Upstream transportation and distribution
    Waste generated in operations
    Business travel
    Employee commuting
    Upstream leased assets
    Downstream transportation and distribution
    Processing of sold products
    Use of sold products
    End-of-life treatment of sold products
    Downstream leased assets
    Franchises
    Investments
    
    Return only yes or no. Take a deep breath and answer.'''

# Instruction text for the four-way vagueness task (specific / ambiguous / generic /
# notESG). It fills the first slot of `alpaca_prompt` for both fine-tuning and inference,
# so its exact wording and whitespace are part of the prompt the model sees.
instruction_vagueness = '''
    You will be given sentences from corporate ESG reports. Your task is to classify each sentence into one of the following categories:

1. specific: The sentence describes specific and measurable actions, initiatives, or achievements related to ESG.

2. ambiguous: The sentence mentions ESG-related topics but lacks clear details or measurable commitments.

3. generic: The sentence contains generic statements or platitudes about ESG without specific details.

4. notESG: The sentence is not related to ESG topics.

For each sentence, respond with the only one of the four categories. Take a deep breath and answer.'''

# Initialize model

In [4]:

# Model-loading settings. `max_seq_length` is also passed to SFTTrainer further down.
max_seq_length = 1024
dtype = None
load_in_4bit = False

# NOTE: the checkpoint name ends in "-bnb-4bit" while load_in_4bit is False. According to
# the Unsloth warning printed by this cell, mistralai/Mistral-7B-Instruct-v0.2 is loaded
# instead of the 4-bit checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Credentials come from the environment (read in the first cell), never from a
    # literal in the notebook. The previously hard-coded token must be revoked.
    token = HUGGINGFACE_API_KEY,
)

Unsloth: You passed in `unsloth/mistral-7b-instruct-v0.2-bnb-4bit` which is a 4bit model, yet you set
`load_in_4bit = False`. We shall load `mistralai/Mistral-7B-Instruct-v0.2` instead.


==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# End-of-sequence marker taken from the tokenizer; formatting_prompts_func appends it to
# every training prompt.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Expand a batch of labelled sentences into Alpaca-formatted training prompts.

    Every sentence produces two prompts, in this order: the Scope 3 prompt, then the
    vagueness prompt. Each prompt ends with ``EOS_TOKEN``.

    Args:
        examples: Batch mapping with parallel ``"text"``, ``"scope3"`` and ``"vague"``
            sequences.

    Returns:
        ``{"text": prompts}`` holding two prompts per input row.
    """
    rows = zip(examples["text"], examples["scope3"], examples["vague"])
    prompts = []
    for sentence, scope3_label, vague_label in rows:
        prompts.extend((
            alpaca_prompt.format(instruction_scope3, sentence, scope3_label) + EOS_TOKEN,
            alpaca_prompt.format(instruction_vagueness, sentence, vague_label) + EOS_TOKEN,
        ))
    return {"text": prompts}


In [6]:
# Share of the labelled rows held out for validation.
VALID_FRACTION = 0.1

# Keep only the columns the prompts need.
labelled_data = pd.read_csv("train_data.csv")[['text', 'scope3', 'vague']]

# Positional split: the first VALID_FRACTION of the rows (in file order) is the validation
# set, the rest is the training set.
# NOTE(review): if the CSV is ordered by source or label, shuffle with a fixed seed first.
valid_set_size = int(len(labelled_data) * VALID_FRACTION)
valid_data = labelled_data.iloc[:valid_set_size]
train_data = labelled_data.iloc[valid_set_size:]

# preserve_index=False keeps the pandas index out of the dataset. The batched map below
# returns two rows per input row, so any leftover per-row column would break it.
train_dataset = Dataset.from_pandas(train_data, split="train", preserve_index=False)
train_dataset = train_dataset.map(formatting_prompts_func, batched=True, remove_columns=['scope3', 'vague'])

# The held-out rows are labelled as the validation split (they were tagged "train" before).
valid_dataset = Dataset.from_pandas(valid_data, split="validation", preserve_index=False)
valid_dataset = valid_dataset.map(formatting_prompts_func, batched=True, remove_columns=['scope3', 'vague'])

Map:   0%|          | 0/8776 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

In [7]:
# Inspect the first formatted training prompt (the Scope 3 prompt of the first sentence).
train_dataset['text'][0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. \n    \n    Scope 3 categories are:\n    Purchased goods and services\n    Capital goods\n    Fuel and energy-related activities (not included in scope 1 or 2)\n    Upstream transportation and distribution\n    Waste generated in operations\n    Business travel\n    Employee commuting\n    Upstream leased assets\n    Downstream transportation and distribution\n    Processing of sold products\n    Use of sold products\n    End-of-life treatment of sold products\n    Downstream leased assets\n    Franchises\n    Investments\n    \n    Return only yes or no. Take a deep breath and answer.\n\n### Input:\nScope: Primary fiber based packaging refers to products that are used to package guest food 

In [8]:
# Inspect the second formatted prompt (the vagueness prompt of the same first sentence).
train_dataset['text'][1]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    You will be given sentences from corporate ESG reports. Your task is to classify each sentence into one of the following categories:\n\n1. specific: The sentence describes specific and measurable actions, initiatives, or achievements related to ESG.\n\n2. ambiguous: The sentence mentions ESG-related topics but lacks clear details or measurable commitments.\n\n3. generic: The sentence contains generic statements or platitudes about ESG without specific details.\n\n4. notESG: The sentence is not related to ESG topics.\n\nFor each sentence, respond with the only one of the four categories. Take a deep breath and answer.\n\n### Input:\nScope: Primary fiber based packaging refers to products that are used to package guest food on premises at McDonald’s restaurants.\n\n\n### Response:\nnotESG</s>'

# LoRA adapters and training

In [9]:
# We now add LoRA adapters so we only need to update 1 to 10% of all parameters!
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_settings = dict(
    r = 16,
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
    use_rslora = True,
    loftq_config = None,
)
# Rebind `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
# SFTTrainer, TrainingArguments and EarlyStoppingCallback are imported in the first cell.

# Probe bf16 support once; fp16 is the fallback when bf16 is unavailable.
use_bf16 = torch.cuda.is_bf16_supported()

# Supervised fine-tuning on the formatted "text" column, evaluating and checkpointing
# every 60 steps so that early stopping and best-checkpoint reloading can work.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Packing can speed up training on short sequences; left off here.
    # Stop when the monitored metric has not improved for 5 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 2, # effective batch size: 8 * 2 = 16
        warmup_steps = 10,
        num_train_epochs = 3,
        learning_rate = 2e-5,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 30,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        eval_accumulation_steps = 4,
        evaluation_strategy = "steps",
        eval_steps = 60,
        save_strategy = "steps",
        save_steps = 60, # kept equal to eval_steps, as load_best_model_at_end requires
        load_best_model_at_end = True,
        # Spell out the metric that early stopping and best-checkpoint selection rely on
        # (these are the library defaults when load_best_model_at_end is set).
        metric_for_best_model = "eval_loss",
        greater_is_better = False,
    ),
)

Map (num_proc=2):   0%|          | 0/17552 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1950 [00:00<?, ? examples/s]

In [11]:
# Record the GPU's total memory and the memory reserved before training starts; the
# summary cell after training reads `start_gpu_memory` and `max_memory`.
BYTES_PER_GIB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes_before_training = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes_before_training / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.684 GB.
14.162 GB of memory reserved.


In [12]:
# Run fine-tuning without resuming from a saved checkpoint. The returned object's
# `metrics['train_runtime']` is read by the summary cell below.
trainer_stats = trainer.train(resume_from_checkpoint = False)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 17,552 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 3,291
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
60,0.3991,0.381917
120,0.3494,0.348235
180,0.3407,0.344556
240,0.3436,0.340615
300,0.332,0.33835
360,0.3293,0.336426
420,0.335,0.334411
480,0.3268,0.333029
540,0.3293,0.331986
600,0.3197,0.331247



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://h

In [13]:
#@title Show final memory and time stats
# Peak reserved memory after training, compared with the pre-training snapshot
# (`start_gpu_memory`) and the device total (`max_memory`).
GIB = 1024 ** 3
used_memory = round(torch.cuda.max_memory_reserved() / GIB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
train_seconds = trainer_stats.metrics['train_runtime']
summary_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for line in summary_lines:
    print(line)

7621.603 seconds used for training.
127.03 minutes used for training.
Peak reserved memory = 16.168 GB.
Peak reserved memory for training = 2.006 GB.
Peak reserved memory % of max memory = 68.265 %.
Peak reserved memory for training % of max memory = 8.47 %.


### Saving, loading finetuned models

In [14]:
# Save the fine-tuned model to the local "lora_model" directory.
# NOTE(review): presumably this writes only the LoRA adapter weights, since `model` is the
# adapter-wrapped model — confirm.
model.save_pretrained("lora_model", ) # Local saving
# model.push_to_hub("hf_repo/lora_model", token = "hf_") # Online saving


Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.


In [15]:
# Export a merged 16-bit copy (save_method = "merged_16bit") together with the tokenizer
# to the "model" directory; the inference section below loads from "model/".
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 135.83 out of 187.86 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 95.36it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


### Inference

In [4]:
# initialize model
max_seq_length = 1024
dtype = None
load_in_4bit = False

# Load the merged fine-tuned checkpoint that the saving section wrote to "model/".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "model/",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Credentials come from the environment (read in the first cell), never from a
    # literal in the notebook. The previously hard-coded token must be revoked.
    token = HUGGINGFACE_API_KEY,
)

==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Prepare the loaded model for generation via Unsloth's `for_inference` helper.
FastLanguageModel.for_inference(model) 

def get_inference(text, instruction, max_new_tokens = 10):
    """Generate the model's answer for one sentence under the given instruction.

    Args:
        text: Sentence to classify; fills the input slot of ``alpaca_prompt``.
        instruction: Task description; fills the instruction slot of ``alpaca_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens (default 10,
            the value that was previously hard-coded).

    Returns:
        The decoded continuation only: the prompt tokens are removed, special tokens
        are skipped and surrounding whitespace is stripped.
    """
    input_prompt = alpaca_prompt.format(instruction, text, "")
    input_token = tokenizer(input_prompt, return_tensors = "pt").to('cuda')

    output = model.generate(
        **input_token,
        max_new_tokens = max_new_tokens,
        use_cache = True,
        pad_token_id = tokenizer.eos_token_id,
    )

    # Keep everything generated after the prompt. The earlier `[prompt_len:-1]` slice
    # assumed the final token was always EOS and so dropped a real token whenever
    # generation stopped at `max_new_tokens`. `skip_special_tokens` already removes EOS.
    prompt_length = input_token['input_ids'].shape[1]
    generated_tokens = output[0][prompt_length:]
    output_text = tokenizer.decode(generated_tokens, skip_special_tokens = True)
    # Labels are compared by exact string match downstream, so trim stray whitespace.
    return output_text.strip()

In [6]:
from tqdm import tqdm

In [10]:
# Run both classification tasks over every sentence of the full report corpus
# (Scope 3 first, then vagueness). One model call per sentence per task.
test_df = pd.read_csv("CS685_final_project/src/report_prediction/all_reports.csv")
texts = test_df['text'].tolist()

predictions = {
    task: [get_inference(sentence, instruction) for sentence in tqdm(texts)]
    for task, instruction in (
        ('scope3', instruction_scope3),
        ('vague', instruction_vagueness),
    )
}
scope3_outputs = predictions['scope3']
vague_outputs = predictions['vague']

100%|██████████| 105596/105596 [3:08:25<00:00,  9.34it/s] 
100%|██████████| 105596/105596 [3:40:20<00:00,  7.99it/s] 


In [11]:
# Attach both prediction lists to the frame as new columns.
test_df = test_df.assign(
    predicted_scope3 = scope3_outputs,
    predicted_vague = vague_outputs,
)

In [12]:
# Write the frame, including the prediction columns, to CSV without the index column.
test_df.to_csv('CS685_final_project/src/report_prediction/report_prediction.csv', index=False)

In [7]:
# Classify the labelled test sentences with both tasks (Scope 3 first, then vagueness)
# and store the predictions next to the gold labels for evaluation.
test_df = pd.read_csv('test_data.csv')
texts = test_df['text'].tolist()

predictions = {
    column: [get_inference(sentence, instruction) for sentence in tqdm(texts)]
    for column, instruction in (
        ('predicted_scope3', instruction_scope3),
        ('predicted_vague', instruction_vagueness),
    )
}
scope3_outputs = predictions['predicted_scope3']
vague_outputs = predictions['predicted_vague']

test_df['predicted_scope3'] = scope3_outputs
test_df['predicted_vague'] = vague_outputs

100%|██████████| 1083/1083 [01:54<00:00,  9.50it/s]
100%|██████████| 1083/1083 [02:12<00:00,  8.19it/s]


In [8]:
import pandas as pd

In [9]:
def calculate_metrics(y_pred, y_true, class_mapping):
    """Summarise one-vs-rest accuracy, precision and recall for each class label.

    Args:
        y_pred: Predicted labels, as a pandas Series or any plain sequence of labels.
        y_true: Ground-truth labels, parallel to ``y_pred``.
        class_mapping: Iterable of the class labels to report on.

    Returns:
        One string containing a segment
        ``Accuracy_<c>: x | Precision_<c>: x | Recall_<c>: x|`` per class, with the
        segments joined by a single space. Any ratio whose denominator is zero
        (for example on empty input) is reported as 0.
    """
    # Plain lists/tuples do not support element-wise ``==``; wrap them so they behave like
    # the Series the notebook passes in. Series inputs are used unchanged.
    if not isinstance(y_pred, pd.Series):
        y_pred = pd.Series(list(y_pred), dtype=object)
    if not isinstance(y_true, pd.Series):
        y_true = pd.Series(list(y_true), dtype=object)

    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0 instead of NaN.
        return numerator / denominator if denominator != 0 else 0

    segments = []
    for class_name in class_mapping:
        # Confusion-matrix counts for "class_name versus everything else".
        true_positives = int(((y_pred == class_name) & (y_true == class_name)).sum())
        true_negatives = int(((y_pred != class_name) & (y_true != class_name)).sum())
        false_positives = int(((y_pred == class_name) & (y_true != class_name)).sum())
        false_negatives = int(((y_pred != class_name) & (y_true == class_name)).sum())

        total = true_positives + true_negatives + false_positives + false_negatives
        accuracy = _ratio(true_positives + true_negatives, total)
        precision = _ratio(true_positives, true_positives + false_positives)
        recall = _ratio(true_positives, true_positives + false_negatives)

        segments.append(f'Accuracy_{class_name}: {accuracy:.4f} | Precision_{class_name}: {precision:.4f} | Recall_{class_name}: {recall:.4f}|')

    return " ".join(segments)

In [10]:
# Per-class metrics for the Scope 3 predictions against the gold `scope3` column.
calculate_metrics(test_df['predicted_scope3'], test_df['scope3'], ['yes', 'no'])

'Accuracy_yes: 0.9575 | Precision_yes: 0.6000 | Recall_yes: 0.6610| Accuracy_no: 0.9575 | Precision_no: 0.9804 | Recall_no: 0.9746|'

In [11]:
# Per-class metrics for the vagueness predictions against the gold `vague` column.
calculate_metrics(test_df['predicted_vague'], test_df['vague'],['specific', 'generic', 'ambiguous', 'notESG'])

'Accuracy_specific: 0.8366 | Precision_specific: 0.6018 | Recall_specific: 0.5991| Accuracy_generic: 0.8523 | Precision_generic: 0.4621 | Recall_generic: 0.4497| Accuracy_ambiguous: 0.8366 | Precision_ambiguous: 0.6596 | Recall_ambiguous: 0.4189| Accuracy_notESG: 0.7876 | Precision_notESG: 0.7257 | Recall_notESG: 0.8531|'

# Baseline

In [None]:
# initialize model
max_seq_length = 1024
dtype = None
load_in_4bit = False

# Baseline: the base checkpoint without the fine-tuned weights.
# NOTE: the checkpoint name ends in "-bnb-4bit" while load_in_4bit is False. According to
# the Unsloth warning printed by this cell, mistralai/Mistral-7B-Instruct-v0.2 is loaded
# instead of the 4-bit checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Credentials come from the environment (read in the first cell), never from a
    # literal in the notebook. The previously hard-coded token must be revoked.
    token = HUGGINGFACE_API_KEY,
)

Unsloth: You passed in `unsloth/mistral-7b-instruct-v0.2-bnb-4bit` which is a 4bit model, yet you set
`load_in_4bit = False`. We shall load `mistralai/Mistral-7B-Instruct-v0.2` instead.


==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Build the baseline prompts for every test sentence, one list per task. The response
# slot is left empty for the model to complete.
test_df = pd.read_csv("test_data.csv")
texts = test_df['text'].tolist()

inputs_scope3, inputs_vague = (
    [alpaca_prompt.format(instruction, sentence, "") for sentence in texts]
    for instruction in (instruction_scope3, instruction_vagueness)
)

# Only the first prompt of each task is tokenized here.
first_scope3_prompt = inputs_scope3[0]
inputs_scope3_tokens = tokenizer(first_scope3_prompt, return_tensors = "pt").to('cuda')
first_vague_prompt = inputs_vague[0]
inputs_vague_tokens = tokenizer(first_vague_prompt, return_tensors = "pt").to('cuda')


In [None]:
# Inspect the first baseline prompt of each task.
inputs_scope3[0], inputs_vague[0]

('Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. \n    \n    Scope 3 categories are:\n    Purchased goods and services\n    Capital goods\n    Fuel and energy-related activities (not included in scope 1 or 2)\n    Upstream transportation and distribution\n    Waste generated in operations\n    Business travel\n    Employee commuting\n    Upstream leased assets\n    Downstream transportation and distribution\n    Processing of sold products\n    Use of sold products\n    End-of-life treatment of sold products\n    Downstream leased assets\n    Franchises\n    Investments\n    \n    Return only yes or no. Take a deep breath and answer.\n\n### Input:\nThe Company gives preference to power from renewable sources and has developed a range of d

In [None]:
# Prepare the baseline model for generation via Unsloth's `for_inference` helper.
FastLanguageModel.for_inference(model)

# Same generation settings as get_inference. Passing pad_token_id explicitly avoids the
# "Setting `pad_token_id` to `eos_token_id`" warning without changing the result.
generation_kwargs = dict(
    max_new_tokens = 10,
    use_cache = True,
    pad_token_id = tokenizer.eos_token_id,
)
scope3_output_ids = model.generate(**inputs_scope3_tokens, **generation_kwargs)
vague_output_ids = model.generate(**inputs_vague_tokens, **generation_kwargs)

# batch_decode returns one string per generated sequence. Each string still contains the
# prompt, because the full output sequence is decoded.
scope3_outputs = tokenizer.batch_decode(scope3_output_ids)
vague_outputs = tokenizer.batch_decode(vague_output_ids)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# Inspect the decoded baseline generations for the first example of each task.
scope3_outputs, vague_outputs

(['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. \n    \n    Scope 3 categories are:\n    Purchased goods and services\n    Capital goods\n    Fuel and energy-related activities (not included in scope 1 or 2)\n    Upstream transportation and distribution\n    Waste generated in operations\n    Business travel\n    Employee commuting\n    Upstream leased assets\n    Downstream transportation and distribution\n    Processing of sold products\n    Use of sold products\n    End-of-life treatment of sold products\n    Downstream leased assets\n    Franchises\n    Investments\n    \n    Return only yes or no. Take a deep breath and answer.\n\n### Input:\nThe Company gives preference to power from renewable sources and has developed a range