In [3]:
import os
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

import pandas as pd
from unsloth import FastLanguageModel
import torch
from datasets import Dataset

### Data Prep

In [4]:
# Alpaca-style prompt template with three positional `{}` slots, filled in
# order: instruction, input sentence, response. The response slot receives the
# label when training text is built and "" when prompting for generation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [5]:
# Instruction text for the binary Scope 3 task (the prompt asks for yes / no).
# NOTE: this exact string is used both to build the training prompts and at
# inference time, so editing it (even whitespace) changes the prompt the
# fine-tuned model sees.
instruction_scope3 = '''
    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. 
    
    Scope 3 categories are:
    Purchased goods and services
    Capital goods
    Fuel and energy-related activities (not included in scope 1 or 2)
    Upstream transportation and distribution
    Waste generated in operations
    Business travel
    Employee commuting
    Upstream leased assets
    Downstream transportation and distribution
    Processing of sold products
    Use of sold products
    End-of-life treatment of sold products
    Downstream leased assets
    Franchises
    Investments
    
    Return only yes or no. Take a deep breath and answer.'''

# Instruction text for the four-way vagueness task
# (specific / ambiguous / generic / notESG).
# NOTE: this exact string is used both to build the training prompts and at
# inference time, so editing it (even wording fixes) changes the prompt the
# fine-tuned model sees.
instruction_vagueness = '''
    You will be given sentences from corporate ESG reports. Your task is to classify each sentence into one of the following categories:

1. specific: The sentence describes specific and measurable actions, initiatives, or achievements related to ESG.

2. ambiguous: The sentence mentions ESG-related topics but lacks clear details or measurable commitments.

3. generic: The sentence contains generic statements or platitudes about ESG without specific details.

4. notESG: The sentence is not related to ESG topics.

For each sentence, respond with the only one of the four categories. Take a deep breath and answer.'''

# initialize model

In [None]:

# Load the base model and tokenizer that will be fine-tuned.
max_seq_length = 1024   # sequence-length cap, also passed to the SFT trainer
dtype = None            # left unset; per Unsloth docs None means auto-detect
load_in_4bit = False

# NOTE(review): the repo name is a bnb-4bit checkpoint but load_in_4bit is False;
# Unsloth warns that it then loads `mistralai/Mistral-7B-Instruct-v0.2` instead.
# That repo is gated, so a valid token is required.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Never hardcode credentials in a notebook: read the token from the
    # HUGGINGFACE_API_KEY environment variable loaded in the first cell.
    token = HUGGINGFACE_API_KEY,
)

Unsloth: You passed in `unsloth/mistral-7b-instruct-v0.2-bnb-4bit` which is a 4bit model, yet you set
`load_in_4bit = False`. We shall load `mistralai/Mistral-7B-Instruct-v0.2` instead.


==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
EOS_TOKEN = tokenizer.eos_token  # end-of-sequence marker appended to every training text

def formatting_prompts_func(examples):
    """Expand a batch of labelled sentences into Alpaca-style training texts.

    Args:
        examples: Batch mapping with parallel "text", "scope3" and "vague"
            columns.

    Returns:
        A dict with a single "text" column holding two prompts per input
        sentence, in order: the Scope 3 prompt, then the vagueness prompt.
        Each prompt ends with the tokenizer's EOS token.
    """
    prompts = []
    rows = zip(examples["text"], examples["scope3"], examples["vague"])
    for sentence, scope3_label, vague_label in rows:
        # Same sentence, one training example per task.
        tasks = (
            (instruction_scope3, scope3_label),
            (instruction_vagueness, vague_label),
        )
        for instruction, label in tasks:
            prompts.append(alpaca_prompt.format(instruction, sentence, label) + EOS_TOKEN)
    return {"text": prompts}


Map:   0%|          | 0/9751 [00:00<?, ? examples/s]

In [None]:
# Load the labelled sentences and keep only the columns the prompts need.
train_data = pd.read_csv("train_data.csv")[['text', 'scope3', 'vague']]

# Every row becomes two prompts (one per task), so the label columns are
# dropped and the "text" column is replaced by the formatted prompts.
dataset = Dataset.from_pandas(train_data, split="train").map(
    formatting_prompts_func,
    batched=True,
    remove_columns=['scope3', 'vague'],
)

In [None]:
# Inspect the first formatted example (the Scope 3 prompt for the first sentence).
dataset['text'][0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. \n    \n    Scope 3 categories are:\n    Purchased goods and services\n    Capital goods\n    Fuel and energy-related activities (not included in scope 1 or 2)\n    Upstream transportation and distribution\n    Waste generated in operations\n    Business travel\n    Employee commuting\n    Upstream leased assets\n    Downstream transportation and distribution\n    Processing of sold products\n    Use of sold products\n    End-of-life treatment of sold products\n    Downstream leased assets\n    Franchises\n    Investments\n    \n    Return only yes or no. Take a deep breath and answer.\n\n### Input:\nMaterials used in Product In FY22, NIKE used Higg MSI version 3.5 as the raw process data so

In [None]:
# Inspect the second formatted example (the vagueness prompt for the same sentence).
dataset['text'][1]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    You will be given sentences from corporate ESG reports. Your task is to classify each sentence into one of the following categories:\n\n1. specific: The sentence describes specific and measurable actions, initiatives, or achievements related to ESG.\n\n2. ambiguous: The sentence mentions ESG-related topics but lacks clear details or measurable commitments.\n\n3. generic: The sentence contains generic statements or platitudes about ESG without specific details.\n\n4. notESG: The sentence is not related to ESG topics.\n\nFor each sentence, respond with the only one of the four categories. Take a deep breath and answer.\n\n### Input:\nMaterials used in Product In FY22, NIKE used Higg MSI version 3.5 as the raw process data source and mapped Higg processes to NIKE material attributes to calculate impacts on

# LoRA Adapters and Training

In [3]:
# Wrap the base model with LoRA adapters so that only a small fraction
# (roughly 1 to 10%) of the parameters is trained.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules = attention_projections + mlp_projections,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    use_rslora = True,
    bias = "none",
    loftq_config = None,
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Use bf16 where the GPU supports it and fall back to fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

# 4 sequences per step x 4 accumulation steps = effective batch size of 16.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs = 3,  # use max_steps instead for a quick smoke run
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = bf16_available,
    fp16 = not bf16_available,
    logging_steps = 1,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # packing can make training faster for short sequences
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/19502 [00:00<?, ? examples/s]

In [8]:
# Record GPU capacity and the memory already reserved before training, so the
# amount added by training can be reported afterwards.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.684 GB.
14.162 GB of memory reserved.


In [9]:
# Resume only when an earlier run left a checkpoint behind. Passing
# resume_from_checkpoint=True unconditionally makes a fresh run fail, because
# the Trainer raises when the output directory holds no checkpoint.
checkpoint_root = trainer.args.output_dir
has_checkpoint = os.path.isdir(checkpoint_root) and any(
    entry.startswith("checkpoint-")
    and os.path.isdir(os.path.join(checkpoint_root, entry))
    for entry in os.listdir(checkpoint_root)
)

trainer_stats = trainer.train(resume_from_checkpoint = has_checkpoint)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19,502 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 3,657
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.2569
2,2.253
3,1.6948
4,1.1573
5,0.867
6,0.5962
7,0.4981
8,0.4416
9,0.4366
10,0.4518



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.

Cannot access gated repo for url https://h

In [10]:
#@title Show final memory and time stats
# Compare peak reserved GPU memory with the pre-training baseline and report
# the wall-clock training time taken from the trainer's metrics.
train_seconds = trainer_stats.metrics['train_runtime']

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

13168.5079 seconds used for training.
219.48 minutes used for training.
Peak reserved memory = 15.289 GB.
Peak reserved memory for training = 1.127 GB.
Peak reserved memory % of max memory = 64.554 %.
Peak reserved memory for training % of max memory = 4.758 %.


### Saving and loading fine-tuned models

In [11]:
# Save the PEFT-wrapped model locally, together with the tokenizer, so the
# "lora_model" directory can be reloaded on its own.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# To publish instead, push with a token read from the environment (never paste
# a literal token into the notebook):
# model.push_to_hub("hf_repo/lora_model", token = HUGGINGFACE_API_KEY)


Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.2.


In [12]:
# Merge the LoRA weights into the base model and save the result, along with
# the tokenizer, to the "model" directory in 16-bit form.
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 137.42 out of 187.86 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 99.18it/s] 


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


### Inference

In [6]:
# Load the merged fine-tuned model saved locally under "model/".
max_seq_length = 1024
dtype = None            # left unset; per Unsloth docs None means auto-detect
load_in_4bit = False

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "model/",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Never hardcode credentials in a notebook: read the token from the
    # HUGGINGFACE_API_KEY environment variable loaded in the first cell.
    token = HUGGINGFACE_API_KEY,
)

==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Put the loaded model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) 

def get_inference(text, instruction):
    """Generate the model's short answer for one sentence under one instruction.

    Args:
        text: Sentence to classify; placed in the prompt's Input slot.
        instruction: Task description placed in the prompt's Instruction slot.

    Returns:
        The newly generated text only (prompt excluded, special tokens
        removed, surrounding whitespace stripped).
    """
    input_prompt = alpaca_prompt.format(instruction, text, "")
    input_token = tokenizer(input_prompt, return_tensors = "pt").to('cuda')

    output = model.generate(**input_token, max_new_tokens = 10, use_cache = True, pad_token_id=tokenizer.eos_token_id)

    # Keep everything generated after the prompt. Do not slice off the final
    # token: when generation stops at max_new_tokens there is no trailing EOS,
    # so a `:-1` slice would drop a real token. skip_special_tokens already
    # removes EOS when it is present.
    prompt_length = input_token['input_ids'].shape[1]
    output_tokens_minus_input = output[0][prompt_length:]
    output_text = tokenizer.decode(output_tokens_minus_input, skip_special_tokens = True)
    # Strip so labels compare cleanly against the reference columns.
    return output_text.strip()

In [8]:
# Load the held-out sentences and run the fine-tuned model on each one:
# a full pass for the Scope 3 task, then a full pass for the vagueness task.
test_df = pd.read_csv("test_data.csv")
texts = test_df['text'].tolist()

scope3_outputs, vague_outputs = [], []
for sentence in texts:
    scope3_outputs.append(get_inference(sentence, instruction_scope3))
for sentence in texts:
    vague_outputs.append(get_inference(sentence, instruction_vagueness))

In [9]:
# Attach the model's answers to the test frame, one column per task.
prediction_columns = (
    ('predicted_scope3', scope3_outputs),
    ('predicted_vague', vague_outputs),
)
for column_name, predictions in prediction_columns:
    test_df[column_name] = predictions

In [10]:
# Confusion matrix for Scope 3: rows are the labels from test_data.csv,
# columns are the model's predictions, cells count sentences.
test_df.pivot_table(index = 'scope3', columns = 'predicted_scope3', values = 'text', aggfunc = 'count')

predicted_scope3,no,yes
scope3,Unnamed: 1_level_1,Unnamed: 2_level_1
no,1005,19
yes,22,37


In [11]:
# Confusion matrix for vagueness: rows are the labels from test_data.csv,
# columns are the model's predictions, cells count sentences.
test_df.pivot_table(index = 'vague', columns = 'predicted_vague', values = 'text', aggfunc = 'count')

predicted_vague,ambiguous,generic,notESG,specific
vague,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ambiguous,130,31,34,27
generic,36,62,38,13
notESG,25,22,413,30
specific,44,7,49,122


In [12]:
# Preview the first rows of the test frame, including the prediction columns.
test_df.head()

Unnamed: 0,text,gpt_responses,scope3,vague,file_name,predicted_scope3,predicted_vague
0,The Company gives preference to power from ren...,"{'scope3': 'no', 'vague': 'generic'}",no,generic,NYSE_TTE_2022_results.csv,no,ambiguous
1,"In FY22, NIKE and the Nike Foundation provided...","{'scope3': 'no', 'vague': 'specific'}",no,specific,NYSE_NKE_2022_results.csv,no,specific
2,Process safety events can potentially impact t...,"{'scope3': 'no', 'vague': 'ambiguous'}",no,ambiguous,NYSE_XOM_2022_results.csv,no,notESG
3,In addition to the circular production process...,"{'scope3': 'no', 'vague': 'generic'}",no,generic,NYSE_TTE_2022_results.csv,no,generic
4,Foundational to our human rights approach is t...,"{'scope3': 'no', 'vague': 'specific'}",no,specific,NYSE_MCD_2022_results.csv,no,specific


In [14]:
# Sample of sentences labelled 'specific' that the model classified as something else.
missed_specific = (test_df['vague'] == 'specific') & (test_df['predicted_vague'] != 'specific')
test_df.loc[missed_specific, 'text'].tolist()[10:20]

['Transparent processes and systems to help ensure suppliers uphold standards on environmental, social, and ethical business practice issues.\n',
 'The Committee assists our Board in overseeing the company’s environmental, social, legislative, regulatory and public policy matters, including progress against our science based emissions reduction target.\n',
 'Finally, we assess how chemicals are used, including the volumes, types of application and availability of safe alternatives, to help us set realistic targets for the future.\n',
 'We continue to progress toward net zero emissions globally by 2050, transforming our business to be more resilient.\n',
 'Median earnings for hourly employees is based on total earnings for total hours paid and includes the impact of overtime, shift premiums, and gratuities.\n',
 'In fiscal 2022, more than 140 million ( 60%) of our charitable giving was directed to programs that support Asian American and Pacific Islander, Black and African American, His

In [15]:
# Sample of sentences labelled 'no' for Scope 3 that the model did not answer 'no' for.
not_predicted_no = (test_df['scope3'] == 'no') & (test_df['predicted_scope3'] != 'no')
test_df.loc[not_predicted_no, 'text'].tolist()[10:20]

['Based on this, we recognize action is needed throughout our value chain to help address the complex threat of climate change.\n',
 'In total, more than 160 of our suppliers have set or committed to setting SBTi approved emissions targets.\n',
 'We are engaging with suppliers and educating both accommodation providers and customers on what sustainable travel means.\n',
 'This mapping, which we aim to complete by the end of 2025, will help determine how we engage with suppliers to drive continuous improvement.\n',
 'In the near term, NIKE intends to focus on cotton and polyester supply chains to reduce our dependence on higher impact versions of important fabrics.\n',
 'Engaging with our value chain and policies as a crtitical tool for achieving outcomes.\n',
 'In partnership with global suppliers and an external third party facilitator, we are in the process of concluding an extensive, two year commercial trials initiative to inform our KWIs target setting efforts and to study the imp

In [44]:
# Scope 3 accuracy: share of sentences whose prediction matches the label.
# Uses the `predicted_scope3` column created above; the previous `new_scope3`
# column is not created by any cell in this notebook.
(test_df["scope3"] == test_df["predicted_scope3"]).mean()

0.9621421975992613

# Baseline

In [4]:
# Load the base model again, without fine-tuning, as the baseline.
max_seq_length = 1024
dtype = None            # left unset; per Unsloth docs None means auto-detect
load_in_4bit = False

# NOTE(review): the repo name is a bnb-4bit checkpoint but load_in_4bit is False;
# Unsloth warns that it then loads `mistralai/Mistral-7B-Instruct-v0.2` instead.
# That repo is gated, so a valid token is required.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Never hardcode credentials in a notebook: read the token from the
    # HUGGINGFACE_API_KEY environment variable loaded in the first cell.
    token = HUGGINGFACE_API_KEY,
)

Unsloth: You passed in `unsloth/mistral-7b-instruct-v0.2-bnb-4bit` which is a 4bit model, yet you set
`load_in_4bit = False`. We shall load `mistralai/Mistral-7B-Instruct-v0.2` instead.


==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Reload the test sentences and build one prompt per sentence for each task.
test_df = pd.read_csv("test_data.csv")
texts = test_df['text'].tolist()

inputs_scope3, inputs_vague = [], []
for sentence in texts:
    inputs_scope3.append(alpaca_prompt.format(instruction_scope3, sentence, ""))
    inputs_vague.append(alpaca_prompt.format(instruction_vagueness, sentence, ""))

# Only the first prompt of each task is tokenized for the baseline spot check.
first_scope3_prompt, first_vague_prompt = inputs_scope3[0], inputs_vague[0]
inputs_scope3_tokens = tokenizer(first_scope3_prompt, return_tensors = "pt").to('cuda')
inputs_vague_tokens = tokenizer(first_vague_prompt, return_tensors = "pt").to('cuda')


In [10]:
# Inspect the first prompt built for each task.
inputs_scope3[0], inputs_vague[0]

('Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. \n    \n    Scope 3 categories are:\n    Purchased goods and services\n    Capital goods\n    Fuel and energy-related activities (not included in scope 1 or 2)\n    Upstream transportation and distribution\n    Waste generated in operations\n    Business travel\n    Employee commuting\n    Upstream leased assets\n    Downstream transportation and distribution\n    Processing of sold products\n    Use of sold products\n    End-of-life treatment of sold products\n    Downstream leased assets\n    Franchises\n    Investments\n    \n    Return only yes or no. Take a deep breath and answer.\n\n### Input:\nThe Company gives preference to power from renewable sources and has developed a range of d

In [11]:
# Put the baseline model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Pass pad_token_id explicitly, as get_inference does, so generate() does not
# fall back to EOS with a warning on every call.
scope3_generated = model.generate(
    **inputs_scope3_tokens,
    max_new_tokens = 10,
    use_cache = True,
    pad_token_id = tokenizer.eos_token_id,
)
vague_generated = model.generate(
    **inputs_vague_tokens,
    max_new_tokens = 10,
    use_cache = True,
    pad_token_id = tokenizer.eos_token_id,
)

# Decode the full sequences (prompt plus continuation). The token tensors keep
# their own names so the *_outputs names only ever hold decoded strings.
scope3_outputs = tokenizer.batch_decode(scope3_generated)
vague_outputs = tokenizer.batch_decode(vague_generated)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [12]:
# Show the decoded baseline generations (each includes the prompt and the continuation).
scope3_outputs, vague_outputs

(['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\n    Below is a sentence from a corporate ESG report. Classify if the given sentence is about Scope3 emissions or not. \n    \n    Scope 3 categories are:\n    Purchased goods and services\n    Capital goods\n    Fuel and energy-related activities (not included in scope 1 or 2)\n    Upstream transportation and distribution\n    Waste generated in operations\n    Business travel\n    Employee commuting\n    Upstream leased assets\n    Downstream transportation and distribution\n    Processing of sold products\n    Use of sold products\n    End-of-life treatment of sold products\n    Downstream leased assets\n    Franchises\n    Investments\n    \n    Return only yes or no. Take a deep breath and answer.\n\n### Input:\nThe Company gives preference to power from renewable sources and has developed a range