In [None]:
from google.colab import files
uploaded = files.upload()


Saving API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv to API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv
Saving Metadata_Country_API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv to Metadata_Country_API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv
Saving Metadata_Indicator_API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv to Metadata_Indicator_API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv


In [None]:
# Step 1: Install Required Libraries
!pip install torch transformers datasets

# Step 2: Import Necessary Libraries
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch

# Step 3: Define file paths for CSV files in the content folder
indicator_metadata_file = '/content/Metadata_Indicator_API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv'
country_metadata_file = '/content/Metadata_Country_API_EN.ATM.CO2E.KT_DS2_en_csv_v2_32234.csv'

try:
    indicator_metadata = pd.read_csv(indicator_metadata_file)
    country_metadata = pd.read_csv(country_metadata_file)
except FileNotFoundError as e:
    print("FileNotFoundError:", e)
except pd.errors.ParserError as e:
    print("ParserError:", e)
except Exception as e:
    print("An error occurred:", e)

# Step 5: Data Preparation
def enrich_training_data(indicator_metadata, country_metadata):
    training_texts = []

    for index, row in indicator_metadata.iterrows():
        indicator_name = row['INDICATOR_NAME']
        for _, country_row in country_metadata.iterrows():
            country = country_row['Country Code']  # Assuming 'Country Code' is the relevant field
            training_texts.append(f"The indicator '{indicator_name}' for {country} provides insights into carbon emissions.")

    return training_texts

training_data = enrich_training_data(indicator_metadata, country_metadata)

# Step 6: Convert to Dataset
train_dataset = Dataset.from_dict({"text": training_data})

# Split the dataset into training and validation sets
train_texts, val_texts = train_test_split(training_data, test_size=0.1, random_state=42)

# Create datasets for training and validation
train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})

# Step 7: Load Tokenizer and Model
model_name = "HariVaradhan/ECOSPHERE_AI"  # Replace this with your chosen model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 8: Tokenization
def tokenize_function(examples):
    # Tokenize and create labels for the model
    input_ids = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    input_ids["labels"] = input_ids["input_ids"].copy()  # Create labels
    return input_ids

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for tokenized datasets
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 9: Set Training Arguments
training_args = TrainingArguments(
    output_dir="./gpt2-emissions",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)

# Step 10: Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Add validation dataset here
)

# Step 11: Start Training
trainer.train()

# Step 12: Save the Model
model.save_pretrained("./gpt2-emissions")
tokenizer.save_pretrained("./gpt2-emissions")

print("Model fine-tuning complete!")




Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.081658
2,No log,0.084031
3,No log,0.088152


Model fine-tuning complete!


In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!huggingface-cli login

model.push_to_hub("HariVaradhan/ECOSPHERE", check_pr=True)

tokenizer.push_to_hub("HariVaradhan/ECOSPHERE",check_pr=True)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in yo

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HariVaradhan/ECOSPHERE/commit/59177f9210a6b159e4ee4c7ee84216a92f50d768', commit_message='Upload tokenizer', commit_description='', oid='59177f9210a6b159e4ee4c7ee84216a92f50d768', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from google.colab import files
uploaded = files.upload()


Saving climate.json to climate.json


In [None]:
def extract_conversations(example):
    return {
        "user_input": example['conversations'][0]['user_input'],
        "bot_response": example['conversations'][0]['bot_response']
    }

dataset = dataset.map(extract_conversations)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
def preprocess_function(examples):
    return tokenizer(examples['user_input'], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
import os
import torch
from datasets import load_dataset

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Set model and dataset names
model_name = "HariVaradhan/ECOSPHERE_AI"  # Change to GPT-2
json_file_path = 'climate.json'  # Replace with your JSON file path

# TrainingArguments parameters
output_dir = "./results"
num_train_epochs = 3  # Adjust epochs as needed
fp16 = True
per_device_train_batch_size = 1  # Adjust batch size
per_device_eval_batch_size = 1
learning_rate = 2e-4
max_seq_length = 256  # Shorten the sequence length

# Load JSON data
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Preprocess the data
def preprocess_data(json_data):
    dialogues = []
    for entry in json_data['conversations']:
        user_input = entry['user_input']
        bot_response = entry['bot_response']
        dialogues.append(f"User: {user_input} \nBot: {bot_response}")
    return dialogues

# Preprocess the JSON data
train_texts = preprocess_data(json_data)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize and prepare dataset
def tokenize_function(examples):
    encodings = tokenizer(
        examples,
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
        return_tensors="pt",
    )
    # Create labels
    encodings["labels"] = encodings["input_ids"].clone()
    return encodings

train_encodings = tokenize_function(train_texts)

# Prepare dataset for training
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

train_dataset = CustomDataset(train_encodings)

# Load base model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    learning_rate=learning_rate,
    logging_steps=25,
)

# Set up standard Trainer
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
)

# Train model
trainer.train()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
25,1.0931


TrainOutput(global_step=36, training_loss=0.8704144027498033, metrics={'train_runtime': 248.3353, 'train_samples_per_second': 0.145, 'train_steps_per_second': 0.145, 'total_flos': 4703256576000.0, 'train_loss': 0.8704144027498033, 'epoch': 3.0})

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!huggingface-cli login

model.push_to_hub("HariVaradhan/ECO_FINAL", check_pr=True)

tokenizer.push_to_hub("HariVaradhan/ECO_FINAL",check_pr=True)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGr

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HariVaradhan/ECO_FINAL/commit/c13c1aba49d6bf9e19890a2e23d863c72287620b', commit_message='Upload tokenizer', commit_description='', oid='c13c1aba49d6bf9e19890a2e23d863c72287620b', pr_url=None, pr_revision=None, pr_num=None)