In [None]:
!pip install -q -U torch
!pip install -q -U bitsandbytes
!pip install -q -U datasets==2.10.1
!pip install transformers==4.31
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/lvwerra/trl.git
!pip install -q -U sentencepiece

In [1]:
import pandas as pd
import bitsandbytes as bnb
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training,PeftConfig,PeftModel
import torch
from transformers import AutoModelForCausalLM,LlamaForCausalLM,LlamaTokenizer, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
   DataCollatorForLanguageModeling, Trainer, TrainingArguments, TextStreamer
from datasets import load_dataset,Dataset

2024-02-28 14:26:06.461560: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 14:26:06.461620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 14:26:06.463213: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Loading Model and Tokenizer with a GPU limit of at most 8 GB
def load_model(model_name, bnb_config):
   n_gpus = torch.cuda.device_count()
   max_memory = f'{8000}MB'

   model = LlamaForCausalLM.from_pretrained(
       model_name,
       quantization_config=bnb_config,
       device_map="auto",  # Efficiently dispatch the model on available resources
       max_memory={i: max_memory for i in range(n_gpus)},
   )

   model.config.use_cache = False
   model.config.pretraining_tp = 1
   tokenizer = LlamaTokenizer.from_pretrained(model_name, trust_remote_code=True)


   # Needed for LLaMA tokenizer
   tokenizer.pad_token = tokenizer.eos_token
   tokenizer.padding_side = "right"

   return model, tokenizer

In [3]:
#These two functions are used to create the configuration for Bits and Bytes Quantization and LoRA Tuning.
# Create a BitsAndBytesConfig for quantization
def create_bnb_config():
   # Configure BitsAndBytes quantization with specific settings
   bnb_config = BitsAndBytesConfig(
       load_in_4bit=True,                    # Load weights in 4-bit format
       bnb_4bit_use_double_quant=True,       # Use double quantization for 4-bit
       bnb_4bit_quant_type="nf4",           # 4-bit quantization type
       bnb_4bit_compute_dtype=torch.bfloat16, # Compute data type for 4-bit
   )


   return bnb_config


# Create a Parameter-Efficient Fine-Tuning config for your model
def create_peft_config(modules):
   """
   Create Parameter-Efficient Fine-Tuning config for your model
   :param modules: Names of the modules to apply Lora to
   """
   # Configure Lora (Parameter-Efficient Fine-Tuning) with specific settings
   config = LoraConfig(
       r=16,                # Dimension of the updated matrices
       lora_alpha=64,       # Parameter for scaling
       target_modules=modules, # Names of the modules to apply Lora to
       lora_dropout=0.05,    # Dropout probability
       bias="none",         # Bias type
       task_type="CAUSAL_LM", # Task type (Causal Language Modeling in this case)
   )


   return config

In [4]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # # Read the CSV file into a DataFrame
# # df = pd.read_csv('/kaggle/input/mergedhindi/merged_data.csv')

# # # Calculate the minimum number of samples for any class
# # min_samples = 2

# # # Use a custom function to ensure each class has at least min_samples in both sets
# # def custom_stratify(df, min_samples):
# #     min_class_samples = df['Labels Set'].value_counts().min()
# #     if min_class_samples < min_samples:
# #         raise ValueError(f"The minimum number of groups for any class cannot be less than {min_samples}.")
# #     return df

# # # Split the dataset into training and test sets for each sentiment class
# # X_train = []
# # X_test = []
# # for sentiment in df['Labels Set'].unique():
# #     data_sentiment = df[df['Labels Set'] == sentiment]
# #     data_sentiment = custom_stratify(data_sentiment, min_samples)
# #     train, test = train_test_split(data_sentiment, test_size=0.2, random_state=42)
# #     X_train.append(train)
# #     X_test.append(test)

# # # Concatenate the training and test sets
# # X_train = pd.concat(X_train)
# # X_test = pd.concat(X_test)

# # # Print the distribution of classes in the training and test sets
# print("Training Set:")
# # print(X_train['Labels Set'].value_counts())
# # print("\nTest Set:")
# # print(X_test['Labels Set'].value_counts())

# X_train = pd.read_csv('/content/merged_data.csv')
# X_test = pd.read_csv('/content/Test Set Complete - test (2).csv')
# X_eval = pd.read_csv('/content/Constraint_Hindi_Valid_Processed - Sheet1 (1) (1).csv')
# #evaluation_data = pd.read_csv('/path/to/evaluation_data.csv')

# # # Write the training and test datasets to separate CSV files
# # X_train.to_csv('training_data.csv', index=False)
# # X_test.to_csv('test_data.csv', index=False)

# # Shuffle the training data and select evaluation data
# # X_train = X_train.sample(frac=1, random_state=10)
# # eval_idx = [idx for idx in df.index if idx not in list(X_train.index) + list(X_test.index)]
# # X_eval = df[df.index.isin(eval_idx)]
# # X_eval = (X_eval
# #           .groupby('Labels Set', group_keys=False)
# #           .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))

In [5]:
# def generate_prompt(data_point):
#     return f"""
#             Analyze the sentiment of the tweet post,determine if it is 'hate,offensive', 'non-hostile', 'defamation,offensive', 'fake', 'hate', 'offensive',
#           'fake,hate', 'defamation', 'defamation,hate', 'defamation,hate,offensive',
#           'defamation,fake,offensive', 'fake,offensive', 'defamation,fake',
#           'defamation,fake,hate', 'fake,hate,offensive', or 'defamation,fake,hate,offensive',
#           and return the answer as
#             the corresponding sentiment label "hate,offensive" or "non-hostile" or "defamation,offensive" or "fake" or "hate" or "offensive" or
#           "fake,hate" or "defamation" or "defamation,hate" or "defamation,hate,offensive" or
#           "defamation,fake,offensive" or "fake,offensive" or "defamation,fake" or
#           "defamation,fake,hate" or "fake,hate,offensive" or "defamation,fake,hate,offensive".

#             [{data_point["Post"]}] = {data_point["Labels Set"]}
#             """.strip()

def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""

def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the tweet post,
            determine if it is 'hate,offensive', 'non-hostile', 'defamation,offensive', 'fake', 'hate', 'offensive',
          'fake,hate', 'defamation', 'defamation,hate', 'defamation,hate,offensive',
          'defamation,fake,offensive', 'fake,offensive', 'defamation,fake',
          'defamation,fake,hate', 'fake,hate,offensive', or 'defamation,fake,hate,offensive',
          and return the answer as
            the corresponding sentiment label "hate,offensive" or "non-hostile" or "defamation,offensive" or "fake" or "hate" or "offensive" or
          "fake,hate" or "defamation" or "defamation,hate" or "defamation,hate,offensive" or
          "defamation,fake,offensive" or "fake,offensive" or "defamation,fake" or
          "defamation,fake,hate" or "fake,hate,offensive" or "defamation,fake,hate,offensive".

            [{data_point["Post"]}] = """.strip()

# X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
#                        columns=["Post"])
# X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
#                       columns=["Post"])

In [6]:
# y_true = X_test["Labels Set"]
# X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["Post"])

# train_data = Dataset.from_pandas(X_train)
# eval_data = Dataset.from_pandas(X_eval)

In [7]:
def evaluate(y_true, y_pred):
    labels = ['hate,offensive', 'non-hostile', 'defamation,offensive', 'fake', 'hate', 'offensive',
          'fake,hate', 'defamation', 'defamation,hate', 'defamation,hate,offensive',
          'defamation,fake,offensive', 'fake,offensive', 'defamation,fake',
          'defamation,fake,hate', 'fake,hate,offensive', 'defamation,fake,hate,offensive']
    mapping = {'hate,offensive': 1, 'non-hostile': 0, 'defamation,offensive': 2, 'fake':3, 'hate':4, 'offensive':5,
          'fake,hate':6, 'defamation':7, 'defamation,hate':8, 'defamation,hate,offensive':9,
          'defamation,fake,offensive':10, 'fake,offensive':11, 'defamation,fake':12,
          'defamation,fake,hate':13, 'fake,hate,offensive':14, 'defamation,fake,hate,offensive':15}
    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2,3,4,5,6,7,8,9,10,11,12,13,14,15])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [8]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
# df = pd.read_csv("/kaggle/input/mergedhindi/merged_data.csv")
# dataset_data = [
#     {
#         "instruction": "Detect the sentiment of the tweet.",
#         "input": row_dict["Post"],
#         "output": row_dict["Labels Set"]
#     }
#     for row_dict in df.to_dict(orient="records")
# ]
 
# dataset_data[0]

In [10]:
# import json
# with open("merged-openHathi-sentiment-dataset.json", "w") as f:
#    json.dump(dataset_data, f)

In [11]:
# !pip install -U datasets

In [12]:
# import pandas as pd
# df=pd.read_json("datasets-issues.jsonl", lines=True)
# df.head()

# from datasets import Dataset
# issues_dataset = Dataset.from_pandas(df)
# issues_dataset
# sample = issues_dataset.shuffle(seed=666).select(range(3))
# sample[0]
from datasets import load_dataset
dataset = load_dataset("SR08/1KJson",revision="main")

Downloading data:   0%|          | 0.00/775k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 999
    })
})

In [14]:
dataset = dataset["train"].train_test_split(test_size=0.2) # my data in HF have 1 train split only
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 799
    })
    test: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 200
    })
})


In [15]:
dataset["test"]

Dataset({
    features: ['input', 'instruction', 'output'],
    num_rows: 200
})

In [16]:
cache_dir = "/content/drive/My Drive/hugging_cache" # Model Location
os.makedirs(cache_dir,exist_ok=True)


model_name = "AshishK/AK-OpenHathi-7B-Hi-Sharded-bf16"
bnb_config = create_bnb_config() # Creating Configuration


model, tokenizer = load_model(model_name, bnb_config)

config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

model-00001-of-00015.safetensors:   0%|          | 0.00/933M [00:00<?, ?B/s]

model-00002-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00003-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00004-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00005-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00006-of-00015.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00007-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00008-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00009-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00010-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00011-of-00015.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00012-of-00015.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00013-of-00015.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00014-of-00015.safetensors:   0%|          | 0.00/742M [00:00<?, ?B/s]

model-00015-of-00015.safetensors:   0%|          | 0.00/394M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at AshishK/AK-OpenHathi-7B-Hi-Sharded-bf16 and are newly initialized: ['model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_at

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/968k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [17]:
def generate_and_tokenize_prompt(data_point):
   full_prompt = generate_prompt(data_point)
   tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True,max_length=1024)
   return tokenized_full_prompt

In [18]:
# Function to generate predictions
import pandas as pd
from sklearn.model_selection import train_test_split
def generate_predictions(data):
    predictions = []
    for example in data:
        input_ids = example["input_ids"]
        with torch.no_grad():
            outputs = model(torch.tensor(input_ids).unsqueeze(0))
            logits = outputs.logits
            predicted_label = np.argmax(logits[0].cpu().numpy())
            predictions.append(predicted_label)
    return predictions

# # Function to evaluate predictions
# def evaluate(y_true, y_pred):
#     # Calculate accuracy
#     accuracy = accuracy_score(y_true, y_pred)
#     print(f'Accuracy: {accuracy:.3f}')

#     # Generate classification report
#     class_report = classification_report(y_true, y_pred)
#     print('\nClassification Report:')
#     print(class_report)

#     # Generate confusion matrix
#     conf_matrix = confusion_matrix(y_true, y_pred)
#     print('\nConfusion Matrix:')
#     print(conf_matrix)


In [19]:
# # Tokenize the test prompts using a loop or dictionary comprehension
# test_data_tokenized = {}
# for key, value in test_data.items():
# #     test_data_tokenized[key] = generate_and_tokenize_prompt(value)

In [20]:
training_data = dataset["train"].shuffle().map(generate_and_tokenize_prompt)
training_data

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'instruction', 'output', 'input_ids', 'attention_mask'],
    num_rows: 799
})

In [21]:
test_data = dataset["test"].shuffle().map(generate_and_tokenize_prompt)
test_data

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'instruction', 'output', 'input_ids', 'attention_mask'],
    num_rows: 200
})

In [22]:
def find_all_linear_names(model):
   cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
   lora_module_names = set()
   for name, module in model.named_modules():
       if isinstance(module, cls):
           names = name.split('.')
           lora_module_names.add(names[0] if len(names) == 1 else names[-1])


   if 'lm_head' in lora_module_names:  # needed for 16-bit
       lora_module_names.remove('lm_head')
   return list(lora_module_names)


def print_trainable_parameters(model, use_4bit=False):
   """
   Prints the number of trainable parameters in the model.
   """
   trainable_params = 0
   all_param = 0
   for _, param in model.named_parameters():
       num_params = param.numel()
       # if using DS Zero 3 and the weights are initialized empty
       if num_params == 0 and hasattr(param, "ds_numel"):
           num_params = param.ds_numel


       all_param += num_params
       if param.requires_grad:
           trainable_params += num_params
   if use_4bit:
       trainable_params /= 2
   print(
       f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {100 * trainable_params / all_param}"
   )

In [23]:
def train(model, tokenizer, dataset, output_dir):
# Apply preprocessing to the model to prepare it by
# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
  model.gradient_checkpointing_enable()


# 2 - Using the prepare_model_for_kbit_training method from PEFT
  model = prepare_model_for_kbit_training(model)


# Get lora module names
modules = find_all_linear_names(model)
print(modules)


# Create PEFT config for these modules and wrap the model to PEFT
peft_config = create_peft_config(modules)
model = get_peft_model(model, peft_config)


# Print information about the percentage of trainable parameters
print_trainable_parameters(model)


# Training parameters
trainer = Trainer(
    model=model,
    train_dataset=training_data,
    args=TrainingArguments(
        num_train_epochs=2,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        max_steps=500,
        learning_rate=2e-4,
        fp16=True,
        lr_scheduler_type ="cosine",
        logging_steps=10,
        warmup_ratio = 0.03,
        output_dir="outputs",
        optim='paged_adamw_32bit',
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)


model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs


### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
# Verifying the datatypes before training


dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    if dtype not in dtypes: dtypes[dtype] = 0
    dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items(): total+= v
for k, v in dtypes.items():
    print(k, v, v/total)


do_train = True


# Launch training
print("Training...")


if do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)


###
output_dir = "results/llama2/final_checkpoint"
train(model, tokenizer, dataset, output_dir)

# Saving model
print("Saving last checkpoint of the model...")
os.makedirs(output_dir, exist_ok=True)
trainer.model.save_pretrained(output_dir)

['down_proj', 'gate_proj', 'up_proj', 'o_proj', 'q_proj', 'k_proj', 'v_proj']
all params: 3,671,986,176 || trainable params: 39,976,960 || trainable%: 1.088701266396053
torch.float16 394006528 0.10730065667872493
torch.uint8 3238002688 0.8818123306573146
torch.float32 39976960 0.01088701266396053
Training...


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,3.0172
20,1.7689
30,1.6765
40,1.5559
50,1.4805
60,1.4132
70,1.33
80,1.3109
90,1.3799
100,1.3444


***** train metrics *****
  epoch                    =       10.0
  total_flos               = 23866233GF
  train_loss               =     0.6008
  train_runtime            = 1:00:10.28
  train_samples_per_second =      2.216
  train_steps_per_second   =      0.138
{'train_runtime': 3610.2895, 'train_samples_per_second': 2.216, 'train_steps_per_second': 0.138, 'total_flos': 2.5626172842246144e+16, 'train_loss': 0.6008183368444443, 'epoch': 10.0}
Saving last checkpoint of the model...


In [24]:
# def predict(test, model, tokenizer):
#     y_pred = []
#     for i in tqdm(range(len(dataset["test"]))):
#         prompt = dataset["test"].iloc[i]["Post"]
#         pipe = pipeline(task="text-generation",
#                         model=model,
#                         tokenizer=tokenizer,
#                         max_new_tokens = 1,
#                         temperature = 0.0,
#                        )
#         result = pipe(prompt)
#         answer = result[0]['generated_text'].split("=")[-1]
#         if "hate,offensive" in answer:
#             y_pred.append("hate,offensive")
#         elif "defamation,offensive" in answer:
#             y_pred.append("defamation,offensive")
#         elif "fake" in answer:
#             y_pred.append("fake")
#         elif "hate" in answer:
#             y_pred.append("hate")
#         elif "offensive" in answer:
#             y_pred.append("offensive")
#         elif "fake,hate" in answer:
#             y_pred.append("fake,hate")
#         elif "defamation" in answer:
#             y_pred.append("defamation")
#         elif "defamation,hate" in answer:
#             y_pred.append("defamation,hate")
#         elif "defamation,hate,offensive" in answer:
#             y_pred.append("defamation,hate,offensive")
#         elif "defamation,fake,offensive" in answer:
#             y_pred.append("defamation,fake,offensive")
#         elif "fake,offensive" in answer:
#             y_pred.append("fake,offensive")
#         elif "defamation,fake" in answer:
#             y_pred.append("defamation,fake")
#         elif "defamation,fake,hate" in answer:
#             y_pred.append("defamation,fake,hate")
#         elif "fake,hate,offensive" in answer:
#             y_pred.append("fake,hate,offensive")
#         elif "defamation,fake,hate,offensive" in answer:
#             y_pred.append("defamation,fake,hate,offensive")
#         else:
#             y_pred.append("non-hostile")
#     return y_pred

from tqdm import tqdm

def predict(test_data, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(test_data))):
        input_text = test_data[i]["input"]  # Assuming 'input' contains the prompt text
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=1,
                        temperature=0.0,
                       )
        result = pipe(input_text)
        answer = result[0]['generated_text'].split("=")[-1]
        if "hate,offensive" in answer:
            y_pred.append("hate,offensive")
        elif "defamation,offensive" in answer:
            y_pred.append("defamation,offensive")
        elif "fake" in answer:
            y_pred.append("fake")
        elif "hate" in answer:
            y_pred.append("hate")
        elif "offensive" in answer:
            y_pred.append("offensive")
        elif "fake,hate" in answer:
            y_pred.append("fake,hate")
        elif "defamation" in answer:
            y_pred.append("defamation")
        elif "defamation,hate" in answer:
            y_pred.append("defamation,hate")
        elif "defamation,hate,offensive" in answer:
            y_pred.append("defamation,hate,offensive")
        elif "defamation,fake,offensive" in answer:
            y_pred.append("defamation,fake,offensive")
        elif "fake,offensive" in answer:
            y_pred.append("fake,offensive")
        elif "defamation,fake" in answer:
            y_pred.append("defamation,fake")
        elif "defamation,fake,hate" in answer:
            y_pred.append("defamation,fake,hate")
        elif "fake,hate,offensive" in answer:
            y_pred.append("fake,hate,offensive")
        elif "defamation,fake,hate,offensive" in answer:
            y_pred.append("defamation,fake,hate,offensive")
        else:
            y_pred.append("non-hostile")
    return y_pred


In [25]:
test=dataset["test"]
test

Dataset({
    features: ['input', 'instruction', 'output'],
    num_rows: 200
})

In [26]:
from tqdm import tqdm
from transformers import pipeline
y_pred = predict(test, model, tokenizer)

  0%|          | 0/200 [00:00<?, ?it/s]Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpF

In [27]:
y_true = test_data["output"]

In [28]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
evaluate(y_true, y_pred)

Accuracy: 0.615
Accuracy for label 0: 1.000
Accuracy for label 1: 0.000
Accuracy for label 2: 0.000
Accuracy for label 3: 0.000
Accuracy for label 4: 0.000
Accuracy for label 5: 0.000
Accuracy for label 7: 0.000
Accuracy for label 8: 0.000
Accuracy for label 9: 0.000
Accuracy for label 12: 0.000

Classification Report:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       123
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00        25
           4       0.00      0.00      0.00        22
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00         8
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1

    accuracy                           0.61       200
   macro avg       0.06      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
evaluation = pd.DataFrame({'text': test_data["input"],
                           'y_true':y_true,
                           'y_pred': y_pred},
                         )
evaluation.to_csv("test_predictions.csv", index=False)

In [30]:
evaluation

Unnamed: 0,text,y_true,y_pred
0,यही तो खूबसूरती है अपने देश की...\n\nउधर बनारस...,hate,non-hostile
1,RT @aadhiIa: उद्धव ठाकरे की BMC सेना का POK मे...,hate,non-hostile
2,पुलिस द्वारा चालान काटने पर मुसलमानो ने उनकी प...,hate,non-hostile
3,भारत की असंगठित अर्थव्यवस्था पर दूसरा बड़ा हमल...,non-hostile,non-hostile
4,.@RailMinIndia 2030 तक 33 बिलियन यूनिट की अपनी...,non-hostile,non-hostile
...,...,...,...
195,"सियासत के 'अजातशत्रु', कांग्रेस के लिए 'संकट म...",non-hostile,non-hostile
196,"पहले रोता रहता था मोदी बोलने नही देते, \nऔर जब...",defamation,non-hostile
197,"ताहिर-उमर तो झांकी है, बहुत ग़द्दार अभी बाक़ी ...",hate,non-hostile
198,अर्नब गोस्वामी से पत्रकारिता जगत को उतना ही फा...,"hate,offensive",non-hostile


In [31]:
from huggingface_hub import notebook_login

notebook_login()

#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

trainer.model.push_to_hub("SR08/openHathi-61.5", use_auth_token=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

adapter_model.bin:   0%|          | 0.00/160M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SR08/openHathi-61.5/commit/208df8d6b6872fd2affb4f3b3e047c5037fee7dc', commit_message='Upload model', commit_description='', oid='208df8d6b6872fd2affb4f3b3e047c5037fee7dc', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Free memory for merging weights
# del model
del trainer
torch.cuda.empty_cache()

import gc
gc.collect()
gc.collect()