In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, precision_recall_fscore_support
import csv
import huggingface_hub
from tqdm.notebook import tqdm

huggingface_hub.login("")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# Load your datasets
df = pd.read_csv('../data/unpreprocessed/train.csv')
df_test = pd.read_csv('../data/unpreprocessed/test.csv')
# Create label mappings
unique_labels = df['label'].unique()
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Create label statistics
label_counts = df['label'].value_counts()
print("train label counts: ", label_counts)
label_counts = df_test['label'].value_counts()
print("test label counts: ", label_counts)

# Map labels to IDs
df['label'] = df['label'].map(label2id)
df_test['label'] = df_test['label'].map(label2id)

# Rename columns for consistency
df.columns = ['text', 'label']
df_test.columns = ['public_id','text', 'label']

# Shuffle and split the data
df_train = df.sample(frac=1).reset_index(drop=True)
df_test = df_test.sample(frac=1).reset_index(drop=True)
#df_test = df_test.iloc[:200]


train label counts:  label
fake       571
partial    349
truth      206
other      115
Name: count, dtype: int64
test label counts:  label
partial    125
fake       105
truth       62
other       38
Name: count, dtype: int64


In [3]:
### Model and Tokenizer Setup

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id  # Ensure padding token is set
tokenizer.pad_token = tokenizer.eos_token

# Configure quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load the causal language model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=quantization_config,
    trust_remote_code=True
)

model.eval()  # Set model to evaluation mode

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [4]:
### Helper Functions

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def write_scores(test_df, csv_file):
    y_test = test_df.label
    y_pred = test_df.predictions
    
    metrics = calculate_metrics(y_test, y_pred)
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    print("Metrics:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")
    
  
    # Write results to CSV
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Model', 'Examples per Label', 'Metric', 'Value'])
        writer.writerow([model_name, num_examples_per_label, '', ''])  # Add model info
        for metric, value in metrics.items():
            writer.writerow(['', '', metric.capitalize(), f"{value:.4f}"])


def generate_prompt(few_shot_examples, new_input):
    prompt = ""
    for example in few_shot_examples:
        prompt += f"Text: {example['text']}\nLabel: {example['label']}\n\n"
    prompt += f"Text: {new_input}\nLabel:"
    return prompt

def map_output_to_label(output_text, id2label):
    # Extract the label from the generated text
    # This assumes the model generates the label name directly after "Label:"
    label = output_text.strip().split('\n')[0]
    # Find the label that best matches the generated text
    for key in id2label.values():
        if key.lower() in label.lower():
            return key
    return None  # Or a default label

### Prepare Few-Shot Examples

def get_few_shot_examples(df_train, num_examples_per_label=2):
    few_shot = []
    
    # Calculate mean text length
    mean_length = df_train['text'].str.len().mean()
    
    # Define a range around the mean (e.g., ±20%)
    lower_bound = mean_length * 0.8
    upper_bound = mean_length * 1.2
    
    # For each label, select num_examples_per_label samples
    for label_id in id2label.keys():
        label_samples = df_train[df_train['label'] == label_id]
        
        # Filter samples within the desired length range
        filtered_samples = label_samples[
            (label_samples['text'].str.len() >= lower_bound) & 
            (label_samples['text'].str.len() <= upper_bound)
        ]
        
        # If we have enough filtered samples, use them; otherwise, fall back to original samples
        if len(filtered_samples) >= num_examples_per_label:
            selected_samples = filtered_samples.sample(n=num_examples_per_label)
        else:
            selected_samples = label_samples.sample(n=num_examples_per_label)
        
        few_shot.extend(selected_samples.to_dict(orient='records'))
    
    # Shuffle the examples to mix labels
    np.random.shuffle(few_shot)
    
    # Map label IDs back to label names
    for example in few_shot:
        example['label'] = id2label[example['label']]
    
    return few_shot

### In-Context Learning Inference

def in_context_predict(model, tokenizer, few_shot_examples, text, temperature=0, top_p=0.95):
    prompt = generate_prompt(few_shot_examples, text)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate the output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1,  # Adjust based on expected label length
            temperature=temperature, # should be 0 for deterministic output
            top_p=top_p,
            do_sample=False, # no need for randomness
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode the generated tokens
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the label
    label = map_output_to_label(generated_text.split("Label:")[-1], id2label)
    return label

### Evaluation

def evaluate_icl(model, tokenizer, df_test, few_shot_examples, id2label, label2id, model_name, num_examples_per_label):
    y_true = []
    y_pred = []
    
    # Create a tqdm progress bar
    progress_bar = tqdm(total=len(df_test), desc="Processing", unit="sample")
    
    for _, row in df_test.iterrows():
        text = row['text']
        true_label = row['label']
        
        # Use in_context_predict function
        predicted_label = in_context_predict(model, tokenizer, few_shot_examples, text)
        predicted_label_id = label2id.get(predicted_label, -1) if predicted_label is not None else -1
        
        y_true.append(true_label)
        y_pred.append(predicted_label_id)
        
        # Update the progress bar
        progress_bar.update(1)
    
    # Close the progress bar
    progress_bar.close()
    
    # Add predictions to the dataframe
    df_test = df_test.copy()
    df_test['predictions'] = y_pred
    
    # Write scores to CSV
    write_scores(df_test, 'few_shot_results.csv', model_name, num_examples_per_label)




In [5]:
# Select few-shot examples
num_examples_per_label = 2 
few_shot_examples = get_few_shot_examples(df_train, num_examples_per_label=num_examples_per_label)


# Evaluate on the test set
evaluate_icl(model, tokenizer, df_test, few_shot_examples, id2label, label2id, model_name, num_examples_per_label)

Processing:   0%|          | 0/330 [00:00<?, ?sample/s]

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for

OutOfMemoryError: CUDA out of memory. Tried to allocate 8.13 GiB. GPU 0 has a total capacity of 23.68 GiB of which 5.76 GiB is free. Process 2450972 has 17.91 GiB memory in use. Of the allocated memory 11.59 GiB is allocated by PyTorch, and 6.01 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)