In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Orkhan/llama-2-7b-absa"

print("Loading Llama 7B model and tokenizer...")
print(f"CUDA Available: {torch.cuda.is_available()}")

# Load the tokenizer and the causal LM used by the rest of the notebook.
# Any failure is reported together with a login hint instead of being raised,
# so `llama_tokenizer` / `llama_model` stay undefined when loading fails.
try:
    # `token=True` reuses the credential stored by `huggingface-cli login`; it is
    # the supported replacement for the deprecated `use_auth_token=True`.
    llama_tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

    # First attempt: half precision with automatic device placement.
    # NOTE(review): recent transformers releases warn that `torch_dtype` is
    # deprecated in favour of `dtype`; switch once older releases need not be supported.
    # NOTE(review): `trust_remote_code=True` lets code shipped with the checkpoint
    # run locally -- confirm this repository actually requires it.
    try:
        llama_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
    except Exception as e:
        # Second attempt: full precision, everything on the CPU.
        print(f"GPU loading failed ({e}), falling back to CPU...")
        llama_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True
        )

    print("Llama 7B model loaded successfully!")

    # Reuse the end-of-sequence token for padding when the tokenizer has no pad token.
    if llama_tokenizer.pad_token is None:
        llama_tokenizer.pad_token = llama_tokenizer.eos_token

    # Report where the first parameter tensor lives and which dtype it has.
    device = next(llama_model.parameters()).device
    print(f"Model device: {device}")
    print(f"Model dtype: {next(llama_model.parameters()).dtype}")

except Exception as e:
    print(f"Error loading model: {e}")
    print("You may need to authenticate with Hugging Face first:")
    print("  huggingface-cli login")
    
    
def get_llama_sentiment(text, max_length=512):
    """
    Classify the sentiment of ``text`` by prompting the Llama model.

    The text is cut to ``max_length`` characters, wrapped in a short
    classification prompt and completed greedily using the module-level
    ``llama_model`` and ``llama_tokenizer``. The lowercased completion is then
    scanned for sentiment keywords.

    Args:
        text: String to classify.
        max_length: Maximum number of characters of ``text`` placed in the prompt.

    Returns:
        str: "POSITIVE", "NEGATIVE" or "NEUTRAL". "NEUTRAL" is returned when no
        keyword matches and also when any exception occurs (the error and its
        traceback are printed). The return value is always a plain string so
        callers can compare predictions with ``==``.
    """
    try:
        # Truncate text to max_length characters
        text = text[:max_length]

        # Create a sentiment analysis prompt
        prompt = f"""Classify the sentiment of the following text.

Text: {text}

Sentiment:"""

        # NOTE(review): truncation keeps the first 512 tokens, so a very long
        # tokenization could cut off the trailing "Sentiment:" cue -- confirm
        # this cannot happen for the perturbed inputs used in the experiments.
        inputs = llama_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

        # Move the inputs to wherever the model's parameters live
        device = next(llama_model.parameters()).device
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device) if 'attention_mask' in inputs else None

        # Warn (but continue) if the tokenizer produced an id outside the model vocabulary
        if input_ids.max() >= llama_model.config.vocab_size:
            print(f"Warning: token id {input_ids.max()} >= vocab size {llama_model.config.vocab_size}")

        # Greedy decoding. The sampling knobs are pinned to their neutral
        # values: they have no effect when do_sample=False, and a non-default
        # top_p only produces an "invalid generation flag" warning.
        with torch.no_grad():
            outputs = llama_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=5,
                min_new_tokens=1,
                temperature=1.0,
                top_p=1.0,
                top_k=50,
                do_sample=False,
                pad_token_id=llama_tokenizer.eos_token_id,
                eos_token_id=llama_tokenizer.eos_token_id,
                use_cache=True
            )

        # Decode only the newly generated tokens. Slicing the decoded string by
        # len(prompt) is unreliable because decoding the prompt tokens does not
        # always reproduce the prompt character-for-character.
        new_tokens = outputs[0][input_ids.shape[-1]:]
        response = llama_tokenizer.decode(new_tokens, skip_special_tokens=True).strip().lower()

        # Classify sentiment based on keywords (positive keywords take precedence)
        positive_words = ("positive", "good", "great", "excellent", "wonderful", "amazing", "love")
        negative_words = ("negative", "bad", "poor", "awful", "terrible", "hate", "worst")
        if any(word in response for word in positive_words):
            return "POSITIVE"
        if any(word in response for word in negative_words):
            return "NEGATIVE"
        return "NEUTRAL"

    except Exception as e:
        print(f"Error in get_llama_sentiment: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        # Same type as the success path: a bare label, not a (label, score) tuple.
        return "NEUTRAL"

# Smoke-test the classifier on a single hand-written sentence.
test_text = "This movie was thoughtful"
print("Testing Llama sentiment classification...")
sentiment = get_llama_sentiment(test_text)
print(f"Text: {test_text}\nPredicted Sentiment: {sentiment}")

  from .autonotebook import tqdm as notebook_tqdm


Loading Llama 7B model and tokenizer...
CUDA Available: True


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.39s/it]


Llama 7B model loaded successfully!
Model device: cuda:0
Model dtype: torch.float16


The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing Llama sentiment classification...
Text: This movie was thoughtful
Predicted Sentiment: NEUTRAL


In [19]:
import json

# Unicode bidirectional formatting characters, written as escape sequences.
LRO = "\u202D"  # LEFT-TO-RIGHT OVERRIDE
RLO = "\u202E"  # RIGHT-TO-LEFT OVERRIDE
LRI = "\u2066"  # LEFT-TO-RIGHT ISOLATE
RLI = "\u2067"  # RIGHT-TO-LEFT ISOLATE
PDI = "\u2069"  # POP DIRECTIONAL ISOLATE
PDF = "\u202C"  # POP DIRECTIONAL FORMATTING



class TagAttack:
    """
    Inserts Unicode tag characters at random positions of a string.

    By default the pool is U+E0001 (LANGUAGE TAG) plus U+E0020..U+E007F.
    """

    def __init__(self, perturbation_budget, tags=(0xE0001, *range(0xE0020, 0xE007F + 1))):
        """
        Args:
            perturbation_budget: Number of tag characters inserted per call to ``perturb``.
            tags: Iterable of integer codepoints to draw from.
        """
        self.perturbation_budget = perturbation_budget
        # Copy into a fresh list so instances never share (or mutate) one
        # default object; the default itself is an immutable tuple.
        self.tags = list(tags)

    def perturb(self, text):
        """
        Insert random Unicode tag characters into text, treating each
        Unicode codepoint as a single character.

        Returns:
            str: ``text`` with ``perturbation_budget`` extra codepoints; the
            original characters keep their relative order.
        """
        codepoints = [ord(c) for c in text]

        for _ in range(self.perturbation_budget):
            # +1 so the tag may also be appended after the last character
            rand_index = random.randrange(len(codepoints) + 1)
            rand_tag = random.choice(self.tags)
            codepoints.insert(rand_index, rand_tag)

        return "".join(map(chr, codepoints))


class VariationSelectorAttack:
    """
    Inserts Unicode variation selectors at random positions of a string.

    By default the pool is U+FE00..U+FE0F plus U+E0100..U+E01EF.
    """

    def __init__(self, perturbation_budget, variation_selectors=(*range(0xFE00, 0xFE0F + 1), *range(0xE0100, 0xE01EF + 1))):
        """
        Args:
            perturbation_budget: Number of selectors inserted per call to ``perturb``.
            variation_selectors: Iterable of integer codepoints to draw from.
        """
        self.perturbation_budget = perturbation_budget
        # Copy into a fresh list so instances never share (or mutate) one
        # default object; the default itself is an immutable tuple.
        self.variation_selectors = list(variation_selectors)

    def perturb(self, text):
        """
        Return ``text`` with ``perturbation_budget`` random variation selectors
        inserted; the original characters keep their relative order.
        """
        codepoints = [ord(c) for c in text]

        for _ in range(self.perturbation_budget):
            # +1 so the selector may also be appended after the last character
            rand_index = random.randrange(len(codepoints) + 1)
            rand_vs = random.choice(self.variation_selectors)
            codepoints.insert(rand_index, rand_vs)

        return "".join(map(chr, codepoints))


class InvisibleCharAttack:
    """
    Inserts invisible characters at random positions of a string.

    By default the pool is U+200B, U+200C, U+200D, U+2060 and U+FEFF.
    """

    def __init__(self, perturbation_budget, invisible_chars=(0x200B, 0x200C, 0x200D, 0x2060, 0xFEFF)):
        """
        Args:
            perturbation_budget: Number of characters inserted per call to ``perturb``.
            invisible_chars: Iterable of integer codepoints to draw from.
        """
        self.perturbation_budget = perturbation_budget
        # Copy into a fresh list so instances never share (or mutate) one
        # default object; the default itself is an immutable tuple.
        self.invisible_chars = list(invisible_chars)

    def perturb(self, text):
        """
        Return ``text`` with ``perturbation_budget`` random invisible characters
        inserted; the original characters keep their relative order.
        """
        codepoints = [ord(c) for c in text]

        for _ in range(self.perturbation_budget):
            # +1 so the character may also be appended after the last one
            rand_index = random.randrange(len(codepoints) + 1)
            rand_invisible = random.choice(self.invisible_chars)
            codepoints.insert(rand_index, rand_invisible)

        return "".join(map(chr, codepoints))


class HomoglyphAttack:
    """
    Replaces characters with look-alike codepoints taken from a JSON mapping.
    """

    def __init__(self, perturbation_budget, homoglyph_map_path='/home/stk31/textual-adversarial-defense/utils/homoglyphs/intentional.json'):
        """
        Args:
            perturbation_budget: Maximum number of characters replaced per call
                to ``perturb``.
            homoglyph_map_path: Path to a JSON object whose keys and values are
                hexadecimal codepoint strings (source -> replacement).
                NOTE(review): the default is an absolute path on one machine;
                pass an explicit path when running elsewhere.
        """
        self.perturbation_budget = perturbation_budget
        # JSON text is read as UTF-8 explicitly instead of the platform default.
        with open(homoglyph_map_path, 'r', encoding='utf-8') as f:
            hex_map = json.load(f)
        # Convert hex strings to integer codepoints
        self.homoglyph_map = {int(k, 16): int(v, 16) for k, v in hex_map.items()}
        self.homoglyph_chars = list(self.homoglyph_map.keys())

    def perturb(self, text):
        """
        Replace up to ``perturbation_budget`` randomly chosen characters of
        ``text`` that have an entry in the homoglyph map.

        Only replaceable positions are candidates, so the budget is not spent
        on characters that have no homoglyph. A budget of zero or less leaves
        the text unchanged.

        Returns:
            str: The perturbed text, same length as ``text``.
        """
        codepoints = [ord(c) for c in text]

        # Positions whose character actually has a homoglyph
        candidates = [i for i, cp in enumerate(codepoints) if cp in self.homoglyph_map]
        random.shuffle(candidates)

        # Clamp at 0: a negative slice bound would otherwise select from the end
        for i in candidates[:max(self.perturbation_budget, 0)]:
            codepoints[i] = self.homoglyph_map[codepoints[i]]

        return "".join(map(chr, codepoints))


class DeletionCharAttack:
    """
    Injects "decoy + deletion character" pairs at random positions of a string.
    """

    def __init__(self, perturbation_budget, deletion_char=0x8):
        # Number of pairs injected per perturb() call
        self.perturbation_budget = perturbation_budget
        # Codepoint written after each decoy (0x8 is backspace)
        self.deletion_char = deletion_char
        # Printable ASCII codepoints (32..126) the decoy is drawn from
        self.ascii_chars = [*range(32, 127)]

    def perturb(self, text):
        """
        Return ``text`` with ``perturbation_budget`` pairs inserted, each pair
        being a random printable ASCII character followed by the deletion
        character.
        """
        cps = list(map(ord, text))

        for _ in range(self.perturbation_budget):
            # Any gap is eligible, including the one after the last codepoint
            slot = random.randrange(len(cps) + 1)
            decoy = random.choice(self.ascii_chars)
            # Splice both codepoints in at once: decoy first, deletion char right after
            cps[slot:slot] = (decoy, self.deletion_char)

        return "".join(map(chr, cps))

import json
import random

# Unicode bidirectional formatting characters (same values as the definitions
# earlier in this cell):
#   RLO  U+202E  Right-to-Left Override
#   PDF  U+202C  Pop Directional Formatting
#   RLI  U+2067  Right-to-Left Isolate
#   PDI  U+2069  Pop Directional Isolate
#   LRO  U+202D  Left-to-Right Override
#   LRI  U+2066  Left-to-Right Isolate
RLO, PDF, RLI, PDI, LRO, LRI = map(chr, (0x202E, 0x202C, 0x2067, 0x2069, 0x202D, 0x2066))

class BidiAttack:
    """
    Swaps adjacent letter pairs in the underlying string and wraps each swapped
    pair in Unicode bidirectional controls intended to make it render in the
    original order.
    """

    # Bidi control characters, kept on the class so it does not rely on
    # module-level globals being defined before use.
    _LRO = "\u202D"  # Left-to-Right Override
    _RLO = "\u202E"  # Right-to-Left Override
    _LRI = "\u2066"  # Left-to-Right Isolate
    _PDI = "\u2069"  # Pop Directional Isolate
    _PDF = "\u202C"  # Pop Directional Formatting

    def __init__(self, perturbation_budget):
        """
        Args:
            perturbation_budget: Maximum number of adjacent pairs swapped per
                call to ``perturb``.
        """
        self.perturbation_budget = perturbation_budget

    def _encode_swap_spoof(self, one, two):
        """
        Build a string in which ``two`` precedes ``one`` in the data while the
        surrounding controls are meant to display ``one`` followed by ``two``.

        Emitted sequence:
            LRO LRI RLO LRI two PDI LRI one PDI PDF PDI PDF
        """
        # Each character sits in its own LRI...PDI isolate; the inner RLO is
        # what reverses the visual order of the two isolates.
        return (
            self._LRO + self._LRI + self._RLO
            + self._LRI + two + self._PDI
            + self._LRI + one + self._PDI
            + self._PDF + self._PDI + self._PDF
        )

    def perturb(self, text):
        """
        Swaps random *non-overlapping* adjacent character pairs in the
        underlying data and uses Bidi controls to visually reverse the swap.

        Only pairs where both characters are alphabetic are eligible, and only
        eligible pairs count against ``perturbation_budget``. Text shorter than
        two characters, or a budget of zero or less, is returned unchanged.

        Returns:
            str: The perturbed text.
        """
        chars = list(text)
        n = len(chars)

        # Start indices of pairs that can actually be encoded (both alphabetic)
        candidates = [
            i for i in range(n - 1)
            if chars[i].isalpha() and chars[i + 1].isalpha()
        ]
        random.shuffle(candidates)

        swap_starts = set()   # first index of every chosen pair
        used = set()          # every index that already belongs to a chosen pair
        for pos in candidates:
            if len(swap_starts) >= self.perturbation_budget:
                break
            if pos in used or pos + 1 in used:
                continue
            swap_starts.add(pos)
            used.update((pos, pos + 1))

        # Build perturbed text
        pieces = []
        i = 0
        while i < n:
            if i in swap_starts:
                # Data: two, one (intended to display as: one, two)
                pieces.append(self._encode_swap_spoof(chars[i], chars[i + 1]))
                # Both characters of the pair were consumed
                i += 2
            else:
                pieces.append(chars[i])
                i += 1

        return "".join(pieces)

In [3]:
from datasets import load_dataset
import _pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Quick accuracy check of the prompt-based classifier on the first 100
# rows of the SST-2 training split.
sst2_llama = load_dataset("glue", "sst2", split="train")
data = sst2_llama.select(range(100)).to_list()

accurate = 0
for row in data:
    print(row)
    text, label = row["sentence"], row["label"]
    sentiment = get_llama_sentiment(text)
    # A hit is POSITIVE for label 1 or NEGATIVE for label 0; anything else
    # (including NEUTRAL) counts as a miss.
    if (label == 1 and sentiment == "POSITIVE") or (label == 0 and sentiment == "NEGATIVE"):
        accurate += 1

print(f"Llama SST-2 Accuracy: {accurate}/{len(data)} = {accurate/len(data):.4f}")

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}
{'sentence': 'contains no wit , only labored gags ', 'label': 0, 'idx': 1}
{'sentence': 'that loves its characters and communicates something rather beautiful about human nature ', 'label': 1, 'idx': 2}
{'sentence': 'remains utterly satisfied to remain the same throughout ', 'label': 0, 'idx': 3}
{'sentence': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', 'label': 0, 'idx': 4}
{'sentence': "that 's far too tragic to merit such superficial treatment ", 'label': 0, 'idx': 5}
{'sentence': 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ', 'label': 1, 'idx': 6}
{'sentence': 'of saucy ', 'label': 1, 'idx': 7}
{'sentence': "a depressed fifteen-year-old 's suicidal poetry ", 'label': 0, 'idx': 8}
{'sentence': "are more deeply thought through than in most ` right-thinking ' films ",

In [89]:


print("\n" + "="*80)
print("LLAMA 7B SENTIMENT CLASSIFICATION ATTACK EXPERIMENT")
print("="*80)

# Use subset of SST-2 data for faster testing with Llama
print("\nLoading SST-2 dataset...")
sst2_llama = load_dataset("glue", "sst2", split="validation")
num_samples_llama = 50   # Smaller subset for Llama due to slower inference
texts_llama = sst2_llama['sentence'][:num_samples_llama]

# One dict per (budget, attack) combination; turned into a DataFrame at the end
llama_results = []

# Perturbation budgets to test. The unperturbed baseline is computed separately
# below, so 0 is intentionally not in this list.
budgets_llama = [5, 10, 15, 20]

print(f"Testing {num_samples_llama} samples across budgets: {budgets_llama}\n")

# Attack names; 'combined' chains tag -> invisible -> deletion -> bidi
attack_types_llama = ['tag', 'invisible', 'deletion', 'bidi', 'combined']


def _build_sanitizer_llama(attack_name):
    """Return a fresh ``_pipeline.Pipeline`` holding the sanitizer(s) for ``attack_name``.

    For 'combined' the sanitizers are added in the order bidi, tag, invisible,
    deletion; every other name adds only its own sanitizer.
    """
    pipeline = _pipeline.Pipeline()
    if attack_name in ('bidi', 'combined'):
        pipeline.add_bidi_sanitizer()
    if attack_name in ('tag', 'combined'):
        pipeline.add_tag_sanitizer()
    if attack_name in ('invisible', 'combined'):
        pipeline.add_invisible_sanitizer()
    if attack_name in ('deletion', 'combined'):
        pipeline.add_deletion_sanitizer()
    return pipeline


def _percent_llama(hits, total):
    """Return ``hits / total`` as a percentage, or 0.0 when there are no samples."""
    return hits / total * 100 if total else 0.0


# Get baseline predictions for original texts FIRST
print("Getting baseline predictions for original texts...")
original_preds_llama = []

for i, text in enumerate(texts_llama):
    sentiment = get_llama_sentiment(text)
    original_preds_llama.append(sentiment)
    print(f"  Sample {i+1}: {text[:50]}... -> {sentiment}")

print(f"\nBaseline predictions: {original_preds_llama}\n")

# NOTE(review): the attacks draw from the global `random` state and no seed is
# set here, so repeated runs produce different perturbations -- consider seeding.
for budget in budgets_llama:
    if budget == 0:
        continue  # A zero budget is the baseline, which is already computed above

    print(f"\n=== Budget: {budget} ===")

    # Initialize attacks
    tag_attack_llama = TagAttack(perturbation_budget=budget)
    invisible_attack_llama = InvisibleCharAttack(perturbation_budget=budget)
    deletion_attack_llama = DeletionCharAttack(perturbation_budget=budget)
    bidi_attack_llama = BidiAttack(perturbation_budget=budget)

    # Attack name -> attacks applied in order
    attack_chains_llama = {
        'tag': (tag_attack_llama,),
        'invisible': (invisible_attack_llama,),
        'deletion': (deletion_attack_llama,),
        'bidi': (bidi_attack_llama,),
        'combined': (
            tag_attack_llama,
            invisible_attack_llama,
            deletion_attack_llama,
            bidi_attack_llama,
        ),
    }

    # Test each attack type
    for attack_name in attack_types_llama:
        print(f"  Testing {attack_name} attack...", end=" ", flush=True)

        perturbed_preds = []
        sanitized_preds = []

        for text in texts_llama:
            # Apply attack
            perturbed = text
            for attack in attack_chains_llama[attack_name]:
                perturbed = attack.perturb(perturbed)

            # Get prediction on perturbed text
            perturbed_preds.append(get_llama_sentiment(perturbed))

            # A new pipeline is built for every text, as Pipeline's internals
            # are not visible here (it may keep state between calls).
            sanitized = _build_sanitizer_llama(attack_name).sanitize(perturbed)

            # Get prediction on sanitized text
            sanitized_preds.append(get_llama_sentiment(sanitized))

        # Metrics are computed over the predictions actually collected, so they
        # stay correct even if fewer than num_samples_llama texts were loaded.
        total_llama = len(original_preds_llama)
        flipped = sum(
            1 for before, after in zip(original_preds_llama, perturbed_preds)
            if after != before
        )
        recovered = sum(
            1 for before, after in zip(original_preds_llama, sanitized_preds)
            if after == before
        )
        attack_success_rate = _percent_llama(flipped, total_llama)
        defense_recovery_rate = _percent_llama(recovered, total_llama)

        llama_results.append({
            'model': 'Llama-7B',
            'budget': budget,
            'attack': attack_name,
            'attack_success_rate': f"{attack_success_rate:.1f}%",
            'defense_recovery_rate': f"{defense_recovery_rate:.1f}%",
        })

        print(f"Attack Success: {attack_success_rate:.1f}% | Defense Recovery: {defense_recovery_rate:.1f}%")

# Create results dataframe
llama_results_df = pd.DataFrame(llama_results)
print("\n" + "="*80)
print("LLAMA 7B EXPERIMENT RESULTS")
print("="*80)
print(llama_results_df.to_string(index=False))

# Save to CSV
llama_csv_file = "llama7b_attack_defense_results.csv"
llama_results_df.to_csv(llama_csv_file, index=False)
print(f"\nResults saved to {llama_csv_file}")



LLAMA 7B SENTIMENT CLASSIFICATION ATTACK EXPERIMENT

Loading SST-2 dataset...
Testing 50 samples across budgets: [5, 10, 15, 20]

Getting baseline predictions for original texts...
  Sample 1: it 's a charming and often affecting journey . ... -> POSITIVE
  Sample 2: unflinchingly bleak and desperate ... -> NEGATIVE
  Sample 3: allows us to hope that nolan is poised to embark a... -> POSITIVE
  Sample 4: the acting , costumes , music , cinematography and... -> POSITIVE
  Sample 5: it 's slow -- very , very slow . ... -> NEGATIVE
  Sample 6: although laced with humor and a few fanciful touch... -> POSITIVE
  Sample 7: a sometimes tedious film . ... -> NEGATIVE
  Sample 8: or doing last year 's taxes with your ex-wife . ... -> NEGATIVE
  Sample 9: you do n't have to know about music to appreciate ... -> POSITIVE
  Sample 10: in exactly 89 minutes , most of which passed as sl... -> NEGATIVE
  Sample 11: the mesmerizing performances of the leads keep the... -> POSITIVE
  Sample 12: it tak