In this notebook we add a crossover function to the basic genetic algorithm shown [here](https://www.kaggle.com/code/aatiffraz/basic-genetic-search-algorithm). 

It quickly converges towards the minimum of sample 1. By choosing a random position for the crossover, the algorithm escapes local minima more easily, but converges at a slower rate.

**Versions:**
- version 1: Implementing the crossover function
- version 2: Reactivating mutation, extending the gene pool
- version 5: Implementing a more interesting mutation function
- version 6: Changing the crossover function

In [1]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

import math
import time

import gc
import os
from math import exp
from collections import Counter
from typing import List, Optional, Union

import transformers
import torch


# Read the Data

We read in the submission file of a notebook that currently performs well.

In [3]:
# Existing submission file; row 0 is overwritten with the result of the search in the last cell.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/diminutive-effort-tpu/submission.csv')

# Perplexity Function

In [4]:
# Copied from https://www.kaggle.com/code/cdeotte/brute-force-first-sample-perplexity-470

# Process-wide environment switches, set before any model or tokenizer is created.
os.environ.update({
    'OMP_NUM_THREADS': '1',
    'TOKENIZERS_PARALLELISM': 'false',
})
# Label value that torch's CrossEntropyLoss ignores; used to mask padding positions.
PAD_TOKEN_LABEL_ID = torch.nn.CrossEntropyLoss().ignore_index
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


class ParticipantVisibleError(Exception):
    """Custom exception type kept from the copied scoring code; it is not raised in the code shown here."""

class PerplexityCalculator:
    """
    Calculates perplexity of text using a pre-trained language model.

    Adapted from https://github.com/asahi417/lmppl/blob/main/lmppl/ppl_recurrent_lm.py

    Parameters
    ----------
    model_path : str
        Path to the pre-trained language model

    load_in_8bit : bool, default=False
        Load the model quantized via bitsandbytes. Requires CUDA.
        NOTE: despite the parameter name, the configuration built in
        ``__init__`` is a 4-bit (fp4) one.

    device_map : str, default="auto"
        Device mapping for the model.
    """

    def __init__(
        self,
        model_path: str,
        load_in_8bit: bool = False,
        device_map: str = 'auto',
    ):
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_path,padding_side="right")
        # Configure model loading based on quantization setting and device availability
        if load_in_8bit:
            if DEVICE.type != 'cuda':
                raise ValueError('8-bit quantization requires CUDA device')

            # 4-bit fp4 quantization without double quantization; compute in float16.
            quantization_config = transformers.BitsAndBytesConfig(
                load_in_4bit = True,
                bnb_4bit_quant_type = "fp4", #fp4 nf4
                bnb_4bit_use_double_quant = False,
                bnb_4bit_compute_dtype=torch.float16,
            )

            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        else:
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16 if DEVICE.type == 'cuda' else torch.float32,
                device_map=device_map,
            )

        # reduction='none' keeps one loss value per token so it can be averaged per text.
        self.loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

        self.model.eval()

    def get_perplexity(
        self, input_texts: Union[str, List[str]], batch_size: int = 32
    ) -> Union[float, List[float]]:
        """
        Calculates the perplexity of given texts.

        Parameters
        ----------
        input_texts : str or list of str
            A single string or a list of strings.

        batch_size : int, default=32
            Number of texts that are tokenized and scored together in one
            forward pass. Must be at least 1.

        Returns
        -------
        float or list of float
            A single perplexity value if input is a single string,
            or a list of perplexity values if input is a list of strings.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.

        Examples
        --------
        >>> import pandas as pd
        >>> model_path = "/kaggle/input/gemma-2/transformers/gemma-2-9b/2"
        >>> scorer = PerplexityCalculator(model_path=model_path)

        >>> submission = pd.DataFrame({
        ...     'id': [0, 1, 2],
        ...     'text': ["this is a normal english sentence", "thsi is a slihgtly misspelled zr4g sentense", "the quick brown fox jumps over the lazy dog"]
        ... })
        >>> perplexities = scorer.get_perplexity(submission["text"].tolist())
        >>> perplexities[0] < perplexities[1]
        True
        >>> perplexities[2] < perplexities[0]
        True

        >>> perplexities = scorer.get_perplexity(["this is a sentence", "another sentence"])
        >>> all(p > 0 for p in perplexities)
        True

        >>> scorer.clear_gpu_memory()
        """
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')

        single_input = isinstance(input_texts, str)
        input_texts = [input_texts] if single_input else input_texts

        loss_list = []

        # Walk over the texts in consecutive slices of at most batch_size items.
        for batch_start in range(0, len(input_texts), batch_size):
            input_batch = input_texts[batch_start:batch_start + batch_size]

            with torch.no_grad():

                # Explicitly add sequence boundary tokens to the text
                text_with_special = [f"{self.tokenizer.bos_token}{text}{self.tokenizer.eos_token}" for text in input_batch]

                # Tokenize; special tokens were already added by hand above
                model_inputs = self.tokenizer(
                    text_with_special,
                    return_tensors='pt',
                    add_special_tokens=False,
                    padding=True
                )

                if 'token_type_ids' in model_inputs:
                    model_inputs.pop('token_type_ids')

                model_inputs = {k: v.to(DEVICE) for k, v in model_inputs.items()}

                # Get model output
                output = self.model(**model_inputs, use_cache=False)
                logits = output['logits']

                # Work on a copy so that masking the padding does not overwrite input_ids
                label = model_inputs['input_ids'].clone()
                label[label == self.tokenizer.pad_token_id] = PAD_TOKEN_LABEL_ID

                # Shift logits and labels for calculating loss
                shift_logits = logits[..., :-1, :].contiguous()  # Drop last prediction
                shift_labels = label[..., 1:].contiguous()  # Drop first input

                # Calculate token-wise loss
                loss = self.loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)
                )

                # Back to one row per text, then average over the non-padding targets only
                loss = loss.view(len(logits), -1)
                valid_length = (shift_labels != PAD_TOKEN_LABEL_ID).sum(dim=-1)
                loss = torch.sum(loss, -1) / valid_length

                loss_list += loss.cpu().tolist()

        # Perplexity is the exponential of the mean token loss
        ppl = [exp(i) for i in loss_list]

        return ppl[0] if single_input else ppl

    def clear_gpu_memory(self) -> None:
        """Clears GPU memory by deleting references and emptying caches."""
        if not torch.cuda.is_available():
            return

        # Delete model and tokenizer if they exist
        if hasattr(self, 'model'):
            del self.model
        if hasattr(self, 'tokenizer'):
            del self.tokenizer

        # Run garbage collection
        gc.collect()

        # Clear CUDA cache and reset memory stats
        with DEVICE:
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            torch.cuda.reset_peak_memory_stats()


# Shared scorer instance used by the genetic search below (default, non-quantized loading path).
scorer = PerplexityCalculator(model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2')

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

# Genetic Algorithm

Here we define the crossover function and the genetic algorithm.

## Crossover Function

The crossover function generates two offspring from two parent sequences. It typically selects a small segment of 2 to 5 words (occasionally a larger segment) from each parent and inserts this segment into the corresponding position (or into a random position if random_position == True) in the offspring. The remaining positions in the offspring are filled with the words from the other parent, maintaining their original order.

In [5]:
def order_crossover(p1, p2, random_position = False, verbose = False):
    """
    Order crossover for two word lists that are permutations of each other.

    A contiguous segment is copied from each parent into its child (at the same
    position, or at a random one if ``random_position`` is True). The open
    positions of a child are then filled with the words of the *other* parent
    in their original order, skipping one occurrence of every word that the
    copied segment already supplies (so repeated words are handled correctly).

    Parameters
    ----------
    p1, p2 : list of str
        Parent sequences with the same number of words. They are not modified.
    random_position : bool, default=False
        Insert the segment at a random position instead of its original one.
    verbose : bool, default=False
        Print the chosen segment and target position.

    Returns
    -------
    (list, list)
        The two children as new lists.

    Raises
    ------
    ValueError
        If the parents do not have the same number of words.
    """
    # Ensure both parents have the same number of words before using them
    if len(p1) != len(p2):
        print(f"parent 1: {p1}")
        print(f"parent2: {p2}")
        raise ValueError("Both parents must have the same number of words.")

    size = len(p1)
    # With fewer than two words there is nothing to recombine
    if size < 2:
        return list(p1), list(p2)

    child1, child2 = [None]*size, [None]*size

    # Randomly select the segment length
    if random.random() < 0.2: #in 20% of the cases a larger crossover
        crossover_size = random.randint(2, max(2, size - 1))
    else: #small crossover of 2 to 5 words
        crossover_size = random.randint(2, 5)
    # Keep at least one position open for the other parent, also for short sequences
    crossover_size = max(1, min(crossover_size, size - 1))

    # randint is inclusive on both ends, so the last valid start is size - crossover_size
    start = random.randint(0, size - crossover_size)
    end = start + crossover_size
    if verbose:
        print(f"Selected subsequence length {crossover_size} from index {start} to {end-1}")

    # Copy the subsequence to the children, optionally at a random position
    if random_position:
        start_new = random.randint(0, size - crossover_size)
        end_new = start_new + crossover_size
    else:
        start_new = start
        end_new = end
    if verbose:
        print(f"Selected new position is {start_new}")
    child1[start_new:end_new] = p1[start:end]
    child2[start_new:end_new] = p2[start:end]

    # Fill the remaining positions
    def fill_remaining(child, parent):
        # Words already placed by the segment; each one is skipped once in the donor parent
        pending = [word for word in child if word is not None]
        donor_idx = 0
        for slot in range(size):
            if child[slot] is not None:
                continue
            while donor_idx < size and parent[donor_idx] in pending:
                pending.remove(parent[donor_idx])
                donor_idx += 1
            if donor_idx < size:
                child[slot] = parent[donor_idx]
                donor_idx += 1

    fill_remaining(child1, p2)
    fill_remaining(child2, p1)

    return child1, child2

Now just a small test that crossover is working: 

In [6]:
# Example parents
# Example parents
p1 = "hohoho this is a merry santa crossover test in a genetic workshop"
p2 = "crossover workshop hohoho in a this test merry genetic is santa a"

# Perform crossover
child1, child2 = order_crossover(p1.split(), p2.split(), random_position = True, verbose = True)

# Both children must be reorderings of the parents' words (the repeated "a" included)
assert sorted(child1) == sorted(p1.split()), "child1 lost or duplicated a word"
assert sorted(child2) == sorted(p2.split()), "child2 lost or duplicated a word"

print("Child1:", ' '.join(child1))
print("Child2:", ' '.join(child2))

Selected subsequence length 2 from index 4 to 5
Selected new position is 9
Child1: crossover workshop hohoho in a this test genetic is merry santa a
Child2: hohoho is merry santa crossover test in a genetic a this workshop


## Basic Genetic Algorithm

Now we define the Genetic Algorithm as in [this notebook](https://www.kaggle.com/code/aatiffraz/basic-genetic-search-algorithm), but add the crossover function. We comment out some of the functionality of the original code, but we reintroduce the possibility of mutations. We define the mutation function separately. Mutation keeps the gene pool diverse, and its rate can be adjusted with the parameter `mutation_rate`.

In [7]:
def mutate(child, mutation_types = ('swap2', 'swap3', 'swap', 'inversion', 'scramble')):
    """
    Apply one randomly chosen mutation to a list of words, in place.

    Parameters
    ----------
    child : list
        Sequence to mutate. It is modified in place.
    mutation_types : sequence of str
        Mutations to choose from. Supported names:
        'swap2' (exchange two words), 'swap3' (rotate three words),
        'swap' (1 to n//3 random exchanges), 'inversion' (reverse a slice),
        'scramble' (shuffle a slice) and 'shuffle' (shuffle everything).
        'shuffle' is not part of the default selection.

    Returns
    -------
    list
        The same list object that was passed in.

    Raises
    ------
    ValueError
        If an unsupported mutation name is drawn.
    """
    n = len(child)
    # Nothing can be reordered in an empty or one-word sequence
    if n < 2:
        return child

    mutation_type = random.choice(mutation_types)
    # A three-way rotation needs three positions; fall back to a plain swap
    if mutation_type == "swap3" and n < 3:
        mutation_type = "swap2"

    if mutation_type == "swap2":
        i,j = random.sample(range(n),2)
        child[i], child[j] = child[j], child[i]
    elif mutation_type == "swap3":
        i,j,k = random.sample(range(n),3)
        child[i], child[j], child[k] = child[j], child[k], child[i]
    elif mutation_type == 'swap':
        # max(1, ...) keeps the range valid for sequences shorter than three words
        mutation_size = random.randint(1, max(1, n//3))
        for _ in range(mutation_size):
            i, j = random.sample(range(n), 2)
            child[i], child[j] = child[j], child[i]
    elif mutation_type == 'inversion':
        start, end = sorted(random.sample(range(n), 2))
        child[start:end+1] = reversed(child[start:end+1])
    elif mutation_type == 'scramble':
        start, end = sorted(random.sample(range(n), 2))
        subset = child[start:end+1]
        random.shuffle(subset)
        child[start:end+1] = subset
    elif mutation_type == "shuffle":
        random.shuffle(child)
    else:
        raise ValueError(f"Unknown mutation type: {mutation_type}")
    return child

I noticed that it is more practical to deal with single words instead of the full text, as this makes it possible to keep well-working word combinations together. I therefore changed the algorithm accordingly. The one step that does not work with word combinations is the step with the set, where only unique solutions are scored. It needs to be commented out if the aim is to work with word combinations.

In [8]:
#copied from https://www.kaggle.com/code/aatiffraz/basic-genetic-search-algorithm
def genetic_search(population, scorer, iterations, population_size, top_k, mutation_rate = 0.2, 
                   random_position = False, verbose = False):
    """
    Genetic search for a low-perplexity ordering of a list of words.

    Each generation keeps the ``top_k`` best-scoring sequences plus a random
    sample of ``top_k // 3`` of the others as parents, creates children with
    ``order_crossover`` and ``mutate``, removes duplicates and scores the result.
    If the three best scores did not change since the previous generation,
    every child is mutated once more.

    Parameters
    ----------
    population : list of list of str
        Start sequences (lists of words).
    scorer : object
        Must provide ``get_perplexity(list_of_str, batch_size=...)`` returning
        one score per text; lower is better.
    iterations : int
        Number of generations.
    population_size : int
        Number of crossovers per generation (each yields two children).
    top_k : int
        Number of best sequences kept as parents.
    mutation_rate : float, default=0.2
        Probability with which each child is mutated.
    random_position : bool, default=False
        Passed on to ``order_crossover``.
    verbose : bool, default=False
        Print every new best sequence.

    Returns
    -------
    (list of str, float)
        Best sequence found and its score. Unlike the per-generation selection
        alone, this also takes the last scored generation into account.

    Raises
    ------
    ValueError
        If ``population`` is empty.
    """
    if len(population) == 0:
        raise ValueError("population must contain at least one sequence.")

    print(f"Starting \n  ------------------------------------------------------\n")
    start_time = time.time()

    scores = scorer.get_perplexity([" ".join(sequence) for sequence in population], batch_size=2)

    best_score = float('inf')
    best_sequence = None
    previous_top_scores = None  # three best scores of the previous generation

    for iteration in range(iterations):
        ranking = np.argsort(scores)
        # Select few top sequences
        elite_indices = ranking[:top_k]
        # Why should only the fittest make it? Come on, the others want to have fun too:
        remaining_indices = ranking[top_k:]
        # Add a random sample of up to top_k // 3 of the remaining indices
        random_sample = random.sample(list(remaining_indices), min(top_k // 3, len(remaining_indices)))
        top_indices = np.concatenate((elite_indices, random_sample)).astype(int)

        top_sequences = [population[i] for i in top_indices]
        top_scores = [scores[i] for i in top_indices]

        if top_scores[0] < best_score:
            best_score = top_scores[0]
            best_sequence = top_sequences[0]
            if verbose:
                print(f" Best new sequence: {best_sequence}")

        print(f"Best Scores: {top_scores[0:3]} on {iteration}th iteration in {time.time() - start_time}s")

        new_population = []
        for _ in range(population_size):
            # Choose two different random top sequences as parents
            parent1 = random.choice(top_sequences)
            other_parents = [sequence for sequence in top_sequences if sequence != parent1]
            # If every selected sequence is identical there is no distinct partner;
            # cross the parent with itself instead of retrying forever.
            parent2 = random.choice(other_parents) if other_parents else parent1

            child1, child2 = order_crossover(parent1, parent2, random_position = random_position)
            # Each child mutates with probability mutation_rate
            if random.random() < mutation_rate:
                mutate(child1)
            if random.random() < mutation_rate:
                mutate(child2)
            new_population.append(child1)
            new_population.append(child2)

        # Check for plateau, aggressively introduce more random search
        # Environment can be hard sometimes. We need mutations.
        if top_scores[0:3] == previous_top_scores:
            print_sequence = " ".join(top_sequences[0])
            print(f"Oh no, plateau at iteration {iteration}. Best sequence is {print_sequence}. Our population mutates.")
            new_population = [mutate(child) for child in new_population]

        # Keep each distinct solution once. Tuples are used as keys, so multi-word
        # entries stay intact, and dict.fromkeys keeps the insertion order.
        population = [list(solution) for solution in dict.fromkeys(tuple(child) for child in new_population)]

        previous_top_scores = top_scores[0:3]
        scores = scorer.get_perplexity([" ".join(sequence) for sequence in population], batch_size=4)

    # The last scored generation has not been compared with the best so far
    if len(scores) > 0:
        final_best = int(np.argmin(scores))
        if scores[final_best] < best_score:
            best_score = scores[final_best]
            best_sequence = population[final_best]

    return best_sequence, best_score

# Let's Go

We choose sample 1 and a shuffle of sample 1 as starting points. The second parent should be very different from the first.

In [9]:
# Search settings
population_size = 200
top_k = 40
iterations = 30
mutation_rate = 0.2
random_position = True

# Two start orderings of the sample-1 words: the alphabetical one and a shuffled one
santa1 = "advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge"
santa2 = "family ornament scrooge chimney gingerbread advent reindeer fireplace mistletoe elf"
population = [santa1.split(), santa2.split()]

best_sequence, best_score = genetic_search(
    population,
    scorer,
    iterations=iterations,
    population_size=population_size,
    top_k=top_k,
    mutation_rate=mutation_rate,
    random_position=random_position,
)

print_sequence = " ".join(best_sequence)
print(f"Sample {1}: Best perplexity={best_score:.2f}, Best permutation: {print_sequence}")

# Store the result in the first row and write the submission file
df.loc[0, "text"] = print_sequence
df.loc[0, "score"] = best_score
df.to_csv("submission.csv", index=False)

Starting 
  ------------------------------------------------------

Best Scores: [3887.9021574548156, 7853.778244357167] on 0th iteration in 0.6811323165893555s
Best Scores: [1430.2792730137226, 1577.0026226114692, 1608.1062123274673] on 1th iteration in 7.68685507774353s
Best Scores: [975.3657218509629, 1223.3826929169447, 1242.6481670549958] on 2th iteration in 19.950644731521606s
Best Scores: [864.1261369598841, 898.5490089196367, 927.0720162676876] on 3th iteration in 32.306334495544434s
Best Scores: [571.1525003973912, 639.6604572352473, 672.9821121368258] on 4th iteration in 44.67410850524902s
Best Scores: [571.1525003973912, 622.4067077254782, 627.2883040361144] on 5th iteration in 57.32476496696472s
Best Scores: [538.6481096972173, 562.2976016279617, 571.1525003973912] on 6th iteration in 69.69499826431274s
Best Scores: [538.6481096972173, 557.921766975976, 566.7077563764824] on 7th iteration in 81.84695410728455s
Best Scores: [528.2297121579952, 557.921766975976, 562.297601627