In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model.safetensors.index.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00001-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/config.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00003-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00002-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00007-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/README.md
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00008-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/tokenizer.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/tokenizer_config.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00005-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00006-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/special_tokens_map.json
/kaggle/input/gemma-2/transformer

In [2]:
import kagglehub

# Download latest version
path = kagglehub.model_download("google/gemma-2/transformers/gemma-2-9b")

print("Path to model files:", path)

Path to model files: /kaggle/input/gemma-2/transformers/gemma-2-9b/2


In [3]:
"""Evaluation metric for Santa 2024."""

import gc
import os
from math import exp
from collections import Counter
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import transformers
import torch

os.environ['OMP_NUM_THREADS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
PAD_TOKEN_LABEL_ID = torch.nn.CrossEntropyLoss().ignore_index
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class ParticipantVisibleError(Exception):
    pass


def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str,
    model_path: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b/2',
    load_in_8bit: bool = False,
    clear_mem: bool = False,
) -> float:
    """
    Calculates the mean perplexity of submitted text permutations compared to an original text.

    Parameters
    ----------
    solution : DataFrame
        DataFrame containing the original text in a column named 'text'.
        Includes a row ID column specified by `row_id_column_name`.

    submission : DataFrame
        DataFrame containing the permuted text in a column named 'text'.
        Must have the same row IDs as the solution.
        Includes a row ID column specified by `row_id_column_name`.

    row_id_column_name : str
        Name of the column containing row IDs.
        Ensures aligned comparison between solution and submission.

    model_path : str, default='/kaggle/input/gemma-2/transformers/gemma-2-9b/2'
        Path to the serialized LLM.

    load_in_8bit : bool, default=False
        Use 8-bit quantization for the model. Requires CUDA.

    clear_mem : bool, default=False
        Clear GPU memory after scoring by clearing the CUDA cache.
        Useful for testing.

    Returns
    -------
    float
        The mean perplexity score. Lower is better.

    Raises
    ------
    ParticipantVisibleError
        If the submission format is invalid or submitted strings are not valid permutations.

    Examples
    --------
    >>> import pandas as pd
    >>> model_path = "/kaggle/input/gemma-2/transformers/gemma-2-9b/2"
    >>> solution = pd.DataFrame({
    ...     'id': [0, 1],
    ...     'text': ["this is a normal english sentence", "the quick brown fox jumps over the lazy dog"]
    ... })
    >>> submission = pd.DataFrame({
    ...     'id': [0, 1],
    ...     'text': ["sentence english normal a is this", "lazy the over jumps fox brown quick the dog"]
    ... })
    >>> score(solution, submission, 'id', model_path=model_path, clear_mem=True) > 0
    True
    """
    # Check that each submitted string is a permutation of the solution string
    sol_counts = solution.loc[:, 'text'].str.split().apply(Counter)
    sub_counts = submission.loc[:, 'text'].str.split().apply(Counter)
    invalid_mask = sol_counts != sub_counts
    if invalid_mask.any():
        raise ParticipantVisibleError(
            'At least one submitted string is not a valid permutation of the solution string.'
        )

    # Calculate perplexity for the submitted strings
    sub_strings = [
        ' '.join(s.split()) for s in submission['text'].tolist()
    ]  # Split and rejoin to normalize whitespace
    scorer = PerplexityCalculator(
    model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2', 
    load_in_8bit=False
)
 # Initialize the perplexity calculator with a pre-trained model
    perplexities = scorer.get_perplexity(
        sub_strings
    )  # Calculate perplexity for each submitted string

    if clear_mem:
        # Just move on if it fails. Not essential if we have the score.
        try:
            scorer.clear_gpu_memory()
        except:
            print('GPU memory clearing failed.')

    return float(np.mean(perplexities))


class PerplexityCalculator:
    """
    Calculates perplexity of text using a pre-trained language model.

    Adapted from https://github.com/asahi417/lmppl/blob/main/lmppl/ppl_recurrent_lm.py

    Parameters
    ----------
    model_path : str
        Path to the pre-trained language model

    load_in_8bit : bool, default=False
        Use 8-bit quantization for the model. Requires CUDA.

    device_map : str, default="auto"
        Device mapping for the model.
    """

    def __init__(
        self,
        model_path: str,
        load_in_8bit: bool = False,
        device_map: str = 'auto',
    ):
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
        # Configure model loading based on quantization setting and device availability
        if load_in_8bit:
            if DEVICE.type != 'cuda':
                raise ValueError('8-bit quantization requires CUDA device')
            quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        else:
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16 if DEVICE.type == 'cuda' else torch.float32,
                device_map=device_map,
            )

        self.loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

        self.model.eval()

    def get_perplexity(self, input_texts: Union[str, List[str]], batch_size: int = 8, debug=False) -> Union[float, List[float]]:
        """
        Optimized perplexity calculation using batching for efficiency.
        """
        single_input = isinstance(input_texts, str)
        input_texts = [input_texts] if single_input else input_texts
        loss_list = []
        
        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i + batch_size]
            with torch.no_grad():
                model_inputs = self.tokenizer(
                        batch_texts,
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=512,  # Limit input sequence length
                        add_special_tokens=True,
                    ).to(DEVICE)

                
                
                output = self.model(**model_inputs)
                logits = output['logits']
                
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = model_inputs['input_ids'][..., 1:].contiguous()
                
                loss = self.loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)
                )
                
                batch_losses = loss.view(len(batch_texts), -1).sum(dim=1) / model_inputs['attention_mask'].sum(dim=1)
                loss_list.extend(batch_losses.cpu().tolist())
                
                if debug:
                    print(f"Processed batch {i // batch_size + 1} of {len(input_texts) // batch_size + 1}")
    
        ppl = [exp(i) for i in loss_list]
        return ppl[0] if single_input else ppl

    def clear_gpu_memory(self) -> None:
        """Enhanced memory clearing."""
        if torch.cuda.is_available():
            del self.model
            del self.tokenizer
            gc.collect()
            with torch.cuda.device(DEVICE):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()


In [4]:
import pandas as pd
import numpy as np
import random
import math
from copy import deepcopy


class Particle:
    def __init__(self, sequence, solution_df, scorer):
        self.sequence = sequence[:]
        self.velocity = [0] * len(sequence)
        self.best_sequence = sequence[:]
        self.best_perplexity = float('inf')
        self.current_perplexity = float('inf')
        self.scorer = scorer  # PerplexityCalculator instance
        self.evaluate(solution_df)

    def evaluate(self, solution_df):
        submission_df = pd.DataFrame({'id': [solution_df['id'].iloc[0]], 'text': [' '.join(self.sequence)]})
        self.current_perplexity = self.scorer.get_perplexity([' '.join(self.sequence)])[0]  # Using get_perplexity
        if self.current_perplexity < self.best_perplexity:
            self.best_perplexity = self.current_perplexity
            self.best_sequence = self.sequence[:]


class Ant:
    def __init__(self, sequence, solution_df, scorer):
        self.sequence = sequence[:]
        self.current_perplexity = float('inf')
        self.best_perplexity = float('inf')
        self.solution_df = solution_df
        self.scorer = scorer
        self.evaluate()

    def evaluate(self):
        submission_df = pd.DataFrame({'id': [self.solution_df['id'].iloc[0]], 'text': [' '.join(self.sequence)]})
        self.current_perplexity = self.scorer.get_perplexity([' '.join(self.sequence)])[0]
        if self.current_perplexity < self.best_perplexity:
            self.best_perplexity = self.current_perplexity

def simulated_annealing(sequence, solution_df, scorer, initial_temp=100, cooling_rate=0.95, max_iterations=20):
    """SA comme recherche locale"""
    current_sequence = sequence[:]
    best_sequence = sequence[:]

    submission_df = pd.DataFrame({'id': [solution_df['id'].iloc[0]], 'text': [' '.join(best_sequence)]})
    best_perplexity = scorer.get_perplexity([' '.join(best_sequence)])[0]  # Using get_perplexity
    current_temp = initial_temp

    for _ in range(max_iterations):
        # Générer un voisin par permutation
        neighbor = current_sequence[:]
        i, j = random.sample(range(len(neighbor)), 2)
        neighbor[i], neighbor[j] = neighbor[j], neighbor[i]

        # Évaluer le voisin
        neighbor_submission = pd.DataFrame({'id': [solution_df['id'].iloc[0]], 'text': [' '.join(neighbor)]})
        neighbor_perplexity = scorer.get_perplexity([' '.join(neighbor)])[0]  # Using get_perplexity

        # Accepter ou rejeter la solution
        delta = neighbor_perplexity - best_perplexity
        if delta < 0 or random.random() < math.exp(-delta / current_temp):
            current_sequence = neighbor[:]
            if neighbor_perplexity < best_perplexity:
                best_sequence = neighbor[:]
                best_perplexity = neighbor_perplexity

        current_temp *= cooling_rate

    return best_sequence, best_perplexity

def aco_with_sa(text, solution_df, scorer, n_ants=1, max_iterations=3, alpha=1.0, beta=2.0, evaporation_rate=0.5, pheromone_init=1.0):
    """ACO combiné avec SA"""
    words = text.split()
    n_words = len(words)

    # Initialiser la matrice des phéromones
    pheromones = np.full((n_words, n_words), pheromone_init)

    # Initialisation des meilleurs résultats
    global_best_sequence = None
    global_best_perplexity = float('inf')

    for iteration in range(max_iterations):
        print(f"Iteration {iteration + 1}/{max_iterations}, Best perplexity: {global_best_perplexity}")

        ants = []
        for _ in range(n_ants):
            # Construire une solution pour chaque fourmi
            sequence = []
            available_words = words[:]
            for _ in range(n_words):
                probabilities = []
                for word in available_words:
                    word_idx = words.index(word)
                    probabilities.append(pheromones[len(sequence), word_idx] ** alpha)
                probabilities = np.array(probabilities)
                probabilities /= probabilities.sum()
                chosen_word = random.choices(available_words, weights=probabilities, k=1)[0]
                sequence.append(chosen_word)
                available_words.remove(chosen_word)

            # Créer une fourmi
            ant = Ant(sequence, solution_df, scorer)
            ants.append(ant)

        # Recherche locale avec SA
        for ant in ants:
            improved_sequence, improved_perplexity = simulated_annealing(
                ant.sequence,
                solution_df,
                scorer,
                initial_temp=100,
                cooling_rate=0.95,
                max_iterations=10
            )
            if improved_perplexity < ant.best_perplexity:
                ant.sequence = improved_sequence[:]
                ant.best_perplexity = improved_perplexity

            # Mettre à jour les meilleurs résultats globaux
            if ant.best_perplexity < global_best_perplexity:
                global_best_sequence = ant.sequence[:]
                global_best_perplexity = ant.best_perplexity

        # Mettre à jour les phéromones
        pheromones *= (1 - evaporation_rate)
        for ant in ants:
            for i in range(len(ant.sequence) - 1):
                word_idx1 = words.index(ant.sequence[i])
                word_idx2 = words.index(ant.sequence[i + 1])
                pheromones[word_idx1, word_idx2] += 1.0 / ant.best_perplexity

    return global_best_sequence, global_best_perplexity


In [6]:
def optimize_sequences(batch_size=5):
    sample_submission = pd.read_csv("/kaggle/input/sample-submission/sample_submission.csv")
    results = []

    # Divide data into batches
    num_batches = (len(sample_submission) + batch_size - 1) // batch_size

    for batch_idx in range(num_batches):
        batch_start = batch_idx * batch_size
        batch_end = min((batch_idx + 1) * batch_size, len(sample_submission))
        batch_data = sample_submission.iloc[batch_start:batch_end]

        print(f"\nProcessing batch {batch_idx + 1}/{num_batches}")

        try:
            scorer = PerplexityCalculator(
                model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2', 
                load_in_8bit=False
            )

            for idx, row in batch_data.iterrows():
                specific_solution = pd.DataFrame({'id': [row['id']], 'text': [row['text']]})
                optimized_text, perplexity = aco_with_sa(
                    row['text'],
                    specific_solution,
                    scorer,
                    n_ants=10,
                    max_iterations=10
                )
                print(f"ID: {row['id']}, Final Perplexity: {perplexity}")
                results.append({'id': row['id'], 'text': optimized_text})

            # Intermediate save
            temp_df = pd.DataFrame(results)
            temp_df.to_csv(f"submission_temp_batch_{batch_idx+1}.csv", index=False)

        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Handle batch errors gracefully
            for idx, row in batch_data.iterrows():
                results.append({'id': row['id'], 'text': row['text']})

        # Clear GPU memory after each batch
        if torch.cuda.is_available():
            gc.collect()
            with torch.cuda.device(DEVICE):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()

    # Final save
    submission = pd.DataFrame(results)
    submission.to_csv("submission.csv", index=False)
    return submission


if __name__ == "__main__":
        print("Starting optimization...")
        final_submission = optimize_sequences()
        print("Optimization completed!")

Starting optimization...

Processing batch 1/2


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration 1/10, Best perplexity: inf
Iteration 2/10, Best perplexity: 497.56245599588635
Iteration 3/10, Best perplexity: 497.56245599588635
Iteration 4/10, Best perplexity: 497.56245599588635
Iteration 5/10, Best perplexity: 497.56245599588635
Iteration 6/10, Best perplexity: 488.2291821601048
Iteration 7/10, Best perplexity: 351.6909877849536
Iteration 8/10, Best perplexity: 351.6909877849536
Iteration 9/10, Best perplexity: 351.6909877849536
Iteration 10/10, Best perplexity: 351.6909877849536
ID: 0, Final Perplexity: 351.6909877849536
Iteration 1/10, Best perplexity: inf
Iteration 2/10, Best perplexity: 1272.0827503862215
Iteration 3/10, Best perplexity: 1272.0827503862215
Iteration 4/10, Best perplexity: 1272.0827503862215
Iteration 5/10, Best perplexity: 1272.0827503862215
Iteration 6/10, Best perplexity: 1272.0827503862215
Iteration 7/10, Best perplexity: 1084.7002618019842
Iteration 8/10, Best perplexity: 1084.7002618019842
Iteration 9/10, Best perplexity: 1084.7002618019842
Ite

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration 1/10, Best perplexity: inf
Iteration 2/10, Best perplexity: 723.9097493150708
Iteration 3/10, Best perplexity: 713.6122015598163
Iteration 4/10, Best perplexity: 707.6895578115276
Iteration 5/10, Best perplexity: 660.9927970947293
Iteration 6/10, Best perplexity: 660.9927970947293
Iteration 7/10, Best perplexity: 660.9927970947293
Iteration 8/10, Best perplexity: 660.9927970947293
Iteration 9/10, Best perplexity: 660.9927970947293
Iteration 10/10, Best perplexity: 660.9927970947293
ID: 5, Final Perplexity: 660.9927970947293
Optimization completed!


* The implementation combines Ant Colony Optimization (ACO) with Simulated Annealing (SA) to optimize sequences and minimize perplexity. ACO is used to construct candidate sequences based on pheromone trails that evolve with iterations, while SA refines individual solutions through a local search. 
* This hybrid approach leverages the exploratory power of ACO and the exploitative strength of SA. Observations include the processing of data in batches to handle large datasets, ensuring efficient optimization. 
* Each batch undergoes multiple iterations, with noticeable improvements in the "Best perplexity" metric for most IDs, indicating that the optimization is effectively reducing perplexity. 
* Final perplexities reveal lower values, showcasing enhanced model performance and suggesting the approach's success in generating better predictions. 
* The combination of ACO and SA allows for a robust balance between exploration and exploitation in the search space.