In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import kagglehub

# Download latest version
path = kagglehub.model_download("google/gemma-2/transformers/gemma-2-9b")

print("Path to model files:", path)

Path to model files: /kaggle/input/gemma-2/transformers/gemma-2-9b/2


In [None]:
"""Evaluation metric for Santa 2024."""

import gc
import os
from math import exp
from collections import Counter
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import transformers
import torch

os.environ['OMP_NUM_THREADS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
PAD_TOKEN_LABEL_ID = torch.nn.CrossEntropyLoss().ignore_index
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class ParticipantVisibleError(Exception):
    pass


def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str,
    model_path: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b/2',
    load_in_8bit: bool = False,
    clear_mem: bool = False,
) -> float:
    """
    Calculates the mean perplexity of submitted text permutations compared to an original text.

    Parameters
    ----------
    solution : DataFrame
        DataFrame containing the original text in a column named 'text'.
        Includes a row ID column specified by `row_id_column_name`.

    submission : DataFrame
        DataFrame containing the permuted text in a column named 'text'.
        Must have the same row IDs as the solution.
        Includes a row ID column specified by `row_id_column_name`.

    row_id_column_name : str
        Name of the column containing row IDs.
        Ensures aligned comparison between solution and submission.

    model_path : str, default='/kaggle/input/gemma-2/transformers/gemma-2-9b/2'
        Path to the serialized LLM.

    load_in_8bit : bool, default=False
        Use 8-bit quantization for the model. Requires CUDA.

    clear_mem : bool, default=False
        Clear GPU memory after scoring by clearing the CUDA cache.
        Useful for testing.

    Returns
    -------
    float
        The mean perplexity score. Lower is better.

    Raises
    ------
    ParticipantVisibleError
        If the submission format is invalid or submitted strings are not valid permutations.

    Examples
    --------
    >>> import pandas as pd
    >>> model_path = "/kaggle/input/gemma-2/transformers/gemma-2-9b/2"
    >>> solution = pd.DataFrame({
    ...     'id': [0, 1],
    ...     'text': ["this is a normal english sentence", "the quick brown fox jumps over the lazy dog"]
    ... })
    >>> submission = pd.DataFrame({
    ...     'id': [0, 1],
    ...     'text': ["sentence english normal a is this", "lazy the over jumps fox brown quick the dog"]
    ... })
    >>> score(solution, submission, 'id', model_path=model_path, clear_mem=True) > 0
    True
    """
    # Check that each submitted string is a permutation of the solution string
    sol_counts = solution.loc[:, 'text'].str.split().apply(Counter)
    sub_counts = submission.loc[:, 'text'].str.split().apply(Counter)
    invalid_mask = sol_counts != sub_counts
    if invalid_mask.any():
        raise ParticipantVisibleError(
            'At least one submitted string is not a valid permutation of the solution string.'
        )

    # Calculate perplexity for the submitted strings
    sub_strings = [
        ' '.join(s.split()) for s in submission['text'].tolist()
    ]  # Split and rejoin to normalize whitespace
    scorer = PerplexityCalculator(
    model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2', 
    load_in_8bit=False
)
 # Initialize the perplexity calculator with a pre-trained model
    perplexities = scorer.get_perplexity(
        sub_strings
    )  # Calculate perplexity for each submitted string

    if clear_mem:
        # Just move on if it fails. Not essential if we have the score.
        try:
            scorer.clear_gpu_memory()
        except:
            print('GPU memory clearing failed.')

    return float(np.mean(perplexities))


class PerplexityCalculator:
    """
    Calculates perplexity of text using a pre-trained language model.

    Adapted from https://github.com/asahi417/lmppl/blob/main/lmppl/ppl_recurrent_lm.py

    Parameters
    ----------
    model_path : str
        Path to the pre-trained language model

    load_in_8bit : bool, default=False
        Use 8-bit quantization for the model. Requires CUDA.

    device_map : str, default="auto"
        Device mapping for the model.
    """

    def __init__(
        self,
        model_path: str,
        load_in_8bit: bool = False,
        device_map: str = 'auto',
    ):
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
        # Configure model loading based on quantization setting and device availability
        if load_in_8bit:
            if DEVICE.type != 'cuda':
                raise ValueError('8-bit quantization requires CUDA device')
            quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        else:
            self.model = transformers.AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16 if DEVICE.type == 'cuda' else torch.float32,
                device_map=device_map,
            )

        self.loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

        self.model.eval()

    def get_perplexity(self, input_texts: Union[str, List[str]], batch_size: int = 8, debug=False) -> Union[float, List[float]]:
        """
        Optimized perplexity calculation using batching for efficiency.
        """
        single_input = isinstance(input_texts, str)
        input_texts = [input_texts] if single_input else input_texts
        loss_list = []
        
        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i + batch_size]
            with torch.no_grad():
                model_inputs = self.tokenizer(
                        batch_texts,
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=512,  # Limit input sequence length
                        add_special_tokens=True,
                    ).to(DEVICE)

                
                
                output = self.model(**model_inputs)
                logits = output['logits']
                
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = model_inputs['input_ids'][..., 1:].contiguous()
                
                loss = self.loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)
                )
                
                batch_losses = loss.view(len(batch_texts), -1).sum(dim=1) / model_inputs['attention_mask'].sum(dim=1)
                loss_list.extend(batch_losses.cpu().tolist())
                
                if debug:
                    print(f"Processed batch {i // batch_size + 1} of {len(input_texts) // batch_size + 1}")
    
        ppl = [exp(i) for i in loss_list]
        return ppl[0] if single_input else ppl

    def clear_gpu_memory(self) -> None:
        """Enhanced memory clearing."""
        if torch.cuda.is_available():
            del self.model
            del self.tokenizer
            gc.collect()
            with torch.cuda.device(DEVICE):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()


In [4]:
import pandas as pd

# Load the dataset with perplexity 390
input_file = "/kaggle/input/dataset39/submission (5).csv"
data = pd.read_csv(input_file)

In [None]:
import pandas as pd
import random
import numpy as np

class SimulatedAnnealing:
    def __init__(self, initial_temperature, cooling_rate, max_iterations, random_state=42):
        random.seed(random_state)
        np.random.seed(random_state)
        self.initial_temperature = initial_temperature
        self.cooling_rate = cooling_rate
        self.max_iterations = max_iterations

    def swap(self, sequence):
        """Enhanced neighbor generation: Reversals and adaptive swaps."""
        neighbor = sequence[:]
        if random.random() < 0.5:  # 50% chance to reverse a segment
            i, j = sorted(random.sample(range(len(neighbor)), 2))
            neighbor[i:j] = reversed(neighbor[i:j])
        else:  # Otherwise, swap multiple pairs
            num_swaps = random.randint(2, max(5, int(len(neighbor) * 0.1)))
            for _ in range(num_swaps):
                i, j = random.sample(range(len(neighbor)), 2)
                neighbor[i], neighbor[j] = neighbor[j], neighbor[i]
        return neighbor

    def solve(self, text, scorer):
        words = text.split()
        current_sequence = words[:]
        best_sequence = words[:]
        current_perplexity = scorer.get_perplexity(' '.join(current_sequence))
        best_perplexity = current_perplexity
        temperature = self.initial_temperature
        log_energies = []

        while temperature > 0.1:  # Lower final temperature threshold
            for _ in range(self.max_iterations):
                neighbor = self.swap(current_sequence)
                neighbor_perplexity = scorer.get_perplexity(' '.join(neighbor))
                delta = neighbor_perplexity - current_perplexity

                if delta < 0 or random.random() < np.exp(-delta / temperature):
                    current_sequence = neighbor
                    current_perplexity = neighbor_perplexity

                    if current_perplexity < best_perplexity:
                        best_sequence = current_sequence
                        best_perplexity = current_perplexity

            log_energies.append(best_perplexity)
            print(f"Temperature: {temperature:.2f}, Best Perplexity: {best_perplexity}")
            temperature *= self.cooling_rate

        return ' '.join(best_sequence), best_perplexity

            

def optimize_sequences_with_sa(input_file, output_file):
    """
    Optimize text sequences using Simulated Annealing and save results in a CSV.

    Parameters:
    - input_file (str): Path to the input CSV file with 'id' and 'text'.
    - output_file (str): Path to save the output CSV file.
    """
    # Load the dataset
    data = pd.read_csv(input_file)
    results = []

    # Initialize Simulated Annealing with specific parameters
    sa = SimulatedAnnealing(
        initial_temperature=50.0,  # Try a higher initial temperature
        cooling_rate=0.98,  # Slower cooling
        max_iterations=100,  # Increase the iterations
        random_state=42
    )

    # Initialize the scorer
    scorer = PerplexityCalculator(
        model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2',
        load_in_8bit=False
    )

    # Optimize each text sequence
    for idx, row in data.iterrows():
        print(f"Processing ID: {row['id']}")
        optimized_text, best_perplexity = sa.solve(row['text'], scorer)
        results.append({'id': row['id'], 'text': optimized_text})

    # Save the results to a CSV file
    submission = pd.DataFrame(results)
    submission.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

if __name__ == "__main__":
    input_file = "/kaggle/input/dataset39/submission (5).csv"
    output_file = "/kaggle/working/submission_refined.csv"
    optimize_sequences_with_sa(input_file, output_file)




    

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Processing ID: 0
Temperature: 50.00, Best Perplexity: 292.68395088034254
Temperature: 49.00, Best Perplexity: 292.68395088034254
Temperature: 48.02, Best Perplexity: 292.68395088034254
Temperature: 47.06, Best Perplexity: 292.68395088034254
Temperature: 46.12, Best Perplexity: 292.68395088034254
Temperature: 45.20, Best Perplexity: 292.68395088034254
Temperature: 44.29, Best Perplexity: 292.68395088034254
Temperature: 43.41, Best Perplexity: 292.68395088034254
Temperature: 42.54, Best Perplexity: 292.68395088034254
Temperature: 41.69, Best Perplexity: 292.68395088034254
Temperature: 40.85, Best Perplexity: 292.68395088034254
Temperature: 40.04, Best Perplexity: 292.68395088034254
Temperature: 39.24, Best Perplexity: 292.68395088034254
Temperature: 38.45, Best Perplexity: 292.68395088034254
Temperature: 37.68, Best Perplexity: 292.68395088034254
Temperature: 36.93, Best Perplexity: 292.68395088034254
Temperature: 36.19, Best Perplexity: 292.68395088034254
Temperature: 35.47, Best Perple

In [3]:
import pandas as pd  
import numpy as np   
import random        
import math          
from copy import deepcopy  

# Définition de la classe Ant (Fourmi)
class Ant:
    def __init__(self, sequence, scorer):
        # Initialisation de la fourmi avec une séquence de mots et un évaluateur (scorer)
        self.sequence = sequence[:]  # Séquence actuelle de la fourmi
        self.best_sequence = sequence[:]  # Meilleure séquence trouvée par la fourmi
        self.best_perplexity = float('inf')  # Meilleure perplexité (initialisée à l'infini)
        self.scorer = scorer  # Évaluateur pour calculer la perplexité
        self.evaluate()  # Évaluation initiale de la séquence

    def evaluate(self):
        """Évalue la séquence et met à jour la meilleure solution."""
        self.current_perplexity = self.scorer.get_perplexity(' '.join(self.sequence))  # Calcule la perplexité
        if self.current_perplexity < self.best_perplexity:  # Si la perplexité est meilleure
            self.best_perplexity = self.current_perplexity  # Met à jour la meilleure perplexité
            self.best_sequence = self.sequence[:]  # Met à jour la meilleure séquence

# Définition de la fonction de Recuit Simulé (Simulated Annealing)
def simulated_annealing(sequence, scorer, initial_temp=150, cooling_rate=0.95, max_iterations=20):
    """Recuit Simulé pour la recherche locale."""
    current_sequence = sequence[:]  # Séquence actuelle
    best_sequence = sequence[:]  # Meilleure séquence trouvée
    best_perplexity = scorer.get_perplexity(' '.join(best_sequence))  # Calcule la perplexité initiale
    current_temp = initial_temp  # Température initiale

    for _ in range(max_iterations):
        # Génère un voisin en échangeant deux mots aléatoirement
        neighbor = current_sequence[:]
        i, j = random.sample(range(len(neighbor)), 2)
        neighbor[i], neighbor[j] = neighbor[j], neighbor[i]

        # Évalue le voisin
        neighbor_perplexity = scorer.get_perplexity(' '.join(neighbor))

        # Accepte ou rejette le voisin en fonction de la perplexité et de la température
        delta = neighbor_perplexity - best_perplexity
        if delta < 0 or random.random() < math.exp(-delta / current_temp):
            current_sequence = neighbor[:]
            if neighbor_perplexity < best_perplexity:
                best_sequence = neighbor[:]
                best_perplexity = neighbor_perplexity

        current_temp *= cooling_rate  # Refroidissement de la température

    return best_sequence, best_perplexity  # Retourne la meilleure séquence et sa perplexité

# Définition de la fonction ACO combinée avec le Recuit Simulé
def aco_with_sa(text, scorer, n_ants=25, max_iterations=20, alpha=1.0,
                beta=2.0, evaporation_rate=0.5, pheromone_init=1.0):
    """ACO combiné avec SA pour l'optimisation de séquences."""
    # Divise le texte en mots
    words = text.split()
    # Nombre de mots dans le texte
    n_words = len(words)

    # Initialisation de la matrice de phéromones
    # Chaque transition entre deux mots a une quantité initiale de phéromones (pheromone_init)
    pheromones = np.full((n_words, n_words), pheromone_init)

    # Initialisation de la meilleure solution globale
    # Commence avec la séquence originale des mots
    global_best_sequence = words[:]
    # Calcule la perplexité de la séquence originale
    global_best_perplexity = scorer.get_perplexity(' '.join(global_best_sequence))

    # Boucle principale sur le nombre d'itérations
    for iteration in range(max_iterations):
        print(f"Iteration {iteration + 1}/{max_iterations}, Best perplexity: {global_best_perplexity}")

        # Liste pour stocker les fourmis
        ants = []
        # Crée une fourmi pour chaque fourmi dans la colonie
        for _ in range(n_ants):
            # Construction d'une solution pour chaque fourmi
            sequence = []
            # Liste des mots disponibles pour la construction de la séquence
            available_words = words[:]
            for _ in range(n_words):
                probabilities = []
                # Calcule les probabilités de transition pour chaque mot disponible
                for word in available_words:
                    word_idx = words.index(word)
                    # Probabilité basée sur les phéromones et l'heuristique (alpha et beta)
                    probabilities.append(pheromones[len(sequence), word_idx] ** alpha)
                # Normalisation des probabilités pour qu'elles somment à 1
                probabilities = np.array(probabilities)
                probabilities /= probabilities.sum()
                # Choix d'un mot en fonction des probabilités
                chosen_word = random.choices(available_words, weights=probabilities, k=1)[0]
                # Ajoute le mot choisi à la séquence
                sequence.append(chosen_word)
                # Retire le mot choisi des mots disponibles
                available_words.remove(chosen_word)

            # Crée une fourmi avec la séquence construite et l'évaluateur (scorer)
            ant = Ant(sequence, scorer)
            # Ajoute la fourmi à la liste des fourmis
            ants.append(ant)

        # Recherche locale avec Recuit Simulé pour chaque fourmi
        for ant in ants:
            # Applique le Recuit Simulé pour améliorer la séquence de la fourmi
            improved_sequence, improved_perplexity = simulated_annealing(
                ant.sequence,
                scorer,
                initial_temp=150,
                cooling_rate=0.98,
                max_iterations=10
            )
            # Si la perplexité améliorée est meilleure, met à jour la séquence de la fourmi
            if improved_perplexity < ant.best_perplexity:
                ant.sequence = improved_sequence[:]
                ant.best_perplexity = improved_perplexity

            # Met à jour la meilleure solution globale si la fourmi a trouvé une meilleure séquence
            if ant.best_perplexity < global_best_perplexity:
                global_best_sequence = ant.sequence[:]
                global_best_perplexity = ant.best_perplexity

        # Mise à jour des phéromones
        # Évaporation des phéromones : réduit toutes les phéromones par un facteur (1 - evaporation_rate)
        pheromones *= (1 - evaporation_rate)
        # Ajout de phéromones sur les chemins utilisés par les fourmis
        for ant in ants:
            for i in range(len(ant.sequence) - 1):
                # Indices des mots dans la séquence
                word_idx1 = words.index(ant.sequence[i])
                word_idx2 = words.index(ant.sequence[i + 1])
                # Ajoute des phéromones proportionnellement à la qualité de la séquence (1 / perplexité)
                pheromones[word_idx1, word_idx2] += 1.0 / ant.best_perplexity

    # Retourne la meilleure séquence globale et sa perplexité
    return global_best_sequence, global_best_perplexity
    
# Définition de la fonction pour optimiser les séquences par lots
def optimize_sequences(batch_size=5):
    """Optimise les séquences en utilisant ACO avec SA."""
    sample_submission = pd.read_csv("/kaggle/input/dataset/sample_submission.csv")  # Charge les données
    results = []  # Pour stocker les résultats
    perplexity_scores = []  # Pour stocker les scores de perplexité

    # Divise les données en lots
    num_batches = (len(sample_submission) + batch_size - 1) // batch_size

    for batch_idx in range(num_batches):
        batch_start = batch_idx * batch_size
        batch_end = min((batch_idx + 1) * batch_size, len(sample_submission))
        batch_data = sample_submission.iloc[batch_start:batch_end]

        print(f"\nProcessing batch {batch_idx + 1}/{num_batches}")

        try:
            # Initialisation de l'évaluateur de perplexité
            scorer = PerplexityCalculator(
                model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2',
                load_in_8bit=False
            )

            # Traitement de chaque ligne dans le lot
            for idx, row in batch_data.iterrows():
                optimized_text, perplexity = aco_with_sa(
                    row['text'],
                    scorer,
                    n_ants=45,  
                    max_iterations=20,  
                    alpha=1.0,
                    beta=2.0,
                    evaporation_rate=0.5,
                    pheromone_init=1.0
                )
                print(f"ID: {row['id']}, Final Perplexity: {perplexity}")
                results.append({'id': row['id'], 'text': optimized_text})  # Ajoute le résultat
                perplexity_scores.append(perplexity)  # Ajoute la perplexité

            # Sauvegarde intermédiaire des résultats
            temp_df = pd.DataFrame(results)
            temp_df.to_csv(f"submission_temp_batch_{batch_idx+1}.csv", index=False)

        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Gestion des erreurs : sauvegarde la séquence originale en cas d'échec
            for idx, row in batch_data.iterrows():
                results.append({'id': row['id'], 'text': row['text']})
                perplexity_scores.append(float('inf'))  # Utilise l'infini pour les séquences échouées

    # Sauvegarde finale des résultats
    submission = pd.DataFrame(results)
    submission.to_csv("/kaggle/working/submission.csv", index=False)

    # Calcul et affichage des métriques
    perplexity_scores = np.array(perplexity_scores)
    lowest_perplexity = np.min(perplexity_scores)  # Perplexité minimale
    average_perplexity = np.mean(perplexity_scores)  # Perplexité moyenne

    print("\nFinal Results:")
    print(f"Lowest Perplexity: {lowest_perplexity}")
    print(f"Average Perplexity: {average_perplexity}")

    return submission  # Retourne les résultats finaux

# Point d'entrée du programme
if __name__ == "__main__":
    print("Starting optimization...")
    final_submission = optimize_sequences()  # Lance l'optimisation
    print("Optimization completed!")

Starting optimization...

Processing batch 1/2


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration 1/20, Best perplexity: 2349.3494739881
Iteration 2/20, Best perplexity: 440.42301065338194
Iteration 3/20, Best perplexity: 440.42301065338194
Iteration 4/20, Best perplexity: 404.5354733638809
Iteration 5/20, Best perplexity: 361.2114152344687
Iteration 6/20, Best perplexity: 361.2114152344687
Iteration 7/20, Best perplexity: 361.2114152344687
Iteration 8/20, Best perplexity: 361.2114152344687
Iteration 9/20, Best perplexity: 361.2114152344687
Iteration 10/20, Best perplexity: 361.2114152344687
Iteration 11/20, Best perplexity: 361.2114152344687
Iteration 12/20, Best perplexity: 361.2114152344687
Iteration 13/20, Best perplexity: 361.2114152344687
Iteration 14/20, Best perplexity: 361.2114152344687
Iteration 15/20, Best perplexity: 361.2114152344687
Iteration 16/20, Best perplexity: 361.2114152344687
Iteration 17/20, Best perplexity: 336.45752274778374
Iteration 18/20, Best perplexity: 336.45752274778374
Iteration 19/20, Best perplexity: 336.45752274778374
Iteration 20/20, B

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration 1/20, Best perplexity: 346.46759300228723
Iteration 2/20, Best perplexity: 346.46759300228723
Iteration 3/20, Best perplexity: 346.46759300228723
Iteration 4/20, Best perplexity: 346.46759300228723
Iteration 5/20, Best perplexity: 346.46759300228723
Iteration 6/20, Best perplexity: 346.46759300228723
Iteration 7/20, Best perplexity: 346.46759300228723
Iteration 8/20, Best perplexity: 346.46759300228723
Iteration 9/20, Best perplexity: 346.46759300228723
Iteration 10/20, Best perplexity: 346.46759300228723
Iteration 11/20, Best perplexity: 346.46759300228723
Iteration 12/20, Best perplexity: 346.46759300228723
Iteration 13/20, Best perplexity: 346.46759300228723
Iteration 14/20, Best perplexity: 346.46759300228723
Iteration 15/20, Best perplexity: 346.46759300228723
Iteration 16/20, Best perplexity: 346.46759300228723
Iteration 17/20, Best perplexity: 346.46759300228723
Iteration 18/20, Best perplexity: 346.46759300228723
Iteration 19/20, Best perplexity: 346.46759300228723
It