# LMSYS - Chatbot Arena Human Preference Predictions

<div align="center">
    <img src="https://i.ibb.co/wJMF5HL/lmsys.png">
</div>

In this Kaggle challenge, our team will experiment with three different methods to determine which approach yields better results:

1. **Method 1: Using Pretrained Gemma 2 Model**
   - Utilize the pretrained Gemma 2 model to leverage its capabilities for the task at hand.

2. **Method 2: Using DeBERTa + TF-IDF + Word2Vec + Length + LightGBM**
   - Apply a combination of text vectorization techniques, including DeBERTa, TF-IDF, Word2Vec, and length features, along with LightGBM for model training and prediction.

3. **Method 3: Combining Gemma and DeBERTa + TF-IDF + Word2Vec + LightGBM**
   - Combine the predictions from the Gemma model with those from the DeBERTa + TF-IDF + Word2Vec + LightGBM model to create a final prediction.

By comparing these methods, we aim to identify which approach performs best for this challenge.

# 📖 | Meta Data 

The competition dataset comprises user interactions from the ChatBot Arena. In each interaction, a judge presents one or more prompts to two different large language models and then indicates which model provided the more satisfactory response. The training data contains `55,000` rows, with an expected `25,000` rows in the test set.

## Files

### `train.csv`
- `id`: Unique identifier for each row.
- `model_[a/b]`: Model identity, present in train.csv but not in test.csv.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.
- `winner_model_[a/b]`, `winner_tie`: Binary columns indicating the judge's selection (ground truth target).

### `test.csv`
- `id`: Unique identifier for each row.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.

# Gemma 2 + Deberta + TF-IDF + Word2Vec + Length

In [None]:
# !pip install -q -U bitsandbytes --no-index --find-links ../input/llm-detect-pip/
# !pip install -q -U transformers --no-index --find-links ../input/llm-detect-pip/
# !pip install -q -U tokenizers --no-index --find-links ../input/llm-detect-pip/
# !pip install -q -U peft --no-index --find-links ../input/llm-detect-pip/

In [None]:
# Install/upgrade the inference dependencies offline: --no-index disables PyPI
# and --find-links points pip at the wheel files attached as a Kaggle dataset.
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

# 📚 | Import Libraries 

In [None]:
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
import sklearn
from threading import Thread
import gc
import os
import io
import json
import random
import pickle
import zipfile
import datetime
import time

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from IPython.display import display
import torch.nn.functional as F
import tokenizers

In [None]:
import os
import regex as re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, accuracy_score, roc_auc_score, log_loss
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, save_npz, load_npz, hstack

import lightgbm as lgb

from tqdm import tqdm

import gensim
import itertools
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

from transformers import DebertaV2Tokenizer, DebertaV2Model
import torch

import joblib
import unicodedata
import re

import time

from sklearn.metrics.pairwise import cosine_similarity



# Gemma Model Part

In [None]:
@dataclass
class Config:
    # Settings for the Gemma 2 inference stage.
    # NOTE: none of the attributes below has a type annotation, so @dataclass
    # registers no fields; they are plain class attributes and cannot be
    # overridden through Config(...) keyword arguments.
    # NOTE(review): a later cell reads cfg.lora_dir, which is not defined here.
    gemma_dir = '/kaggle/input/merged-v157-8bit'  # directory passed to from_pretrained for tokenizer and model
    max_length = 3072  # token budget per tokenized example
    batch_size = 2  # rows per forward pass in inference()
    device = torch.device("cuda")    
    tta = True  # test time augmentation. <prompt>-<model-b's response>-<model-a's response>
    spread_max_length = False  # whether to apply max_length//3 on each input or max_length on the concatenated input

# Single shared settings object read by the cells below.
cfg = Config()

# 📄 Load and Process Test Data

In [None]:
# Load the competition test set (text columns are decoded with json.loads in the next cell).
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')


In [None]:
import json

def process_text(text):
    """Decode one JSON-encoded cell and return the resulting Python object."""
    decoded = json.loads(text)
    return decoded

# Replace each JSON-encoded text column with its decoded Python value
# (tokenize() later indexes these values per conversation round).
test.loc[:, 'prompt'] = test['prompt'].apply(process_text)
test.loc[:, 'response_a'] = test['response_a'].apply(process_text)
test.loc[:, 'response_b'] = test['response_b'].apply(process_text)

In [None]:
# Quick visual check of the decoded columns.
display(test.head(5))

# ✂️ Tokenize Function For Gemma

In [None]:
# def tokenize(tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length):
#     # Handle different formats for different tokenizers
#     if isinstance(tokenizer, GemmaTokenizerFast):
#         prompt = ["<prompt>: " + p for p in prompt]
#         response_a = ["\n\n<response_a>: " + r_a for r_a in response_a]
#         response_b = ["\n\n<response_b>: " + r_b for r_b in response_b]
#     else:
#         prompt = ["User prompt: " + p for p in prompt]
#         response_a = ["\n\nModel A :\n" + r_a for r_a in response_a]
#         response_b = ["\n\n--------\n\nModel B:\n" + r_b for r_b in response_b]
    
#     # Tokenize with spread max length
#     if spread_max_length:
#         prompt = tokenizer(prompt, max_length=max_length//3, truncation=True, padding=False).input_ids
#         response_a = tokenizer(response_a, max_length=max_length//3, truncation=True, padding=False).input_ids
#         response_b = tokenizer(response_b, max_length=max_length//3, truncation=True, padding=False).input_ids
#         input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
#         attention_mask = [[1] * len(i) for i in input_ids]
#     # Tokenize without spread max length
#     else:
#         text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
#         tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=False)
#         input_ids = tokenized.input_ids
#         attention_mask = tokenized.attention_mask
    
#     return input_ids, attention_mask

In [None]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Render each conversation as turn-tagged text and tokenize it.

    Args:
        tokenizer: callable tokenizer returning ``input_ids`` / ``attention_mask``.
        prompt, response_a, response_b: parallel iterables; each element is an
            indexable sequence with one entry per conversation round.
        max_length: token budget; also passed to the final truncating call.
        spread_max_length: accepted for signature compatibility; not used here.

    Returns:
        ``(input_ids, attention_mask)`` as produced by the tokenizer for the
        whole batch (no padding).
    """
    conversations = []
    for turns_p, turns_a, turns_b in zip(prompt, response_a, response_b):
        rounds = []
        for idx, user_turn in enumerate(turns_p):
            rounds.append(
                f"<start_of_turn>prompt\n{user_turn}<end_of_turn>\n"
                f"<start_of_turn>response_a\n{turns_a[idx]}<end_of_turn>\n"
                f"<start_of_turn>response_b\n{turns_b[idx]}<end_of_turn>"
            )

        # Drop the oldest rounds one at a time until the remainder fits in the
        # budget; if nothing fits, only the last round is kept and the final
        # tokenizer call truncates it.
        candidate = "\n".join(rounds)
        first_kept = 0
        while first_kept < len(rounds):
            candidate = "\n".join(rounds[first_kept:])
            if len(tokenizer(candidate)["input_ids"]) < max_length:
                break
            first_kept += 1
        conversations.append(candidate)

    encoded = tokenizer(conversations, max_length=max_length, truncation=True, padding=False)
    return encoded.input_ids, encoded.attention_mask

# 🧠 Tokenizer Setup and Data Preparation

In [None]:
%%time

# Gemma tokenizer loaded from the model directory; add_eos_token / padding_side
# are set as attributes before any tokenization happens.
tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

# Original ordering: prompt, response_a, response_b.
data = pd.DataFrame()
data["id"] = test["id"]
data["input_ids"], data["attention_mask"] = tokenize(tokenizer, test["prompt"], test["response_a"], test["response_b"])
# Token count per row; used later to sort batches by length.
data["length"] = data["input_ids"].apply(len)

# Augmented copy for test-time augmentation (used when cfg.tta is set).
aug_data = pd.DataFrame()
aug_data["id"] = test["id"]
# swap response_a & response_b
aug_data['input_ids'], aug_data['attention_mask'] = tokenize(tokenizer, test["prompt"], test["response_b"], test["response_a"])
aug_data["length"] = aug_data["input_ids"].apply(len)

In [None]:
# Decode the first tokenized row back to text to eyeball the prompt template.
print(tokenizer.decode(data["input_ids"][0]))

In [None]:
# Same check for the swapped (TTA) variant of the first row.
print(tokenizer.decode(aug_data["input_ids"][0]))

# 🤖 | Load model

In [None]:
# Two copies of the same checkpoint are loaded, one per GPU, so that the two
# halves of the data can be scored concurrently by separate threads.

# Load base model on GPU 0
device_0 = torch.device('cuda:0')
model_0 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    device_map=device_0,
    use_cache=False,
)

# Load base model on GPU 1
device_1 = torch.device('cuda:1')
model_1 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    device_map=device_1,
    use_cache=False,
)

In [None]:
# Attach LoRA adapters only when an adapter directory is configured.
# Config defines no `lora_dir`, so the previous unconditional `cfg.lora_dir`
# access raised AttributeError. The checkpoint directory name ("merged-...")
# suggests the adapter weights are already merged -- TODO confirm.
lora_dir = getattr(cfg, "lora_dir", None)
if lora_dir:
    model_0 = PeftModel.from_pretrained(model_0, lora_dir)
    model_1 = PeftModel.from_pretrained(model_1, lora_dir)

# 🔍 Inference Function

In [None]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Score every row of ``df`` with ``model`` and store the probabilities on ``df``.

    Rows are processed in consecutive batches of ``batch_size``; each batch is
    padded to its longest member using the module-level ``tokenizer``.

    Args:
        df: frame with ``input_ids`` and ``attention_mask`` columns.
        model: classifier whose output exposes ``logits`` with three classes.
        device: device the padded batch is moved to before the forward pass.
        batch_size: rows per forward pass.
        max_length: accepted for signature compatibility; not used here.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns added (mutated in place).
    """
    scores = {"winner_model_a": [], "winner_model_b": [], "winner_tie": []}

    for lo in range(0, len(df), batch_size):
        # iloc clamps the upper bound, so the last batch may be shorter.
        chunk = df.iloc[lo:lo + batch_size]
        batch = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": chunk["input_ids"].to_list(),
                "attention_mask": chunk["attention_mask"].to_list(),
            },
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**batch.to(device)).logits
        probs = logits.softmax(-1).cpu()

        # Class index i of the softmax output feeds the i-th score column.
        for class_idx, column in enumerate(scores):
            scores[column].extend(probs[:, class_idx].tolist())

    for column, values in scores.items():
        df[column] = values

    return df

# ⚡ Perform Inference Using Threading

In [None]:
st = time.time()

# sort by input length to fully leverage dynamic padding
data = data.sort_values("length", ascending=False)
# the total #tokens in sub_1 and sub_2 should be more or less the same
sub_1 = data.iloc[0::2].copy()
sub_2 = data.iloc[1::2].copy()

# One worker thread per model/GPU pair; leaving the `with` block waits for both.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

# NOTE: result_df rows are sub_1 followed by sub_2 (length-sorted halves), not
# the original file order; the original index labels are preserved.
result_df = pd.concat(list(results), axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

In [None]:
st = time.time()

if cfg.tta:
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    # One worker thread per model/GPU pair; leaving the `with` block waits for both.
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # aug_data is sorted independently of the first pass, so its row order is
    # not guaranteed to match result_df (token counts can differ after the
    # swap, and ties may be ordered differently). Realign on the shared index
    # labels before averaging so each row is averaged with its own TTA score.
    tta_result_df = tta_result_df.loc[result_df.index]
    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

print(f"elapsed time: {time.time() - st}")

In [None]:
# Overwrite the per-pass probabilities with the final ones (averaged with the
# TTA pass when cfg.tta is set); `proba` rows are in result_df row order.
result_df.loc[:, "winner_model_a"] = proba[:, 0]
result_df.loc[:, "winner_model_b"] = proba[:, 1]
result_df.loc[:, "winner_tie"] = proba[:, 2]
gemma_result_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
# gemma_result_df.to_csv('submission.csv', index=False)
display(gemma_result_df)

# Save gemma result

In [None]:
TARGETS = ['winner_model_a', 'winner_model_b', 'winner_tie']

# result_df is ordered by token length (two interleaved halves), but these
# predictions are later blended row-by-row with the LightGBM predictions and
# paired with test["id"], both of which follow the test.csv row order.
# The index labels still carry the original row positions, so sorting on the
# index restores file order and keeps every probability with its own id.
gemma_preds = result_df.sort_index()[TARGETS].values

# 🤖 | Deberta + TF-IDF + Word2Vec + Length Model Part

In [None]:
# NOTE: the next cell reloads the same files and reassigns every name set
# here, so only the `train.head(5)` preview of this cell has a lasting effect.
train = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
test = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv")

# When True, the text vectorizers are fitted on train + test text instead of train only.
vectorize_on_train_and_test = False

#quick_test for training on small part of train data (and not using bunch of GPU on submit)
#(if this is on - saved models won't be fully trained)
quick_test = True
quick_test_items = 800

#automatically disable quick_test if we detect actual test data... (assures full training when scoring)
if (len(test)) > 3:quick_test = False
    
if quick_test: train = train.head(quick_test_items)
    
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']

columns_to_vectorize = ["prompt", "response_a", "response_b"]

train.head(5)

In [None]:
# Reload the raw CSVs; this rebinds `test`, discarding the JSON-decoded copy
# used by the Gemma stage.
train = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
test = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv")
# When True, the text vectorizers are fitted on train + test text instead of train only.
vectorize_on_train_and_test = False
#quick_test for training on small part of train data (and not using bunch of GPU on submit)
#(if this is on - saved models won't be fully trained)
quick_test = True
quick_test_items = 1000
#automatically disable quick_test if we detect actual test data... (assures full training when scoring)
if (len(test)) > 3:quick_test = False
if quick_test: train = train.head(quick_test_items)

def process(input_str):
    """Flatten a raw list-like string into one space-joined string.

    Strips leading/trailing square brackets, splits on the ``","`` separator
    and removes surrounding double quotes from each piece. This is plain
    string surgery, not JSON decoding: escape sequences are left as written.
    """
    inner = input_str.strip('[]')
    return ' '.join(piece.strip('"') for piece in inner.split('","'))
# Flatten every text column of both frames to a plain string.
test.loc[:, 'prompt'] = test['prompt'].apply(process)
test.loc[:, 'response_a'] = test['response_a'].apply(process)
test.loc[:, 'response_b'] = test['response_b'].apply(process)
train.loc[:, 'prompt'] = train['prompt'].apply(process)
train.loc[:, 'response_a'] = train['response_a'].apply(process)
train.loc[:, 'response_b'] = train['response_b'].apply(process)

target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
columns_to_vectorize = ["prompt", "response_a", "response_b"]
# Collapse the three target columns into the name of the column holding the
# row maximum, then integer-encode that name.
train['label'] = train[target_columns].idxmax(axis=1) 
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['label'])
# From here on `train` holds only the text columns and the integer label;
# the original target columns are dropped.
train = train[columns_to_vectorize + ['label']]
train.head(3)

# Deberta: Function to extract text features

In [None]:
deberta_path = "/kaggle/input/debertav3base"

# Load DeBERTa tokenizer and model
# NOTE: this rebinds the global `tokenizer` that inference() reads, so the
# Gemma inference cells must have finished before this cell runs.
tokenizer = DebertaV2Tokenizer.from_pretrained(deberta_path)
transformer_model = DebertaV2Model.from_pretrained(deberta_path).cuda()

# Function to extract features (got best performance of batch_size 2-3 range)
def batch_extract_transformer_features(texts, tokenizer, model, batch_size=2, max_length=1024):
    """Embed ``texts`` with ``model`` and return one vector per text.

    Each batch is tokenized (truncated to ``max_length``, padded to the longest
    member), moved to CUDA and run under autocast; the hidden state at
    sequence position 0 is kept as the text's feature vector.

    Args:
        texts: sliceable sequence of strings.
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        model: module whose output exposes ``last_hidden_state``.
        batch_size: number of texts per forward pass.
        max_length: truncation length in tokens.

    Returns:
        2-D array with one row per text. For empty ``texts`` an array with
        zero rows is returned (previously this raised ZeroDivisionError).
    """
    model.eval()  # Set model to evaluation mode

    if len(texts) == 0:
        # Nothing to embed: skip the forward passes and the ratio division.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        print("Ratio of texts over max_length tokens:", 0.0)
        return np.zeros((0, hidden_size), dtype=np.float32)

    total_texts = 0
    total_over_max_length = 0
    features = []

    # Use autocast for mixed precision; gradients are never needed here.
    with torch.no_grad(), torch.cuda.amp.autocast():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', max_length=max_length, truncation=True, padding=True).to('cuda')

            input_ids = inputs['input_ids']
            total_texts += input_ids.shape[0]
            # A row counts as "over max_length" when the batch is max_length
            # wide and the row contains no padding token.
            if input_ids.shape[1] == max_length:
                unpadded_rows = (input_ids != tokenizer.pad_token_id).all(dim=1)
                total_over_max_length += int(unpadded_rows.sum())

            outputs = model(**inputs)
            features.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

            # clear cache / print status "."
            if i % (batch_size * 10) == 0:
                torch.cuda.empty_cache()
                print(".", end="")

    print ("Ratio of texts over max_length tokens:", total_over_max_length / total_texts)
    return np.vstack(features)

# Deberta: Extract features for prompt and both responses
* Also adding difference between two responses to array (seemed to help)

In [None]:
def get_transformer_vectors(df):
    """Build the transformer feature matrix for ``df``.

    For every column in ``columns_to_vectorize`` (prompt, response_a,
    response_b) the texts are embedded with the module-level ``tokenizer`` and
    ``transformer_model``. Each row of the result is the concatenation of the
    three embeddings, the response_a - response_b difference and three cosine
    similarities (prompt/response_a, prompt/response_b, response_a/response_b).
    """
    per_column = []
    for column in tqdm(columns_to_vectorize, desc="Vectorizing Columns"):
        print("Vectorizing", column)
        texts = df[column].tolist()
        per_column.append(batch_extract_transformer_features(texts, tokenizer, transformer_model))

    # (n_columns, n_rows, dim) -> (n_rows, n_columns, dim)
    stacked = np.transpose(np.array(per_column), (1, 0, 2))

    # Append the difference between the two response embeddings as a 4th slot.
    response_gap = (stacked[:, 1, :] - stacked[:, 2, :])[:, np.newaxis, :]
    stacked = np.concatenate((stacked, response_gap), axis=1)

    def _cosine(u, v):
        # cosine_similarity works on 2-D inputs; take the single scalar entry.
        return cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0][0]

    # Slots per row: 0 = prompt, 1 = response_a, 2 = response_b.
    pair_similarities = np.array([
        [_cosine(row[0], row[1]), _cosine(row[0], row[2]), _cosine(row[1], row[2])]
        for row in stacked
    ])

    # Flatten the per-row slots, then append the similarity columns.
    flattened = stacked.reshape(len(stacked), -1)
    return np.concatenate((flattened, pair_similarities), axis=1)

# Embed the training set and report how long it took and the matrix shape.
start_time = time.time()
transformer_train_vectors = get_transformer_vectors(train)
print(time.time() - start_time, "seconds")
transformer_train_vectors.shape

# Define text for Word2Vec and TF-IDF Vectorizer training

In [None]:
# One document per row: prompt and both responses joined with spaces.
train_text = train[['prompt', 'response_a', 'response_b']].astype(str).apply(lambda x: ' '.join(x), axis=1)
test_text = test[['prompt', 'response_a', 'response_b']].astype(str).apply(lambda x: ' '.join(x), axis=1)

# Corpus the Word2Vec model and the TF-IDF vectorizers are fitted on.
if vectorize_on_train_and_test:
    vector_fit_text = pd.concat([train_text, test_text], axis=0).reset_index(drop=True)
else:
    vector_fit_text = train_text

# Word2Vec: Initialize / train on our train data...
* Here we train Word2Vec to capture word relationships on our text columns....

In [None]:
print("Training Word2Vec...")
# Tokenize every document of the fit corpus with gensim's simple_preprocess.
train_tokens = vector_fit_text.map(simple_preprocess)

#performance vector_size much better at 60 than 150
# NOTE: `vectors` is the trained Word2Vec model itself (not an array); later
# cells pass it around as the model.
vectors = Word2Vec(train_tokens, vector_size=60, window=3, seed=1, workers=4)
vectors.save("word2vec_trained.model")

print("Done.")

# Word2Vec: Function to return average, min and max vectors for a text body
* Word2Vec provides vector values for each word - but we need a single vector that represents the entire text
* We do this by taking the average vector for all words in the text
* We also can return the minimum / maximum values across all vector components

In [None]:
def get_w2v_doc_vector(model, tokens, mode = "mean"):
    """Pool per-word Word2Vec vectors into one vector per document.

    Args:
        model: trained Word2Vec-style model exposing ``wv`` (supports ``in``
            and item lookup) and ``vector_size``.
        tokens: iterable of token lists, one per document.
        mode: ``"mean"``, ``"min"`` or ``"max"`` -- the element-wise reduction
            applied across a document's word vectors.

    Returns:
        2-D array of shape ``(n_documents, model.vector_size)``. Documents with
        no in-vocabulary word get a zero vector; any document vector containing
        NaN is replaced by the mean of the NaN-free document vectors.

    Raises:
        ValueError: if ``mode`` is not one of the supported reductions
            (previously the function silently produced ``None`` entries).
    """
    reducers = {"mean": np.mean, "min": np.min, "max": np.max}
    if mode not in reducers:
        raise ValueError(f"mode must be one of {sorted(reducers)}, got {mode!r}")
    reduce_fn = reducers[mode]

    # Use the model that was passed in; the previous code read the size from
    # the global `vectors`, which only worked when both were the same object.
    dim = model.vector_size

    def doc_vector(words):
        found = [model.wv[w] for w in words if w in model.wv]
        if not found:
            return np.zeros(dim)
        return reduce_fn(found, axis=0)

    doc_vectors = [doc_vector(words) for words in tokens]
    if not doc_vectors:
        # np.stack rejects an empty list; return an empty, correctly shaped matrix.
        return np.empty((0, dim))

    matrix = np.stack(doc_vectors)

    # Replace any NaN-containing row with the average of the clean rows.
    has_nan = np.isnan(matrix).any(axis=1)
    if has_nan.any():
        clean = matrix[~has_nan]
        default_vector = clean.mean(axis=0) if len(clean) else np.zeros(dim)
        matrix[has_nan] = default_vector

    return matrix

# Word2Vec: Vectorize prompt and both responses 
* Word2Vec is used to generate mean, min and max vectors for the prompt and both responses
* We additionally generate a column with a vector that's the difference between the mean vectors of the two responses
* Generating vectors with the differences between the prompt and responses didn't help score 

In [None]:
def get_word2vec_vectors(df):
    """Build the Word2Vec feature matrix for ``df``.

    Every column in ``columns_to_vectorize`` contributes three pooled vectors
    (mean, min, max) computed with the module-level Word2Vec model ``vectors``.
    A tenth vector, mean(response_a) - mean(response_b), is appended and the
    ten vectors of each row are flattened into a single feature row.
    """
    pooled = []
    for column in tqdm(columns_to_vectorize, desc="Vectorizing Columns"):
        print("Vectorizing", column)
        column_tokens = df[column].map(simple_preprocess)
        for pooling in ("mean", "min", "max"):
            pooled.append(get_w2v_doc_vector(vectors, column_tokens, mode=pooling))

    # (n_slots, n_rows, dim) -> (n_rows, n_slots, dim)
    stacked = np.transpose(np.array(pooled), (1, 0, 2))

    # Slot 3 is the response_a mean and slot 6 the response_b mean
    # (three slots per column, columns in columns_to_vectorize order).
    mean_gap = (stacked[:, 3, :] - stacked[:, 6, :])[:, np.newaxis, :]
    stacked = np.concatenate((stacked, mean_gap), axis=1)

    # One flat feature row per input row.
    return stacked.reshape(len(stacked), -1)

# Word2Vec features for the training rows.
word2vec_train_vectors = get_word2vec_vectors(train)

# TF-IDF: Fit vectorizer on prompts and both responses

In [None]:
# Module-level tokenizer for the word-level TF-IDF vectorizer (a named function,
# presumably so the fitted vectorizer can be dumped with joblib -- confirm).
#produces better results than using "word" for analyzer
def custom_tokenizer(text):
    """Return every maximal run of word characters found in ``text``."""
    token_pattern = r'[^\W]+'
    return re.findall(token_pattern, text)

#word-level vectorizer
# Uses custom_tokenizer instead of the built-in token pattern (token_pattern=None
# disables it); keeps at most 300 features among 1- to 5-grams seen in >= 4 documents.
tfidf_word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    tokenizer=custom_tokenizer,
    token_pattern=None,
    strip_accents='unicode',
    min_df=4,
    max_features=300
)

#char-level vectorizer
# Character 1- to 5-grams, at most 1000 features, seen in >= 4 documents.
tfidf_char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), max_features=1000, min_df=4)

def batch_process(texts, batch_size):
    """Yield consecutive slices of ``texts`` holding at most ``batch_size`` items."""
    for i in range(0, len(texts), batch_size):
        yield texts[i:i + batch_size]

# Kept for backward compatibility with cells that may still read it.
batch_size = 1000

# Fit each vectorizer ONCE on the whole corpus. TfidfVectorizer.fit() is not
# incremental: every call discards the previous vocabulary and IDF weights, so
# the former per-batch loop left both vectorizers fitted on the last batch only.
for vectorizer in tqdm([tfidf_word_vectorizer, tfidf_char_vectorizer], desc="Fitting TF-IDF"):
    vectorizer.fit(vector_fit_text)

joblib.dump(tfidf_word_vectorizer, "tfidf_word_vectorizer.pkl")
joblib.dump(tfidf_char_vectorizer, "tfidf_char_vectorizer.pkl")

# TF-IDF: Vectorize text columns - and combine in hstack

In [None]:
def get_tfidf_vectors(df):
    """Return the horizontally stacked TF-IDF features of ``df``.

    For each column in ``columns_to_vectorize`` the word-level block comes
    first, followed by the char-level block, using the fitted module-level
    vectorizers.
    """
    blocks = [
        vectorizer.transform(df[column])
        for column in columns_to_vectorize
        for vectorizer in (tfidf_word_vectorizer, tfidf_char_vectorizer)
    ]
    return hstack(blocks)

# TF-IDF features for the training rows.
tfidf_train_vectors = get_tfidf_vectors(train)

# Get Length Features
* Adapted from https://www.kaggle.com/code/currypurin/lmsys-lengthfeature-and-tf-idf-v2

In [None]:
def has_none(vals) -> int:
    """Return 1 if any element of ``vals`` is ``None``, otherwise 0.

    Some responses contain null entries, which are probably useful for
    prediction. When ``vals`` is a string, iteration is per character, so the
    result is always 0.
    """
    for val in vals:
        if val is None:
            return 1
    return 0


def str_length(vals) -> int:
    """Return the total length of all string elements in ``vals``.

    Non-string elements are skipped. When ``vals`` is itself a string, each
    character counts once, so the result equals ``len(vals)``.
    """
    return sum(len(val) for val in vals if isinstance(val, str))


def get_length_features(data: pd.DataFrame):
    """Return an ``(n_rows, 8)`` array of length-based features.

    Columns, in order: length of response_a, length of response_b, their
    difference, their mean, their ratio (a / b), has_none(response_a),
    has_none(response_b) and the difference of the two has_none flags.
    The ratio is not guarded against a zero-length response_b.
    """
    len_a = data["response_a"].apply(str_length)
    len_b = data["response_b"].apply(str_length)
    none_a = data["response_a"].apply(has_none)
    none_b = data["response_b"].apply(has_none)

    feature_rows = [
        len_a,
        len_b,
        len_a - len_b,
        (len_a + len_b) / 2,
        len_a / len_b,
        none_a,
        none_b,
        none_a - none_b,
    ]

    # (n_features, n_rows) -> (n_rows, n_features)
    stacked = np.array(feature_rows).reshape(len(feature_rows), -1)
    return np.transpose(stacked, (1, 0))

# Length features for the training rows.
train_length_features = get_length_features(train)


# Deberta + Word2Vec + TF-IDF + Length Features: Assemble vectors!
* Also saving out to disk...

In [None]:
# Convert the dense feature blocks to CSR so they can be saved with save_npz
# and stacked next to the sparse TF-IDF block.
transformer_train_vectors_csr = csr_matrix(transformer_train_vectors)
word2vec_train_vectors_csr = csr_matrix(word2vec_train_vectors)
train_length_features_csr = csr_matrix(train_length_features)

#save all the components
save_npz(os.path.join(".", 'transformer_train_vectors.npz'), transformer_train_vectors_csr)
save_npz(os.path.join(".", 'word2vec_train_vectors.npz'), word2vec_train_vectors_csr)
save_npz(os.path.join(".", 'train_length_features.npz'), train_length_features_csr)

# Stack the blocks in the same order and form as the test matrix (TF-IDF,
# transformer, Word2Vec, length). The CSR copy of the transformer block is
# used (the dense array was stacked before), and the result is requested as
# CSR so that it supports row indexing.
combined_train_vectors = hstack(
    [
        tfidf_train_vectors,
        transformer_train_vectors_csr,
        word2vec_train_vectors_csr,
        train_length_features_csr,
    ],
    format="csr",
)

In [None]:
# tfidf_train_vectors_csr = csr_matrix(tfidf_train_vectors)
# train_length_features_csr = csr_matrix(train_length_features)
# combined_train_vectors = hstack([tfidf_train_vectors,  train_length_features_csr]) 
# print(combined_train_vectors.shape)

# Vectorize our test data

In [None]:
# Build the test feature matrix with the same extractors, fitted on training
# data, and the same block order as the training matrix.
print("Vectorizing test text...")

#our transformer
transformer_test_vectors = get_transformer_vectors(test)

#get Word2Vec
word2vec_test_vectors = get_word2vec_vectors(test)

#get TF-IDF
tfidf_test_vectors = get_tfidf_vectors(test)

#Length features
test_length_features = get_length_features(test)

#combine them!
transformer_test_vectors_csr = csr_matrix(transformer_test_vectors)  # Convert transformer vectors to a CSR matrix
word2vec_test_vectors_csr = csr_matrix(word2vec_test_vectors)  
test_length_features_csr = csr_matrix(test_length_features)
combined_test_vectors = hstack([tfidf_test_vectors, transformer_test_vectors_csr, word2vec_test_vectors_csr, test_length_features_csr]) 

In [None]:
# print("Vectorizing test text...")
# #get TF-IDF
# tfidf_test_vectors = get_tfidf_vectors(test)
# #Length features
# test_length_features = get_length_features(test)
# #combine them!
# tfidf_test_vectors_csr = csr_matrix(tfidf_test_vectors)
# test_length_features_csr = csr_matrix(test_length_features)
# combined_test_vectors = hstack([tfidf_test_vectors_csr, test_length_features_csr]) 
# print("Done!")

# Train LightGBM

In [None]:
model_filename = 'lightgbm_model.pkl'

max_estimators = 1000
early_stopping_limit = 50

# Data preparation
X = combined_train_vectors
# `train['label']` was integer-encoded when the frame was prepared, and the
# original target columns are no longer part of `train`. The labels are used
# as they are: the former re-encoding step called fit_transform on `y`, a
# name that is never defined, and raised NameError.
y_encoded = train['label'].values

# Split dataset: hold out 5% of the rows for early stopping and evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.05, random_state=42)

# LightGBM parameters
params = {
    'n_estimators': max_estimators,
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'random_state': 42,
    'learning_rate': 0.03,
    'verbose': -1  # keep logs quiet
}

# Create the model
model = lgb.LGBMClassifier(**params)

def callback(env):
    """Print the first evaluation metric every 10 boosting iterations."""
    if env.iteration % 10 == 0: print ("Iteration:", env.iteration, "\tLog Loss:", env.evaluation_result_list[0][2])

# Train with early stopping on the held-out split.
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_limit), callback]
)

# Save the model to disk
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# Evaluate on the held-out split.
y_pred_proba = model.predict_proba(X_test)

logloss = log_loss(y_test, y_pred_proba)
print(f"\nLog Loss: {logloss}")

y_pred = np.argmax(y_pred_proba, axis=1)  # Convert probabilities to class labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from scipy.sparse import csr_matrix
# max_estimators = 1000
# early_stopping_limit = 50

# # Data preparation
# X = combined_train_vectors
# y_encoded = train['label'].values

# # LightGBM parameters
# params = {
#     'n_estimators': max_estimators,
#     'max_depth': 4,
#     'subsample': 0.8,
#     'colsample_bytree': 0.8,
#     'objective': 'multiclass',
#     'num_class': 3,
#     'metric': 'multi_logloss',
#     'random_state': 42,
#     'learning_rate': 0.03,
#     'verbose': -1  # keep logs quiet
# }

# # Create the model
# model = lgb.LGBMClassifier(**params)

# # 5-fold cross-validation
# stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# logloss_scores = []
# accuracy_scores = []
# test_pred_list = []

# for fold, (train_indices, val_indices) in enumerate(stratified_k_fold.split(X, y_encoded)):
#     print(f"\nFold {fold + 1}")
#     X_train_fold, X_val_fold = X[train_indices], X[val_indices]
#     y_train_fold, y_val_fold = y_encoded[train_indices], y_encoded[val_indices]

#     def callback(env):
#         if env.iteration % 10 == 0: print ("Iteration:", env.iteration, "\tLog Loss:", env.evaluation_result_list[0][2])

#     model.fit(
#         X_train_fold, y_train_fold,
#         eval_set=[(X_val_fold, y_val_fold)],
#         eval_metric='multi_logloss',
#         callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_limit), callback]
#     )

#     y_pred_proba_fold = model.predict_proba(X_val_fold)
#     logloss_fold = log_loss(y_val_fold, y_pred_proba_fold)
#     logloss_scores.append(logloss_fold)
#     print(f"Log Loss: {logloss_fold}")
    
#     y_pred_fold = np.argmax(y_pred_proba_fold, axis=1)
#     accuracy_fold = accuracy_score(y_val_fold, y_pred_fold)
#     accuracy_scores.append(accuracy_fold)
#     print(f"Accuracy: {accuracy_fold}")

#     test_pred_list.append(model.predict_proba(combined_test_vectors[-test.shape[0]:]))

#     # Save the model to disk
#     model_filename = 'lightgbm_model'
#     model_filename_ = model_filename + f"_fold_{fold + 1}.pkl"
#     joblib.dump(model, model_filename_)
#     print(f"\nModel saved to {model_filename_}")

# # Calculate and print average scores
# average_logloss = np.mean(logloss_scores)
# average_accuracy = np.mean(accuracy_scores)
# print(f"\nAverage Log Loss: {average_logloss}")
# print(f"Average Accuracy: {average_accuracy}")

# Predict

In [None]:
# hstack may return a COO matrix, which does not support row slicing;
# convert to CSR before taking the last len(test) rows.
preds_test = model.predict_proba(combined_test_vectors.tocsr()[-test.shape[0]:])
# preds_test = np.mean(test_pred_list, axis=0)

# submission = pd.DataFrame({
#     'id': test["id"],
#     'winner_model_a': preds_test[:, 0],
#     'winner_model_b': preds_test[:, 1], 
#     'winner_tie': preds_test[:, 2]
# })
# submission.to_csv('submission.csv', index=False)
# print(submission)

# 🧪 | Blend predictions of 2 models

In [None]:
# Weighted average of the two models: 20% LightGBM, 80% Gemma.
# NOTE(review): this is an element-wise blend, so it assumes preds_test and
# gemma_preds list the test rows in the same order -- verify.
lgb_wt = 0.2
preds = lgb_wt * preds_test + (1 - lgb_wt) * gemma_preds

In [None]:
# Display the blended probability matrix.
preds

# 📬 | Submission

In [None]:
# Pair the blended probabilities with the test ids by position and write the
# submission file (columns of `preds`: model A wins, model B wins, tie).
submission = pd.DataFrame({
    'id': test["id"],
    'winner_model_a': preds[:, 0],
    'winner_model_b': preds[:, 1], 
    'winner_tie': preds[:, 2]
})
submission.to_csv('submission.csv', index=False)