# Integrated Model
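This notebook ensembles two pre-trained models (Mistral-7B and Qwen3-8B): each model scores the test set, the two sets of logits are combined with a weighted average, and the top-3 classes per row are written to `submission.csv`.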

In [1]:
import os
# Configure the PyTorch CUDA allocator via environment variable; this is set
# here, before `import torch` below.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Install only the required package (offline wheel from a Kaggle dataset).
# NOTE(review): the exit status returned by os.system is discarded, so a failed
# install is not detected at this point.
os.system('pip install "/kaggle/input/bitsandbytes-0-46-1/bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl" --no-build-isolation --no-deps --force-reinstall')

# Import all required libraries
import torch
from torch.utils.data import DataLoader   
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, BitsAndBytesConfig
from datasets import Dataset
import gc

print("Step 1: Libraries imported and environment set up.")

Processing /kaggle/input/bitsandbytes-0-46-1/bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1


2025-08-25 08:01:49.926390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756108910.130646      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756108910.189738      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Step 1: Libraries imported and environment set up.


# 2. Efficient Data Loading

In [2]:
# Root folder of the competition dataset.
DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/'

# Read the train and test splits from the same folder.
train, test = (pd.read_csv(DATA_PATH + csv_name) for csv_name in ('train.csv', 'test.csv'))

print(f"Step 2: Data loaded - Train: {train.shape}, Test: {test.shape}")

Step 2: Data loaded - Train: (36696, 7), Test: (3, 5)


# 3. Fast Feature Engineering

In [3]:
# Build the classification target: one class per "Category:Misconception" pair.
le = LabelEncoder()
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ':' + train.Misconception
train['label'] = le.fit_transform(train['target'])

# Build the correct-answer lookup: rows whose Category prefix (text before the
# first '_') is 'True' mark the chosen answer as correct. Vectorised string ops
# give the same mask as a row-wise apply without the per-row Python overhead.
idx = train.Category.str.split('_').str[0] == 'True'
correct = train.loc[idx].copy()
# For each question keep the answer that appears most often among those rows.
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c', ascending=False).drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

# Drop any `is_correct` column left by a previous run of this cell: merging a
# second time would otherwise create is_correct_x / is_correct_y and break the
# fillna below, making the cell impossible to re-execute.
test = test.drop(columns=['is_correct'], errors='ignore').merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
# Test rows with no match in the lookup are treated as incorrect answers.
test['is_correct'] = test['is_correct'].fillna(0)

print(f"Step 3: Feature engineering completed - {len(le.classes_)} classes")

Step 3: Feature engineering completed - 65 classes


# 4. Format Input

In [4]:
# def format_input(row):
#     correctness = "This answer is correct." if row['is_correct'] else "This answer is incorrect."
#     return f"Question: {row['QuestionText']}\nAnswer: {row['MC_Answer']}\n{correctness}\nStudent Explanation: {row['StudentExplanation']}"

# test['text'] = test.apply(format_input, axis=1)
# ds_test = Dataset.from_pandas(test)

# print("Step 4: Model input formatted.")

In [5]:
def format_input(row):
    """Render one data row as the text prompt fed to the classifiers.

    Reads the 'QuestionText', 'MC_Answer', 'StudentExplanation' and
    'is_correct' fields of `row` and returns a multi-line prompt string that
    ends with the analysis instruction.
    """
    if row['is_correct']:
        correctness = "This answer is correct."
    else:
        correctness = "This answer is incorrect."

    # One entry per prompt line; empty strings are the blank separator lines.
    prompt_lines = [
        f"Math Question: {row['QuestionText']}",
        "",
        f"Student's Answer: {row['MC_Answer']}",
        f"Answer Status: {correctness}",
        "",
        f"Student's Explanation: {row['StudentExplanation']}",
        "",
        "Analyze the student's mathematical reasoning and identify any misconceptions.",
    ]
    return "\n".join(prompt_lines)

# Render a prompt for every test row, store it in the 'text' column, and wrap
# the frame in a Dataset for tokenization.
prompt_texts = test.apply(format_input, axis=1)
test['text'] = prompt_texts
ds_test = Dataset.from_pandas(test)
print("Step 4: Model input formatted.")

Step 4: Model input formatted.


# 5. Optimized Model Inference

In [6]:
def get_predictions(model_path, ds_test, max_length=128, batch_size=4):
    """Run 4-bit quantized inference for one classifier (tuned for the Kaggle T4 GPU).

    Args:
        model_path: Path passed to ``from_pretrained`` for both the model and
            the tokenizer.
        ds_test: Dataset with a ``"text"`` column holding the prompts.
        max_length: Token length every prompt is padded/truncated to.
        batch_size: Upper bound on the inference batch size; it is capped at
            ``len(ds_test)``.

    Returns:
        A float32 ``np.ndarray`` of logits with one row per example in
        ``ds_test``, in dataset order.

    Raises:
        ValueError: If ``ds_test`` is empty.
        RuntimeError: If the model cannot be loaded with any attention type.
    """
    # Fail fast: an empty dataset would otherwise only error out (batch_size=0)
    # after the model has already been loaded onto the GPU.
    if len(ds_test) == 0:
        raise ValueError("ds_test is empty; nothing to run inference on")

    print(f"Loading model from: {model_path}")

    # Free memory left over from earlier work
    torch.cuda.empty_cache()
    gc.collect()

    # Use 4-bit quantization to reduce memory usage
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Try the attention implementations in order (SDPA is the most compatible on T4)
    model = None
    load_errors = []
    for attn_type in ["sdpa", "eager"]:
        print(f"Trying {attn_type} attention...")
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
                device_map="cuda",
                attn_implementation=attn_type,
                quantization_config=quantization_config,
                torch_dtype=torch.float16,
                trust_remote_code=True
            )
        except Exception as e:
            print(f"{attn_type} failed: {str(e)[:100]}")
            # Keep only the message text: holding the exception object would
            # keep its traceback (and whatever it references) alive.
            load_errors.append(f"{attn_type}: {e}")
        if model is not None:
            print(f"Model loaded successfully with {attn_type}")
            break
        # Release whatever the failed attempt allocated before the next try.
        gc.collect()
        torch.cuda.empty_cache()
    else:
        # Surface the full (untruncated) reasons instead of dropping them.
        raise RuntimeError(
            "Failed to load model with any attention type: " + " | ".join(load_errors)
        )

    try:
        # Load the tokenizer; fall back to EOS as the padding token if none is set
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

        # Fast tokenization with a short, fixed sequence length
        def tokenize(batch):
            return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=max_length)

        ds_test_tokenized = ds_test.map(tokenize, batched=True)

        # Inference setup
        model.eval()
        data_collator = DataCollatorWithPadding(tokenizer)
        ds_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])

        # Never request a batch larger than the dataset itself
        batch_size = min(batch_size, len(ds_test))
        dataloader = DataLoader(ds_test_tokenized, batch_size=batch_size, collate_fn=data_collator, shuffle=False)

        print(f"Starting inference with batch_size={batch_size}")

        # Inference loop
        all_predictions = []
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                inputs = {k: v.to("cuda") for k, v in batch.items() if k in tokenizer.model_input_names}
                outputs = model(**inputs)
                # Upcast to float32 so downstream arithmetic on the logits is
                # not carried out in half precision.
                all_predictions.append(outputs.logits.float().cpu().numpy())

                # Free per-batch tensors; empty the CUDA cache every other batch
                del inputs, outputs
                if i % 2 == 0:
                    torch.cuda.empty_cache()

        # Stack the per-batch logits into one array
        predictions = np.concatenate(all_predictions, axis=0)
    finally:
        # Always release the model, even if inference failed, so the next model
        # loaded in this session does not compete with it for GPU memory.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    print(f"Inference completed. Shape: {predictions.shape}")
    return predictions

print("Step 5: Optimized inference function defined.")

Step 5: Optimized inference function defined.


# 6. Model Ensemble Inference

In [7]:
mistral_model_path = '/kaggle/input/map-exp-14-full/MAP_EXP_14_FULL'
qwen_model_path = '/kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL'

print("Starting ensemble inference...")

# Score the test set with each model in turn (Mistral first, then Qwen);
# the calls run one after another.
predictions_mistral, predictions_qwen = [
    get_predictions(checkpoint_dir, ds_test)
    for checkpoint_dir in (mistral_model_path, qwen_model_path)
]

print("Step 6: Both models completed successfully!")

Starting ensemble inference...
Loading model from: /kaggle/input/map-exp-14-full/MAP_EXP_14_FULL
Trying sdpa attention...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded successfully with sdpa


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Starting inference with batch_size=3
Inference completed. Shape: (3, 65)
Loading model from: /kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL
Trying sdpa attention...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully with sdpa


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Starting inference with batch_size=3
Inference completed. Shape: (3, 65)
Step 6: Both models completed successfully!


# 7. Generate Submission

In [8]:
# Weighted ensemble - give Qwen the higher weight
mistral_weight = 0.2  # Mistral weight
qwen_weight = 0.8     # Qwen weight (higher)

print(f"Using weighted ensemble: Mistral({mistral_weight}) + Qwen({qwen_weight})")

# The logits may arrive in half precision; a Python float times a float16 array
# stays float16, so upcast first to keep the blend (and the ranking derived
# from it) from being rounded to half precision.
logits_mistral = np.asarray(predictions_mistral, dtype=np.float32)
logits_qwen = np.asarray(predictions_qwen, dtype=np.float32)

# Both models must score the same rows over the label space encoded by `le`;
# otherwise the weighted sum or the label decoding below would be meaningless.
expected_shape = (len(test), len(le.classes_))
if logits_mistral.shape != logits_qwen.shape:
    raise ValueError(
        f"Model outputs disagree in shape: Mistral {logits_mistral.shape} vs Qwen {logits_qwen.shape}"
    )
if logits_mistral.shape != expected_shape:
    raise ValueError(
        f"Predictions have shape {logits_mistral.shape}, expected {expected_shape} (rows, classes)"
    )

# Weighted average instead of a simple average.
# NOTE(review): this blends raw logits, so the effective influence of each
# model also depends on the scale of its logits - confirm this is intended
# rather than blending softmax probabilities.
ensembled_predictions = mistral_weight * logits_mistral + qwen_weight * logits_qwen

# Take the top-3 classes per row (highest blended score first)
top3_indices = np.argsort(-ensembled_predictions, axis=1)[:, :3]

# Convert class indices back to label strings
flat_top3 = top3_indices.flatten()
decoded_labels = le.inverse_transform(flat_top3)
top3_labels = decoded_labels.reshape(top3_indices.shape)

# Format each row as space-separated labels
joined_preds = [" ".join(preds) for preds in top3_labels]

# Build the submission file
submission_df = pd.DataFrame({
    "row_id": test.row_id.values,
    "Category:Misconception": joined_preds
})

submission_df.to_csv("submission.csv", index=False)

print("Step 7: Weighted ensemble submission file created successfully!")
print(f"Submission shape: {submission_df.shape}")
print(f"Applied weights: Mistral={mistral_weight}, Qwen={qwen_weight}")
print("First few predictions:")
print(submission_df.head())

Using weighted ensemble: Mistral(0.2) + Qwen(0.8)
Step 7: Weighted ensemble submission file created successfully!
Submission shape: (3, 2)
Applied weights: Mistral=0.2, Qwen=0.8
First few predictions:
   row_id                             Category:Misconception
0   36696   True_Neither:NA True_Correct:NA False_Neither:NA
1   36697  False_Neither:NA False_Misconception:WNB False...
2   36698   True_Neither:NA False_Neither:NA True_Correct:NA
