From 04dde303b87258f09678ec3ae94d9f597ab8d591 Mon Sep 17 00:00:00 2001
From: Gabriele Sarti
Date: Tue, 30 Apr 2024 10:09:49 +0200
Subject: [PATCH] Add transformers v4.40 models to config, update changelog

---
 CHANGELOG.md                   | 20 ++++++++++++++++++++
 inseq/models/model_config.yaml | 14 +++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d7e27d..b4a36ee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,23 @@
 # Changelog
 
 *This file contains a high-level description of changes that were merged into the Inseq main branch since the last release. Refer to the [releases page](https://github.com/inseq-team/inseq/releases) for an exhaustive overview of changes introduced at each release.*
+
+## 🚀 Features
+
+- Added the new models `DbrxForCausalLM`, `OlmoForCausalLM`, `Phi3ForCausalLM` and `Qwen2MoeForCausalLM` to the model config.
+
+## 🔧 Fixes and Refactoring
+
+- Fix the issue in the attention implementation from [#268](https://github.com/inseq-team/inseq/issues/268) where non-terminal positions in the tensor were set to nan if they were 0s ([#269](https://github.com/inseq-team/inseq/pull/269)).
+
+- Fix the pad token in cases where it is not specified by default in the loaded model (e.g. for Qwen models) ([#269](https://github.com/inseq-team/inseq/pull/269)).
+
+- Fix the bug reported in [#266](https://github.com/inseq-team/inseq/issues/266) that made `value_zeroing` unusable with SDPA attention. The method can now be used on models defaulting to SDPA attention (e.g. `GemmaForCausalLM`) without passing `model_kwargs={'attn_implementation': 'eager'}` ([#267](https://github.com/inseq-team/inseq/pull/267)).
+
+## 📝 Documentation and Tutorials
+
+*No changes*
+
+## 💥 Breaking Changes
+
+*No changes*
\ No newline at end of file
diff --git a/inseq/models/model_config.yaml b/inseq/models/model_config.yaml
index 9618135..1b2433a 100644
--- a/inseq/models/model_config.yaml
+++ b/inseq/models/model_config.yaml
@@ -11,6 +11,9 @@ CodeGenForCausalLM:
 CohereForCausalLM:
   self_attention_module: "self_attn"
   value_vector: "value_states"
+DbrxForCausalLM:
+  self_attention_module: "attn"
+  value_vector: "value_states"
 FalconForCausalLM:
   self_attention_module: "self_attention"
   value_vector: "value_layer"
@@ -44,6 +47,9 @@ MixtralForCausalLM:
 MptForCausalLM:
   self_attention_module: "attn"
   value_vector: "value_states"
+OlmoForCausalLM:
+  self_attention_module: "self_attn"
+  value_vector: "value_states"
 OpenAIGPTLMHeadModel:
   self_attention_module: "attn"
   value_vector: "value"
@@ -53,9 +59,15 @@ OPTForCausalLM:
 PhiForCausalLM:
   self_attention_module: "self_attn"
   value_vector: "value_states"
+Phi3ForCausalLM:
+  self_attention_module: "self_attn"
+  value_vector: "value_states"
 Qwen2ForCausalLM:
   self_attention_module: "self_attn"
   value_vector: "value_states"
+Qwen2MoeForCausalLM:
+  self_attention_module: "self_attn"
+  value_vector: "value_states"
 StableLmForCausalLM:
   self_attention_module: "self_attn"
   value_vector: "value_states"
@@ -114,4 +126,4 @@ T5ForConditionalGeneration:
 UMT5ForConditionalGeneration:
   self_attention_module: "SelfAttention"
   cross_attention_module: "EncDecAttention"
-  value_vector: "value_states"
\ No newline at end of file
+  value_vector: "value_states"
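
For context, a minimal usage sketch of the `value_zeroing` behavior enabled by the fix referenced in the changelog above (the Gemma checkpoint name and prompt are illustrative; before [#267](https://github.com/inseq-team/inseq/pull/267) this call required passing `model_kwargs={"attn_implementation": "eager"}`):

```python
import inseq

# Load a model that defaults to SDPA attention with the value_zeroing method.
# No explicit model_kwargs={"attn_implementation": "eager"} override is needed
# after the fix described above (checkpoint name is illustrative).
model = inseq.load_model("google/gemma-2b", "value_zeroing")

# Attribute a short generation and visualize the resulting scores.
out = model.attribute("The capital of France is")
out.show()
```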