In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 9 2025

@author: Yaning
"""

# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
# import torch.nn.functional as F # mainly for ReLU
import numpy as np
import copy
import re

In [81]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

Downloading shards: 100%|████████████████████████████████████████████████████████████| 4/4 [06:21<00:00, 95.48s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.03s/it]


In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Model

# tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
# model = GPT2Model.from_pretrained("dbmdz/german-gpt2")
# model.eval()

  from .autonotebook import tqdm as notebook_tqdm


GPT2Model(
  (wte): Embedding(50265, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [3]:
s_text = "Haid geh ich in die Schdadt, um paar Eingaufn zu machn. Dann treffsch mich mit Feiern im Café un genießn e heeßn Gaffee. Es is'n schieener Tach, un ich freu mich offn Oobnd."
h_text = "Heute gehe ich in die Stadt, um ein paar Einkäufe zu machen. Danach treffe ich mich mit Freunden im Café und genieße einen heißen Kaffee. Es ist ein schöner Tag, und ich freue mich auf den Abend."

In [None]:
processed_s_text = s_text.lower()
processed_s_text = re.sub(r'[.,]', " ", processed_s_text)
processed_s_text = re.sub(r'\s+', " ", processed_s_text)

In [10]:
processed_h_text = h_text.lower()
processed_h_text = re.sub(r'[.,]', " ", processed_h_text)
processed_h_text = re.sub(r'\s+', " ", processed_h_text)

In [36]:
processed_h_text

'heute gehe ich in die stadt um ein paar einkäufe zu machen danach treffe ich mich mit freunden im café und genieße einen heißen kaffee es ist ein schöner tag und ich freue mich auf den abend '

In [23]:
test_h = processed_h_text[:16]
test_s = processed_s_text[:16]

In [20]:
processed_h_text

'heute gehe ich in die stadt um ein paar einkäufe zu machen danach treffe ich mich mit freunden im café und genieße einen heißen kaffee es ist ein schöner tag und ich freue mich auf den abend '

In [25]:
test_h

'heute gehe ich i'

In [None]:
with open('/home/yahu202d/workspaces/horse/yahu202d-saexy/data/hochdeutsch.txt', 'r') as file:
    h_text = file.read()

with open('/home/yahu202d/workspaces/horse/yahu202d-saexy/data/sachsen.txt', 'r') as file:
    s_text = file.read()

In [26]:
# Define the Transition Layer (T)
class TransitionLayer(nn.Module):
    def __init__(self, embedding_dim):
        super(TransitionLayer, self).__init__()
        # Trainable transition matrix to map standard to dialect
        self.transition_matrix = nn.Parameter(torch.randn(embedding_dim, embedding_dim))
    
    def forward(self, standard_embeddings):
        # Apply the transformation: H_dialect = T * H_standard
        return torch.matmul(standard_embeddings, self.transition_matrix)

In [27]:
embedding_dim = model.config.hidden_size  # LLaMA hidden size (usually 4096 for LLaMA-7B)
transition_layer = TransitionLayer(embedding_dim)

In [28]:
# Move everything to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
transition_layer.to(device)

TransitionLayer()

In [30]:
tokenizer.pad_token = tokenizer.eos_token

In [7]:
test_sentence = "Guten Morgen"

In [56]:
tokens_h = tokenizer(processed_h_text, padding=True, truncation=True, return_tensors="pt")
tokens_s = tokenizer(processed_s_text, padding=True, truncation=True, return_tensors="pt")

In [65]:
type(tokens_s)

transformers.tokenization_utils_base.BatchEncoding

In [66]:
from transformers import BatchEncoding

In [67]:
tokens_s_first_20 = BatchEncoding({
    'input_ids': tokens_s['input_ids'][:, :20],
    'attention_mask': tokens_s['attention_mask'][:, :20]
})

In [68]:
tokens_h_first_20 = BatchEncoding({
    'input_ids': tokens_h['input_ids'][:, :20],
    'attention_mask': tokens_h['attention_mask'][:, :20]
})

In [None]:
tokens_h_first_20

: 

In [56]:
# Example parallel dataset (Standard ↔ Dialect)
standard_sentences = [
    "Haid geh ich in die Schdadt",
    "um paar Eingaufn zu machn",
    "Dann treffsch mich mit Feiern im Café un genießn e heeßn Gaffee"
]

dialect_sentences = [
    "Heute gehe ich in die Stadt",
    "um ein paar Einkäufe zu machen",
    "Danach treffe ich mich mit Freunden im Café und genieße einen heißen Kaffee"
]

# Tokenize the sentences
def tokenize_sentences(sentences):
    return tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

standard_inputs = tokenize_sentences(standard_sentences)
dialect_inputs = tokenize_sentences(dialect_sentences)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [70]:
# Define Mean Squared Error loss
criterion = nn.MSELoss()

# Optimizer (e.g., Adam)
optimizer = optim.Adam(transition_layer.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Get standard language embeddings
    with torch.no_grad():
        standard_outputs = model(**tokens_h_first_20.to(device))
        # standard_outputs = model.base_model(**standard_inputs.to(device), output_hidden_states=True)
        # standard_hidden_states = standard_outputs.hidden_states[-1]  # Last hidden state (batch_size, seq_len, hidden_size)
        standard_hidden_states = standard_outputs.last_hidden_state

    # Get dialect embeddings (ground truth)
    with torch.no_grad():
        dialect_outputs = model(**tokens_s_first_20.to(device))
        dialect_hidden_states = dialect_outputs.last_hidden_state
    
    # Apply the transition matrix to map standard to dialect space
    predicted_dialect_hidden_states = transition_layer(standard_hidden_states)
    
    # Compute the loss between transformed embeddings and ground truth dialect embeddings
    loss = criterion(predicted_dialect_hidden_states, dialect_hidden_states)
    
    # Backward pass and optimize
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 2 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# After training, the transition layer will contain the trained matrix
print("Training complete. Transition matrix learned.")

Epoch [2/100], Loss: 2566.3745
Epoch [4/100], Loss: 2495.0471
Epoch [6/100], Loss: 2425.3965
Epoch [8/100], Loss: 2357.4358
Epoch [10/100], Loss: 2291.1714
Epoch [12/100], Loss: 2226.6003
Epoch [14/100], Loss: 2163.7163
Epoch [16/100], Loss: 2102.5088
Epoch [18/100], Loss: 2042.9645
Epoch [20/100], Loss: 1985.0653
Epoch [22/100], Loss: 1928.7887
Epoch [24/100], Loss: 1874.1079
Epoch [26/100], Loss: 1820.9930
Epoch [28/100], Loss: 1769.4128
Epoch [30/100], Loss: 1719.3331
Epoch [32/100], Loss: 1670.7191
Epoch [34/100], Loss: 1623.5348
Epoch [36/100], Loss: 1577.7441
Epoch [38/100], Loss: 1533.3096
Epoch [40/100], Loss: 1490.1947
Epoch [42/100], Loss: 1448.3630
Epoch [44/100], Loss: 1407.7780
Epoch [46/100], Loss: 1368.4037
Epoch [48/100], Loss: 1330.2047
Epoch [50/100], Loss: 1293.1460
Epoch [52/100], Loss: 1257.1937
Epoch [54/100], Loss: 1222.3137
Epoch [56/100], Loss: 1188.4736
Epoch [58/100], Loss: 1155.6409
Epoch [60/100], Loss: 1123.7842
Epoch [62/100], Loss: 1092.8724
Epoch [64/10

In [72]:
model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2")

In [88]:
model.to("cpu")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [93]:
# After training, use the trained transition matrix to generate dialect responses
max_length = 10
def generate_dialect_response(input_text):
    # Tokenize the input question
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    input_ids = inputs["input_ids"]
    # Get standard language embeddings
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(**inputs.to("cpu"), output_hidden_states=True)
            standard_hidden_states = outputs.hidden_states[-1]  # Last hidden state
    
            # Apply the transition matrix to convert standard to dialect
            # dialect_hidden_states = transition_layer(standard_hidden_states)
            dialect_hidden_states = standard_hidden_states

    
            # Use transformed dialect embeddings to generate text
            outputs.logits = model.lm_head(dialect_hidden_states)
    
            # Generate text from the transformed hidden states (using argmax or beam search)
            generated_tokens = torch.argmax(outputs.logits, dim=-1)
            input_ids = torch.cat([input_ids, generated_tokens], dim=-1)


    response = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    
    return response

# Example question
input_question = "was ist die liebe"
dialect_response = generate_dialect_response(input_question)
print(f"Input: {input_question}")
print(f"Dialect Response: {dialect_response}")


Input: was ist die liebe
Dialect Response: was ist die liebeQuestion born das beste
Question born das beste
Question born das beste
Question born das beste
Question born das beste
Question born das beste
Question born das beste
Question born das beste
Question born das beste
Question born das beste



In [94]:
def generate_with_embeddings(input_text, model, tokenizer, max_length=50, device="cpu"):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    
    # Store embeddings of all generated tokens
    all_embeddings = []

    with torch.no_grad():
        for _ in range(max_length):
            # Pass input_ids through the model to get logits and hidden states
            outputs = model(input_ids=input_ids, output_hidden_states=True)
            
            # Get the last hidden state (embeddings for the most recent token)
            hidden_state = outputs.hidden_states[-1][:, -1, :]  # Shape: (batch_size, embedding_dim)
            all_embeddings.append(hidden_state)  # Save the embedding
            
            # Get logits for the last token
            logits = outputs.logits[:, -1, :]  # Shape: (batch_size, vocab_size)
            
            # Generate the next token using greedy decoding (argmax)
            next_token = torch.argmax(logits, dim=-1).unsqueeze(-1)  # Shape: (batch_size, 1)
            
            # Append the new token to input_ids for the next iteration
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            
            # Optional: Stop if end-of-sequence token is generated
            if next_token.item() == tokenizer.eos_token_id:
                break

    # Decode the final sequence of tokens
    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    
    return generated_text, torch.cat(all_embeddings, dim=0)  # Return the generated text and embeddings


In [96]:
# Example question
input_question = "was ist die liebe"
dialect_response = generate_with_embeddings(input_question,model,tokenizer)
print(f"Input: {input_question}")
print(f"Dialect Response: {dialect_response}")

Input: was ist die liebe
Dialect Response: ('was ist die liebe\nwas ist die liebe\nPost by 1st » 2019-09-11 08:00\nwas ist die liebe\nPost by 1st » 2019-09-11 08:00\nwas ist die', tensor([[-0.3350,  1.3654,  3.1655,  ...,  0.6161,  0.4388,  0.4800],
        [ 0.8461,  2.3800,  2.1139,  ..., -0.0284,  0.4820,  0.6537],
        [ 1.8327,  0.3759,  1.4126,  ...,  0.6590,  0.6365,  2.0774],
        ...,
        [ 0.8370,  2.4210,  2.0880,  ..., -1.1345, -0.2371,  0.3876],
        [ 1.4597,  1.0331, -0.0953,  ...,  1.2783,  0.6289,  1.7850],
        [ 0.8383,  1.6625,  1.0387,  ..., -0.4015,  0.3411,  0.7449]]))


In [128]:
def ask_llama(question, model, tokenizer, max_length=10, device="cpu"):
    # Tokenize the input question
    inputs = tokenizer(question, return_tensors="pt").to(device)
    
    # Generate the response
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_length=max_length,
            temperature=0.7,  # Controls randomness (lower = more deterministic)
            top_k=50,         # Top-k sampling for diversity
            do_sample=True    # Enable sampling for less repetitive responses
        )
    
    
    # Decode the response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response, output[0]

# Example: Ask a question
question = "Was ist die liebekraft?"
response, output = ask_llama(question, model, tokenizer)
print("LLaMA's Response:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


LLaMA's Response: Was ist die liebekraft? - Was


In [114]:
tokenizer.decode(output[8])

' Paris'

In [129]:
output

tensor([128000,  27125,   6127,   2815,  10457,  77614,   3017,     30,    482,
         15148])

In [117]:
test = "diese übersetzung ist sehr unterschiedlich"

In [119]:
test_output = tokenizer(test)

In [131]:
tokenizer.decode(77614)

'bek'