In [None]:
from datasets import load_dataset
import jsonlines

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# Read the tab-separated parallel corpus and take its "train" split.
data_file = "adamawa_english_fulfulde_french_fub.tsv"
dataset = load_dataset(
    "csv",
    data_files=data_file,
    delimiter="\t",
    split="train",
)

In [3]:
# Export the corpus as JSON Lines, keeping only the three language columns
# and renaming them to lowercase keys.
with jsonlines.open("adamawa_english_fulfulde_french_fub.jsonl", "w") as writer:
    # Iterate over rows directly instead of indexing dataset[i] three times
    # per row, which fetches the same record repeatedly.
    writer.write_all(
        {
            "english": row["English"],
            "fulfulde": row["Fulfulde"],
            "french": row["French"],
        }
        for row in dataset
    )
        

In [None]:
# Reload the corpus from the JSON Lines export and show the second record.
data_file = "adamawa_english_fulfulde_french_fub.jsonl"
dataset = load_dataset(
    "json",
    data_files=data_file,
    split="train",
)
print(dataset[1])

In [53]:
def tokenize_function(examples, tokenizer, max_length=32):
    """Build an "en: ... / ff: ..." prompt from a record and tokenize it.

    Args:
        examples: Mapping with at least the keys ``"english"`` and
            ``"fulfulde"``. Each value may be a single string (one record) or
            a list/tuple of strings (a column-oriented batch); extra keys are
            ignored.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., add_special_tokens=...)``.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 32, the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the formatted prompt(s).

    Raises:
        KeyError: If ``"english"`` or ``"fulfulde"`` is missing.
    """
    prompt_template = """en: {english}\nff: {fulfulde}"""

    english = examples["english"]
    fulfulde = examples["fulfulde"]
    if isinstance(english, (list, tuple)):
        # Column-oriented batch: format one prompt per aligned pair rather
        # than stringifying the whole list into a single prompt.
        prompts = [
            prompt_template.format(english=en, fulfulde=ff)
            for en, ff in zip(english, fulfulde)
        ]
    else:
        prompts = prompt_template.format(**examples)

    return tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
    )

In [54]:
# Hub identifier of the checkpoint whose tokenizer is loaded in the next cell.
# NOTE(review): the name suggests an English->Hungarian MarianMT model; confirm
# this is the intended baseline for Fulfulde experiments.
model_name = "NYTK/translation-marianmt-en-hu"


In [55]:
# Extended Latin letters, listed as upper/lower-case pairs, that are later
# added to the tokenizer vocabulary.
special_chars = [
    'Ɓ', 'ɓ',
    'Ɗ', 'ɗ',
    'Ŋ', 'ŋ',
    'Ƴ', 'ƴ',
    'Ñ', 'ñ',
]

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize the second record with the unmodified tokenizer and show both the
# encoded output and the subword pieces behind the ids.
text = dataset[1]
tokens = tokenize_function(text, tokenizer)
subwords = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(tokens, subwords, sep="\n")

In [None]:
# Add the characters in `special_chars` to the tokenizer vocabulary and
# report how many entries add_tokens says were added.
# NOTE(review): this mutates `tokenizer` in place, so re-running the cell
# presumably reports 0 additions - confirm. Any model paired with this
# tokenizer would likely need its embeddings resized - TODO verify.
num_added_tokens = tokenizer.add_tokens(special_chars)
print(f"Added {num_added_tokens} tokens to the vocabulary")

In [None]:
# Re-tokenize the same record now that the extra characters are in the
# vocabulary, for comparison with the earlier output.
tokens = tokenize_function(text, tokenizer)
subwords = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(tokens, subwords, sep="\n")

In [60]:
# Hand-written sentence pair used to probe the tokenizer outside the dataset.
english, fulfulde = "Hello, how are you?", "Sannu, a jamo?"

input_json = dict(english=english, fulfulde=fulfulde)

In [None]:
# Tokenize the hand-written pair and display the encoding plus its subwords.
tokens = tokenize_function(input_json, tokenizer)
subwords = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(tokens, subwords, sep="\n")

In [62]:
# Switch to a different checkpoint and replace `tokenizer` with its tokenizer.
# The new tokenizer is loaded fresh, so the `special_chars` added to the
# previous tokenizer object are not carried over.
# NOTE(review): the name suggests an English->Hausa OPUS-MT model - confirm.
model_name = "Helsinki-NLP/opus-mt-en-ha"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize the dataset record again with the newly loaded tokenizer so its
# segmentation can be compared with the previous checkpoint's.
tokens = tokenize_function(text, tokenizer)
subwords = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(tokens, subwords, sep="\n")

In [64]:
from ollama import chat

In [None]:
# System-style instruction intended for the {priming} slot of the template.
priming_text = "You are an expert at writing clear, concise, Python code."

# Prompt skeleton with three placeholders; the sections are separated by
# blank lines, and the text starts and ends with a newline.
prompt_template = "\n".join(
    [
        "",
        "{priming}",
        "",
        "{question}",
        "",
        "{decorator}",
        "",
        "Your solution:",
        "",
    ]
)

In [None]:
# Fill a one-slot prompt with a small code sample.
code_snippet = "def add(a, b):\n    return a + b"
prompt = "Explain this code:\n\n{code_snippet}"
formatted_prompt = prompt.format(code_snippet=code_snippet)

# Send the prompt to a local model as a single user message. The model must
# already be available locally (e.g. fetched with 'ollama pull'); sampling
# settings such as temperature go in the 'options' dictionary.
response = chat(
    model="llama3.2",
    messages=[
        {'role': 'user', 'content': formatted_prompt},
    ],
    options={'temperature': 0.7},
)

# Show only the text of the assistant's reply.
print(response['message']['content'])

In [None]:
from assistant import generate_response

# Ask the assistant helper for a response using only the task type.
task_type = "code_generation"
response = generate_response(task_type)

# Print a banner followed by the response.
print("\n=== AI Response ===\n", response, sep="\n")


In [None]:
# Caller-supplied values for the three prompt slots.
custom_inputs = dict(
    priming="You are an expert Python programmer who writes efficient and optimized code.",
    question="Write a Python function to find the factorial of a number.",
    decorator="Explain each step in detail.",
)

# Same task type as before, this time with the explicit inputs.
response = generate_response("code_generation", custom_inputs)

# Print a banner followed by the response.
print("\n=== AI Response ===\n", response, sep="\n")