<a href="https://colab.research.google.com/github/isikaykarakus/Foreo_AI_Internship/blob/main/foreow2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with the Hugging Face Hub before any model is downloaded.
# The model-loading cell further down notes that some Gemma variants require
# an accepted license / HF login, which is why this runs first.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# -*- coding: utf-8 -*-
# Multilingual PoC: EN/ES/PL/TR + Gemma 270M (with safe fallbacks) + style control

!pip -q install sentence-transformers faiss-cpu transformers pandas

import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [None]:
# -----------------------------
# 1) Tiny multilingual dataset
# -----------------------------
# Each idiom is stored as (lang, phrase, meaning, usage); all records share one
# placeholder source URL, which is appended when the row dictionaries are built.
_FIELDS = ("lang", "phrase", "meaning", "usage", "source_url")
_PLACEHOLDER_URL = "https://example.com"

_IDIOMS = [
    # English
    ("en", "spill the tea", "share gossip or a secret", "She spilled the tea about the new launch."),
    ("en", "low-key", "subtly; a little bit; not openly", "I’m low-key excited about this collab."),
    # Spanish
    ("es", "estar en las nubes", "estar distraído; no prestar atención", "En clase siempre está en las nubes."),
    ("es", "ponerse las pilas", "empezar a esforzarse; ponerse activo", "Tenemos que ponernos las pilas antes del lanzamiento."),
    # Polish
    ("pl", "mieć muchy w nosie", "być markotnym; mieć zły humor", "Dziś ma muchy w nosie i nie chce rozmawiać."),
    ("pl", "nie być w sosie", "mieć gorszy dzień; być nie w nastroju", "Szef jest dziś nie w sosie."),
    # Turkish
    ("tr", "kafayı yemek", "çok sinirlenmek ya da aklını kaçıracak gibi olmak", "Sunum bozulunca az daha kafayı yiyordum."),
    ("tr", "gaza gelmek", "kolayca coşup harekete geçmek; kışkırtılmak", "Arkadaşları söyleyince hemen gaza geldi."),
]

# Expand the compact tuples into one dict per idiom, keyed in _FIELDS order.
rows = [dict(zip(_FIELDS, (*idiom, _PLACEHOLDER_URL))) for idiom in _IDIOMS]
df = pd.DataFrame(rows)
display(df)


In [None]:

# ------------------------------------------
# 2) Multilingual embeddings + FAISS index
# ------------------------------------------
# One compact multilingual encoder is used so that all four languages are
# embedded into the same vector space.
EMB_ID = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
emb_model = SentenceTransformer(EMB_ID)

# Retrieval text for each row: phrase, meaning and usage joined by an em dash.
df["blob"] = df["phrase"].str.cat([df["meaning"], df["usage"]], sep=" — ")
embeddings = emb_model.encode(list(df["blob"]), normalize_embeddings=True)

# The vectors are normalized at encode time, so an inner-product index
# ranks neighbours by cosine similarity.
index = faiss.IndexFlatIP(embeddings.shape[-1])
index.add(embeddings)


In [None]:


# -----------------------------------------
# 3) Generator: prefer Gemma 270M, fallback
# -----------------------------------------
# Some Gemma variants require accepting license / HF login.
# We'll attempt Gemma-270M first; if that fails, fall back to a tiny open model.
MODEL_CANDIDATES = [
    "google/gemma-3-270m",     # preferred (if available to you)
    "google/gemma-2-2b-it",    # small-ish instruct fallback
    "google/flan-t5-small"     # tiny last-resort fallback (seq2seq, text2text pipeline)
]

loaded_id = None
gen = None
for mid in MODEL_CANDIDATES:
    try:
        tok = AutoTokenizer.from_pretrained(mid)
        if "t5" in mid.lower():
            # text2text style: the pipeline loads the seq2seq model from its id
            gen = pipeline("text2text-generation", model=mid, tokenizer=tok)
        else:
            # causal LM style: load the model explicitly, then wrap it
            lm = AutoModelForCausalLM.from_pretrained(mid)
            gen = pipeline("text-generation", model=lm, tokenizer=tok)
    except Exception as e:
        # Best-effort: report why this candidate failed and try the next one.
        print(f"[warn] Could not load {mid}: {e}")
        continue
    loaded_id = mid
    break

# Fail fast with an actionable message instead of leaving `gen` as None, which
# would only surface later as "'NoneType' object is not callable".
if gen is None:
    raise RuntimeError(
        "Could not load any generator model from MODEL_CANDIDATES "
        f"({', '.join(MODEL_CANDIDATES)}); see the [warn] messages above."
    )

print(f"[info] Loaded generator: {loaded_id}")


In [None]:

# -------------------------------
# 4) Retrieval & explanation
# -------------------------------
# Tone presets: each key names a style and maps to the instruction sentence for that style.
STYLES = dict(
    learner="Explain simply for language learners. Avoid slang in the explanation and include ONE short example.",
    casual="Use a casual, friendly tone and keep it short.",
    formal="Use a clear, formal, brand-safe tone suitable for documentation.",
)

def search(query: str, k: int = 3):
    """Return the rows of ``df`` most similar to ``query``, with a ``score`` column.

    Args:
        query: Free-text expression to look up.
        k: Maximum number of neighbours to return. Values larger than the
            number of indexed rows are clamped; values <= 0 yield an empty frame.

    Returns:
        A copy of the matching ``df`` rows in ranked order, with an added
        float ``score`` column holding the inner-product similarity.
    """
    # Never ask FAISS for more neighbours than the index holds: it pads the
    # missing slots with label -1, and df.iloc[-1] would silently return the
    # last row as a bogus hit.
    k = min(k, index.ntotal)
    if k <= 0:
        hits = df.iloc[0:0].copy()
        hits["score"] = pd.Series(dtype="float64")
        return hits

    query_vec = emb_model.encode([query], normalize_embeddings=True)
    scores, labels = index.search(query_vec, k)

    # Defensive: drop any padded (-1) slots should the index still return them.
    valid = labels[0] >= 0
    hits = df.iloc[labels[0][valid]].copy()
    hits["score"] = [float(s) for s in scores[0][valid]]
    return hits

def _gen_text(prompt: str, max_new_tokens: int = 140):
    """Run the loaded generation pipeline on ``prompt`` and return its text.

    Args:
        prompt: Full prompt passed to the pipeline.
        max_new_tokens: Upper bound on generated tokens (default 140).

    Returns:
        The ``generated_text`` of the first pipeline result. For causal models
        this is the completion only, without the prompt echoed back.

    Raises:
        RuntimeError: If no generator pipeline has been loaded (``gen`` is None).
    """
    if gen is None:
        raise RuntimeError("No generator pipeline is loaded; run the model-loading cell first.")

    if "t5" in (loaded_id or "").lower():
        # text2text pipelines return only the generated sequence
        outputs = gen(prompt, max_new_tokens=max_new_tokens)
    else:
        # text-generation pipelines return prompt + completion by default;
        # return_full_text=False keeps just the newly generated answer.
        outputs = gen(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            return_full_text=False,
        )
    return outputs[0]["generated_text"]

def explain(query: str, style: str = "learner", k: int = 3):
    """Retrieve similar idioms for ``query`` and generate a styled explanation.

    Args:
        query: Expression to explain.
        style: Key into ``STYLES``; an unknown key falls back to the
            ``"learner"`` instruction.
        k: Number of neighbours requested from ``search``.

    Returns:
        A ``(table, text)`` pair: the retrieved hits restricted to the
        lang/phrase/meaning/usage/source_url/score columns, and the text
        returned by ``_gen_text`` for the assembled prompt.
    """
    neighbours = search(query, k)

    # One bullet per retrieved idiom, tagged with its language code.
    bullets = []
    for hit in neighbours.itertuples(index=False):
        bullets.append(f"- [{hit.lang}] {hit.phrase}: {hit.meaning} (e.g., {hit.usage})")
    context = "\n".join(bullets)

    fallback_instruction = STYLES["learner"]
    instruction = STYLES.get(style, fallback_instruction)

    # The empty element yields the blank line between the examples and "Answer:".
    prompt = "\n".join([
        f"Explain the expression '{query}'. {instruction}",
        "Use the retrieved examples below as context and mention the language code in examples.",
        "Retrieved examples:",
        context,
        "",
        "Answer:",
    ])
    generated = _gen_text(prompt)

    shown_columns = ["lang", "phrase", "meaning", "usage", "source_url", "score"]
    return neighbours[shown_columns], generated


In [None]:

# -----------------------------------
# 5) Run one example per language
# -----------------------------------
# One representative idiom per language (EN, ES, PL, TR), explained in learner style.
queries = ["spill the tea", "estar en las nubes", "mieć muchy w nosie", "kafayı yemek"]
for q in queries:
    table, answer = explain(q, style="learner", k=3)
    display(table)
    print(f"\n--- {q} | learner ---\n{answer}\n")

# Show style customisation on one term.
# A single loop replaces the copy-pasted casual/formal calls; the printed
# headers are unchanged ("=== STYLE: CASUAL ===", "=== STYLE: FORMAL ===").
for demo_style in ("casual", "formal"):
    _, styled_answer = explain("spill the tea", style=demo_style, k=3)
    print(f"\n=== STYLE: {demo_style.upper()} ===\n", styled_answer)
