<a href="https://colab.research.google.com/github/isikaykarakus/Foreo_AI_Internship/blob/main/foreow2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from huggingface_hub import notebook_login, whoami

# Ask for a Hugging Face access token, then show which account it resolves to.
notebook_login()          # paste your HF access token when prompted
hf_account = whoami()
print("HF user:", hf_account.get("name"))


In [None]:
import os, json, pathlib
home = pathlib.Path("~").expanduser()
hf_dir = home / ".huggingface"
hf_dir.mkdir(exist_ok=True)
# Persist the token from the HF_TOKEN environment variable so that later
# sessions can authenticate without prompting.
# NOTE(review): recent huggingface_hub releases appear to read the token from
# ~/.cache/huggingface/token rather than ~/.huggingface/token — confirm the path.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    token_path = hf_dir / "token"
    with open(token_path, "w", encoding="utf-8") as f:
        f.write(hf_token)
    # The file holds a credential: restrict it to the current user.
    token_path.chmod(0o600)
else:
    # Writing an empty string here would wipe a token saved by an earlier login.
    print("[info] HF_TOKEN is not set; leaving any existing token file untouched.")


In [None]:
# -*- coding: utf-8 -*-
# Multilingual PoC: EN/ES/PL/TR + Gemma 270M (with safe fallbacks) + style control

!pip -q install sentence-transformers faiss-cpu transformers pandas


import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from IPython.display import display



In [None]:
# Seed corpus of idioms/slang: one dict per (concept, language) pair.
# There are 8 concepts, each given in en / es / pl / tr (32 rows in total).
# Keys of every row:
#   concept_id  - shared by the rows that express the same idea across languages
#   lang        - two-letter language code of the phrase
#   phrase      - the expression itself
#   meaning     - short gloss, written in the phrase's own language
#   usage       - one example sentence using the phrase
#   source_url  - provenance link (left empty for every row here)
rows = [
    # 1) Gossip / reveal secret
    {"concept_id":"gossip","lang":"en","phrase":"spill the tea","meaning":"share gossip or reveal a secret","usage":"She spilled the tea about the new launch.","source_url":""},
    {"concept_id":"gossip","lang":"es","phrase":"soltar la sopa","meaning":"contar un secreto o chisme","usage":"Al final soltó la sopa sobre la campaña.","source_url":""},
    {"concept_id":"gossip","lang":"pl","phrase":"puścić farbę","meaning":"zdradzić sekret","usage":"W końcu puścił farbę o projekcie.","source_url":""},
    {"concept_id":"gossip","lang":"tr","phrase":"ağzındaki baklayı çıkarmak","meaning":"sırrı açıklamak; ağzından kaçırmak","usage":"Sonunda ağzındaki baklayı çıkardı.","source_url":""},

    # 2) Daydreaming / head in the clouds
    {"concept_id":"daydream","lang":"en","phrase":"have your head in the clouds","meaning":"be distracted or daydreaming","usage":"He had his head in the clouds during the briefing.","source_url":""},
    {"concept_id":"daydream","lang":"es","phrase":"estar en las nubes","meaning":"estar distraído; soñar despierto","usage":"En clase siempre está en las nubes.","source_url":""},
    {"concept_id":"daydream","lang":"pl","phrase":"bujać w obłokach","meaning":"marzyć; bujać w obłokach","usage":"Na spotkaniu tylko bujał w obłokach.","source_url":""},
    {"concept_id":"daydream","lang":"tr","phrase":"aklı havada olmak","meaning":"dalgın olmak; hayallere dalmak","usage":"Toplantıda aklı tamamen havadaydı.","source_url":""},

    # 3) Low-key / subtly
    {"concept_id":"lowkey","lang":"en","phrase":"low-key","meaning":"subtly; a little; not openly","usage":"I’m low-key excited about this collab.","source_url":""},
    {"concept_id":"lowkey","lang":"es","phrase":"de tranquis","meaning":"de forma discreta; sin alardear","usage":"Lo celebramos de tranquis con el equipo.","source_url":""},
    {"concept_id":"lowkey","lang":"pl","phrase":"po cichu","meaning":"dyskretnie; po cichu","usage":"Zrobili to po cichu, bez ogłoszeń.","source_url":""},
    {"concept_id":"lowkey","lang":"tr","phrase":"çaktırmadan","meaning":"göze batmadan; usulca","usage":"Çaktırmadan birkaç değişiklik yaptık.","source_url":""},

    # 4) Mid / average
    {"concept_id":"mid","lang":"en","phrase":"mid","meaning":"average; not great","usage":"Tbh, the results were mid.","source_url":""},
    {"concept_id":"mid","lang":"es","phrase":"del montón","meaning":"normalito; sin destacar","usage":"Sinceramente, el vídeo quedó del montón.","source_url":""},
    {"concept_id":"mid","lang":"pl","phrase":"takie sobie","meaning":"średnie; nic specjalnego","usage":"Szczerze, wyniki są takie sobie.","source_url":""},
    {"concept_id":"mid","lang":"tr","phrase":"orta karar","meaning":"ortalama; vasat","usage":"Açıkçası performans orta karardı.","source_url":""},

    # 5) Lose it / get very angry
    {"concept_id":"loseit","lang":"en","phrase":"lose it","meaning":"become extremely angry or upset","usage":"I almost lost it when the app crashed.","source_url":""},
    {"concept_id":"loseit","lang":"es","phrase":"perder los papeles","meaning":"perder el control; enfadarse mucho","usage":"Con el retraso, perdió los papeles.","source_url":""},
    {"concept_id":"loseit","lang":"pl","phrase":"puścić nerwy","meaning":"stracić panowanie nad sobą","usage":"Prawie puściły mi nerwy przy tej awarii.","source_url":""},
    {"concept_id":"loseit","lang":"tr","phrase":"kafayı yemek","meaning":"çok sinirlenmek; kendini kaybetmek","usage":"Uygulama çökünce az kalsın kafayı yiyordum.","source_url":""},

    # 6) Get hyped / fired up
    {"concept_id":"hype","lang":"en","phrase":"get hyped","meaning":"become very excited or fired up","usage":"The crowd got hyped before the drop.","source_url":""},
    {"concept_id":"hype","lang":"es","phrase":"venirse arriba","meaning":"animarse mucho; venirse arriba","usage":"Con el tema nuevo todos se vinieron arriba.","source_url":""},
    {"concept_id":"hype","lang":"pl","phrase":"nakręcić się","meaning":"mocno się nakręcić; podekscytować","usage":"Publika szybko się nakręciła.","source_url":""},
    {"concept_id":"hype","lang":"tr","phrase":"gaza gelmek","meaning":"coşmak; hemen motive olmak","usage":"Kalabalık bir anda gaza geldi.","source_url":""},

    # 7) Ghosting
    {"concept_id":"ghosting","lang":"en","phrase":"ghosting","meaning":"suddenly cutting off contact","usage":"After two dates, it was pure ghosting.","source_url":""},
    {"concept_id":"ghosting","lang":"es","phrase":"hacer ghosting","meaning":"dejar de responder sin explicación","usage":"Después del mensaje, me hizo ghosting.","source_url":""},
    {"concept_id":"ghosting","lang":"pl","phrase":"zniknąć bez słowa","meaning":"przestać się odzywać; zniknąć","usage":"Po rozmowie zniknął bez słowa.","source_url":""},
    {"concept_id":"ghosting","lang":"tr","phrase":"ghostlamak / ortadan kaybolmak","meaning":"hiçbir açıklama yapmadan iletişimi kesmek","usage":"İki görüşmeden sonra resmen ghostladı.","source_url":""},

    # 8) Not in the mood / off today
    {"concept_id":"offday","lang":"en","phrase":"not in the mood","meaning":"feeling off; not up for it","usage":"I’m not in the mood for calls today.","source_url":""},
    {"concept_id":"offday","lang":"es","phrase":"no tener el día","meaning":"estar regular; no estar de humor","usage":"Hoy no tengo el día para reuniones.","source_url":""},
    {"concept_id":"offday","lang":"pl","phrase":"nie w sosie","meaning":"mieć zły humor; być nie w formie","usage":"Jestem dziś nie w sosie.","source_url":""},
    {"concept_id":"offday","lang":"tr","phrase":"keyfi yok","meaning":"modu düşük; canı istemiyor","usage":"Bugün pek keyfim yok toplantılara.","source_url":""},
]
# Tabulate the corpus, then fuse phrase, meaning and usage into one text field
# ("blob") per row; that field is what gets embedded for retrieval.
df = pd.DataFrame(rows)
df["blob"] = df["phrase"].str.cat([df["meaning"], df["usage"]], sep=" — ")
display(df.head())


In [None]:
# ---------------------------
# 3) Embeddings (multilingual)
# ---------------------------
# Encode every corpus blob once; vectors are L2-normalised by the encoder call.
EMB_ID = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
emb_model = SentenceTransformer(EMB_ID)
corpus_texts = list(df["blob"])
embeddings = emb_model.encode(corpus_texts, normalize_embeddings=True)


In [None]:
# ---------------------------
# 4) Generator (Gemma → FLAN)
# ---------------------------
# Try each candidate in order and keep the first one that loads.
# `loaded_id` records which model won; `gen` is the ready-to-call pipeline.
# Both stay None when every candidate fails.
CANDIDATES = ["google/gemma-3-270m", "google/flan-t5-small"]
loaded_id, gen = None, None
for mid in CANDIDATES:
    try:
        tok = AutoTokenizer.from_pretrained(mid)
        if "t5" in mid.lower():
            # T5 models are sequence-to-sequence, so they use the text2text task.
            gen = pipeline("text2text-generation", model=mid, tokenizer=tok)
        else:
            lm = AutoModelForCausalLM.from_pretrained(mid)
            gen = pipeline("text-generation", model=lm, tokenizer=tok)
        loaded_id = mid
        break
    except Exception as e:
        # Best-effort: report the failure and fall through to the next candidate.
        print(f"[warn] Could not load {mid}: {e}")
if gen is None:
    # Make the total failure explicit instead of reporting "Loaded generator: None".
    print(
        "[warn] No generator could be loaded from "
        f"{CANDIDATES}; retrieval still works but explanations cannot be generated."
    )
else:
    print(f"[info] Loaded generator: {loaded_id}")


In [None]:
# ---------------------------
# 5) Retrieval + explanation
# ---------------------------
# Instruction text inserted into the prompt, keyed by the style name callers pass.
STYLES = dict(
    learner="Explain simply for language learners. Avoid slang in the explanation and include ONE short example.",
    casual="Use a casual, friendly tone and keep it short.",
    formal="Use a clear, formal, brand-safe tone suitable for documentation.",
)

def search(query: str, k: int = 3) -> pd.DataFrame:
    """Return the `k` corpus rows most similar to `query`.

    The query is embedded with `emb_model` and compared against the
    precomputed corpus `embeddings` by cosine similarity.

    Args:
        query: Free-text expression to look up (any language).
        k: Maximum number of rows to return. Values larger than the corpus
            simply return every row; 0 returns an empty frame.

    Returns:
        DataFrame with columns lang, phrase, meaning, usage, source_url and
        score, ordered from most to least similar.

    Raises:
        ValueError: If `k` is negative.
    """
    if k < 0:
        # A negative k would slice as [:k] and silently return almost the
        # whole corpus instead of "the top k".
        raise ValueError(f"k must be non-negative, got {k}")
    qv = emb_model.encode([query], normalize_embeddings=True)
    sims = cosine_similarity(qv.reshape(1, -1), embeddings)[0]
    # argsort is ascending, so reverse before taking the first k indices.
    top_idx = np.argsort(sims)[::-1][:k]
    hits = df.iloc[top_idx].copy()
    hits["score"] = sims[top_idx]
    return hits[["lang","phrase","meaning","usage","source_url","score"]]

def _gen_text(prompt: str) -> str:
    """Run the loaded generator on `prompt` and return only the generated answer.

    Decoding is greedy (`do_sample=False`) and capped at 140 new tokens.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        The model's generated text.

    Raises:
        RuntimeError: If no generator was loaded (`gen` is None).
    """
    if gen is None:
        raise RuntimeError(
            "No text generator is loaded; check the warnings printed by the generator-loading cell."
        )
    if "t5" in (loaded_id or "").lower():
        # text2text pipelines return only the newly generated sequence.
        outputs = gen(prompt, max_new_tokens=140, do_sample=False)
    else:
        # text-generation pipelines echo the prompt in front of the completion
        # unless told otherwise; drop it so callers receive just the answer.
        outputs = gen(prompt, max_new_tokens=140, do_sample=False, return_full_text=False)
    return outputs[0]["generated_text"]

def explain(query: str, style: str = "learner", k: int = 3):
    """Retrieve the closest corpus entries for `query` and generate an explanation.

    Args:
        query: Expression to explain.
        style: Key into `STYLES`; an unknown key falls back to "learner".
        k: Number of retrieved entries to put in the prompt.

    Returns:
        Tuple `(hits, answer)`: the retrieved rows and the generated text.
    """
    hits = search(query, k)

    # One bullet per retrieved row, tagged with its language code.
    bullets = []
    for lang, phrase, meaning, usage in zip(
        hits["lang"], hits["phrase"], hits["meaning"], hits["usage"]
    ):
        bullets.append(f"- [{lang}] {phrase}: {meaning} (e.g., {usage})")
    context = "\n".join(bullets)

    style_instr = STYLES[style] if style in STYLES else STYLES["learner"]

    task_line = f"Explain the expression '{query}'. {style_instr}\n"
    guidance_line = f"Use the retrieved examples below as context and mention the language code in examples.\n"
    examples_part = f"Retrieved examples:\n{context}\n\nAnswer:"
    prompt = task_line + guidance_line + examples_part

    return hits, _gen_text(prompt)

def aligned_equivalents_from_top_hit(hits: pd.DataFrame) -> pd.DataFrame:
    """List every corpus row that shares a concept with the best hit.

    The first row of `hits` is matched back to the corpus by its
    (phrase, lang) pair to recover its `concept_id`; all rows carrying that
    concept id are returned.

    Args:
        hits: Retrieval result whose first row is the top match.

    Returns:
        DataFrame with columns lang, phrase, meaning, usage, source_url, or an
        empty DataFrame when `hits` is empty or the top hit is not in the corpus.
    """
    if hits.empty:
        return pd.DataFrame()

    best = hits.iloc[0]
    is_same_entry = df["phrase"].eq(best["phrase"]) & df["lang"].eq(best["lang"])
    if not is_same_entry.any():
        return pd.DataFrame()

    # If several rows match, the first one decides the concept.
    concept = df.loc[is_same_entry, "concept_id"].iloc[0]
    wanted_cols = ["lang","phrase","meaning","usage","source_url"]
    return df.loc[df["concept_id"] == concept, wanted_cols]


In [None]:
# ---------------------------
# 6) Demo — one query per lang
# ---------------------------
queries = [
    "spill the tea",
    "estar en las nubes",
    "bujać w obłokach",
    "kafayı yemek",
]
for q in queries:
    print("\n==============================")
    print("QUERY:", q)
    hits, ans = explain(q, style="learner", k=3)
    display(hits)
    # Show the same concept in the other languages when the top hit is in the corpus.
    exact = aligned_equivalents_from_top_hit(hits)
    if len(exact.index) and len(exact.columns):
        print("\nAligned equivalents (exact cross-language matches):")
        display(exact)
    print("\n--- EXPLANATION ---\n", ans)

# Style customisation example: the same query rendered in two other tones.
style_answers = {}
for style_name, banner in (
    ("casual", "\n=== STYLE: CASUAL ===\n"),
    ("formal", "\n=== STYLE: FORMAL ===\n"),
):
    style_answers[style_name] = explain("spill the tea", style=style_name, k=3)[1]
    print(banner, style_answers[style_name])
ans_casual = style_answers["casual"]
ans_formal = style_answers["formal"]