<a href="https://colab.research.google.com/github/katoki001/emo_study/blob/main/emodu_edu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-cpu



In [2]:
import pandas as pd
from datasets import load_dataset
import warnings

warnings.filterwarnings("ignore")
def load_all_datasets():
    """Load every physics text source that can currently be read.

    Each source is read, its text is exposed as a ``text_content`` column, and
    the frame is reduced to ``text_content`` plus a constant ``source`` label.
    Loading is deliberately best-effort: a source that raises is reported on
    stdout and skipped so the remaining sources can still be used.

    Returns:
        list[pandas.DataFrame]: one two-column frame per source that loaded;
        an empty list when every source failed.
    """
    print("Loading physics datasets...")
    dataframes = []

    def pick_column(df, candidates):
        """Return the first column of ``df`` whose name is in ``candidates``.

        Raises:
            KeyError: if no candidate exists. The message lists the columns
                that are available, so a schema mismatch can be fixed without
                re-downloading the dataset just to inspect it.
        """
        for name in candidates:
            if name in df.columns:
                return df[name]
        raise KeyError(
            f"none of {list(candidates)} present; "
            f"available columns: {list(df.columns)}"
        )

    # Helper to safely add a dataframe
    def add_df(df, source):
        dataframes.append(
            df[["text_content"]].assign(source=source)
        )

    # 1. Wikipedia Physics
    def load_wikipedia():
        wiki = pd.read_parquet(
            "hf://datasets/arash11/wikipedia-physics-corpus/"
            "wikipedia-physics-paragraphs--planck-labeled.parquet"
        )
        # Prefer "content", fall back to "text"; fail loudly instead of
        # silently producing a column of empty strings when both are absent.
        return wiki.assign(text_content=pick_column(wiki, ("content", "text")))

    # 2. ArXiv Physics
    def load_arxiv():
        arxiv = load_dataset(
            "ayoubkirouane/arxiv-physics",
            split="train"
        ).to_pandas()
        # NOTE(review): the recorded run reports that "text" is missing from
        # this dataset; extend the candidate tuple once the schema is known.
        return arxiv.assign(text_content=pick_column(arxiv, ("text",)))

    # 3. ScienceQA
    def load_scienceqa():
        sq = load_dataset(
            "AnonySub628/physics-scienceqa",
            split="train"
        ).to_pandas()
        # NOTE(review): the recorded run reports that "question" is missing
        # from this dataset; extend the candidate tuple once the schema is known.
        return sq.assign(text_content=pick_column(sq, ("question",)))

    # 4. YouTube custom list
    def load_youtube():
        yt = pd.read_csv("youtube_physics_videos.csv")
        title = pick_column(yt, ("title",))
        url = pick_column(yt, ("url",))
        # The separator is matched again at retrieval time to recognise links,
        # so it must stay exactly as written here.
        return yt.assign(text_content=title + " ‚Äî " + url)

    # (progress banner, error label, source tag, loader)
    sources = [
        ("‚Üí Wikipedia physics", "Wikipedia error:", "wikipedia_physics", load_wikipedia),
        ("‚Üí ArXiv physics", "ArXiv error:", "arxiv", load_arxiv),
        ("‚Üí ScienceQA", "ScienceQA error:", "scienceqa", load_scienceqa),
        ("‚Üí YouTube CSV", "YouTube error:", "youtube_custom", load_youtube),
    ]

    for banner, error_label, source, loader in sources:
        try:
            print(banner)
            add_df(loader(), source)
        except Exception as e:
            # Best-effort by design: report and continue with the next source.
            print(error_label, e)

    print("Finished loading datasets.")
    return dataframes


def create_final_dataset(output_path="physics_clean_dataset.csv", min_chars=20):
    """Merge all loaded sources into one cleaned, de-duplicated CSV.

    Steps: concatenate the frames from ``load_all_datasets()``, drop missing
    texts, cast to ``str`` and strip whitespace, drop passages shorter than
    ``min_chars``, drop exact duplicate passages, number the rows from 1 in an
    ``id`` column, and write the result to ``output_path``.

    Args:
        output_path: destination CSV file. Later cells read the default name.
        min_chars: minimum passage length, in characters, after stripping.

    Returns:
        pandas.DataFrame: columns ``id``, ``text_content``, ``source``.

    Raises:
        ValueError: if no source could be loaded, or if no passage survives
            cleaning (an empty file would only fail later, less clearly).
    """
    dfs = load_all_datasets()

    if not dfs:
        raise ValueError("No datasets loaded.")

    combined = pd.concat(dfs, ignore_index=True)

    # Clean text
    cleaned = (
        combined.dropna(subset=["text_content"])
        .assign(
            text_content=lambda frame: frame["text_content"].astype(str).str.strip()
        )
    )

    # Filter + deduplicate
    cleaned = cleaned[cleaned["text_content"].str.len() >= min_chars]
    cleaned = cleaned.drop_duplicates(subset="text_content").reset_index(drop=True)

    if cleaned.empty:
        raise ValueError(
            f"No passages of at least {min_chars} characters remain after cleaning."
        )

    # Add ID
    cleaned.insert(0, "id", range(1, len(cleaned) + 1))

    # Save
    cleaned.to_csv(output_path, index=False)
    print(f"Saved {output_path} ({len(cleaned)} rows)")

    return cleaned
# Run
df = create_final_dataset()
df.head()

Loading physics datasets...
‚Üí Wikipedia physics
‚Üí ArXiv physics




ArXiv error: 'text'
‚Üí ScienceQA
ScienceQA error: 'question'
‚Üí YouTube CSV
YouTube error: [Errno 2] No such file or directory: 'youtube_physics_videos.csv'
Finished loading datasets.
Saved physics_clean_dataset.csv (93852 rows)


Unnamed: 0,id,text_content,source
0,1,The atomic number or nuclear charge number (sy...,wikipedia_physics
1,2,"For an ordinary atom which contains protons, n...",wikipedia_physics
2,3,Atoms with the same atomic number but differen...,wikipedia_physics
3,4,The conventional symbol Z comes from the Germa...,wikipedia_physics
4,5,The rules above do not always apply to exotic ...,wikipedia_physics


In [3]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

def generate_embeddings(max_rows=3000, batch_size=128):
    """Embed the cleaned passages and save vectors and texts side by side.

    Reads ``physics_clean_dataset.csv``, keeps the first ``max_rows`` passages,
    encodes them on the CPU with all-MiniLM-L6-v2 (L2-normalised vectors), and
    writes ``light_embeddings.npy`` and ``physics_texts.csv``. Row *i* of the
    CSV is the passage behind vector *i*.

    Args:
        max_rows: number of leading rows to embed; ``None`` embeds everything.
        batch_size: batch size passed to ``SentenceTransformer.encode``.

    Raises:
        ValueError: if the dataset contains no passages to embed.

    NOTE(review): rows are taken from the top of the file, and the dataset is
    written source by source, so a small ``max_rows`` only covers whichever
    source was concatenated first. Sample or raise the limit if later sources
    (e.g. the YouTube links) must be retrievable.
    """
    df = pd.read_csv("physics_clean_dataset.csv", usecols=["text_content"])
    # Guard against blank cells in a hand-edited CSV; the encoder needs strings.
    df = df.dropna(subset=["text_content"])
    if max_rows is not None:
        df = df.head(max_rows)
    if df.empty:
        raise ValueError("physics_clean_dataset.csv contains no passages to embed.")

    sentences = df["text_content"].astype(str).tolist()

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
    torch.set_num_threads(4)

    embeddings = model.encode(
        sentences,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,  # unit vectors: inner product == cosine similarity
        show_progress_bar=True
    )

    np.save("light_embeddings.npy", embeddings)
    df.to_csv("physics_texts.csv", index=False)
    print("Saved embeddings + texts")
    print("Shape:", embeddings.shape)

generate_embeddings()

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Saved embeddings + texts
Shape: (3000, 384)


In [4]:
import faiss
import numpy as np

# Load the vectors written by generate_embeddings() and cast them to float32,
# the dtype the FAISS index is fed with.
emb = np.load("light_embeddings.npy").astype("float32")

# Flat inner-product index sized to the embedding dimension. The vectors were
# encoded with normalize_embeddings=True, so inner product equals cosine
# similarity here.
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)
# Persist the index so later cells can reload it with faiss.read_index().
faiss.write_index(index, "physics_index.faiss")

print("FAISS index saved")


FAISS index saved


In [5]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Reload the artefacts written by the earlier cells. physics_texts.csv and the
# index were produced from the same rows in the same order, so texts[i] is the
# passage behind index entry i.
texts = pd.read_csv("physics_texts.csv")["text_content"].tolist()
index = faiss.read_index("physics_index.faiss")
# Same encoder name that generate_embeddings() used for the indexed passages.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def retrieve(query, k=5):
    """Return up to ``k`` passages most similar to ``query``.

    Args:
        query: free-text question.
        k: maximum number of passages to return.

    Returns:
        list[str]: passages ordered from most to least similar. Fewer than
        ``k`` are returned when the index holds fewer than ``k`` vectors.

    NOTE: a later cell redefines ``retrieve`` with a different return type.
    """
    # Normalise the query like the indexed vectors so scores are cosine
    # similarities; the ranking itself is unaffected by the scaling.
    q_emb = model.encode([query], normalize_embeddings=True).astype("float32")
    _, I = index.search(q_emb, k)
    # FAISS pads missing results with -1; without the filter texts[-1] would
    # silently return the last passage.
    return [texts[i] for i in I[0] if i >= 0]

def answer_question(question):
    """Return the five best-matching passages for ``question`` as one string.

    The passages come from ``retrieve`` and are separated by a ``---`` line
    beneath a fixed heading.
    """
    passages = retrieve(question, 5)
    body = "\n---\n".join(passages)
    return "Relevant physics information:\n\n" + body

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [10]:
# =======================
# Imports
# =======================
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc


# =======================
# Load data & models
# =======================
# Passages and FAISS index written by the earlier cells; texts[i] is the
# passage behind index entry i.
texts = pd.read_csv("physics_texts.csv")["text_content"].tolist()
index = faiss.read_index("physics_index.faiss")

# Sentence encoder used to embed queries, pinned to the CPU. It is the same
# model name that produced the indexed embeddings.
embed_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="cpu"
)

# Causal language model (and its tokenizer) used by generate_explanation().
llm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(llm_name)

# Place every module on the CPU and keep the weights in float32.
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map={"": "cpu"},
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True
)


# =======================
# Helper functions
# =======================
def is_numerical_problem(question: str) -> bool:
    """Heuristically decide whether ``question`` asks for a calculation.

    A question counts as numerical when either

    * a word *starts with* one of the trigger keywords (so "calculated" and
      "speeds" match, but "sometimes" and "lifetime" no longer match "time"), or
    * it contains a digit together with a quantity phrase such as "how much"
      or "how far" (e.g. "lifts a box weighing 100 ... How much work is done?").

    Args:
        question: the user's question; ``None`` or empty is treated as
            non-numerical instead of raising.

    Returns:
        bool: ``True`` when the question looks like a problem to solve.
    """
    import re

    if not question:
        return False

    text = question.lower()

    # \b anchors the keyword at the start of a word only; suffixes are allowed.
    keyword_pattern = (
        r"\b(?:calculate|determine|find|velocity|acceleration|"
        r"distance|time|speed)"
    )
    if re.search(keyword_pattern, text):
        return True

    has_number = re.search(r"\d", text) is not None
    asks_quantity = (
        re.search(r"\bhow\s+(?:much|many|far|fast|long|high)\b", text) is not None
    )
    return has_number and asks_quantity

def retrieve(query, k=2, max_chars=600):
    """Find the passages closest to ``query`` and split them into text and links.

    Args:
        query: free-text question.
        k: number of nearest neighbours requested from the FAISS index.
        max_chars: context passages are truncated to this many characters.

    Returns:
        tuple[list[str], list[str]]: ``(context_chunks, links)``. An entry that
        contains the YouTube "title/url" separator is returned untruncated in
        ``links``; every other entry is truncated and put in ``context_chunks``.
    """
    # Normalise the query like the indexed vectors so scores are cosine
    # similarities; the ranking itself is unaffected by the scaling.
    q_emb = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    _, I = index.search(q_emb, k)

    context_chunks = []
    links = []

    for i in I[0]:
        # FAISS pads missing results with -1 when the index holds fewer than
        # k vectors; texts[-1] would silently return the last passage.
        if i < 0:
            continue
        text = texts[i]
        # A blank CSV cell is read back as NaN (a float); skip it rather than
        # crash on the substring test below.
        if not isinstance(text, str):
            continue
        # If the text contains your custom YouTube separator, treat it as a link
        if " ‚Äî http" in text:
            links.append(text)
        else:
            context_chunks.append(text[:max_chars])

    return context_chunks, links


# =======================
# Core generation
# =======================
def generate_explanation(question, context, links):
    """Generate the tutor's reply for ``question`` from the retrieved ``context``.

    The system prompt is chosen from three modes: a study-schedule mode when
    the question contains a scheduling/verification trigger, a problem-solving
    mode when ``is_numerical_problem`` says so, and a general explanation mode
    otherwise. The prompt is rendered with the tokenizer's chat template,
    sampled from the LLM on the CPU, and only the newly generated tokens are
    decoded. Any ``links`` are appended as a bulleted resources section.

    Args:
        question: the user's question.
        context: retrieved passages already joined into one string.
        links: resource strings to list after the answer; may be empty.

    Returns:
        str: the stripped model output, followed by the resources section when
        ``links`` is non-empty.
    """
    # Mode 1: Schedule & Verification
    schedule_prompt = """You are a physics tutor.
        1. Ask the user 2 concept questions to verify their knowledge.
        2. Provide a 3-day study schedule based on the context.
        Format:
        ### 1. Verification Questions
        ### 2. 3-Day Schedule"""

    # Mode 2: Solving
    # NOTE(review): this prompt is specific to Work = Force * Distance although
    # the mode also triggers on velocity/acceleration/time questions - confirm.
    solver_prompt = """You are a strict Physics Tutor for beginners.
1. Use ONLY: Work = Force * Distance.
2. Units: Force in Newtons (N), Distance in Meters (m), Work in Joules (J).
3. DO NOT use time (s) in your calculation.
4. Calculate carefully:
5. Provide ONLY the requested sections.
        Solve the problem using this format.
        ### 1. Given
        ### 2. Formula
        ### 3. Solution
        ### 4. Final Answer """

    # Fallback: general explanation
    tutor_prompt = "You are a physics tutor. Provide a detailed Explanation and 4-7 bulleted Key Points."

    lowered = question.lower()
    wants_schedule = False
    for trigger in ("schedule", "plan", "test me", "verify", "3 day"):
        if trigger in lowered:
            wants_schedule = True
            break

    # The schedule check wins; the numerical check is only consulted after it.
    if wants_schedule:
        system_prompt = schedule_prompt
    elif is_numerical_problem(question):
        system_prompt = solver_prompt
    else:
        system_prompt = tutor_prompt

    # Prepare messages for Chat Template
    user_turn = f"Context:\n{context}\n\nQuestion:\n{question}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_turn},
    ]

    # Apply template and tokenize (tensors stay on the CPU, like the model)
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    prompt_len = inputs["input_ids"].shape[-1]

    outputs = llm.generate(
        **inputs,
        max_new_tokens=450,
        temperature=0.2,
        do_sample=True,  # Required to use temperature
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the new tokens (everything after the prompt)
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    # Mode 3: Add Links (Read/Watch section)
    if not links:
        return answer

    bullets = "\n".join(f"‚Ä¢ {link}" for link in links)
    return answer + "\n\n### 3. Study Resources (Watch & Read)\n" + bullets


def answer_question(question: str):
    """Answer ``question`` with retrieval-augmented generation.

    Five neighbours are requested so that both plain passages and link
    entries have a chance to appear. The passages are joined with blank lines
    and handed to ``generate_explanation`` together with the links.
    """
    passages, resources = retrieve(question, 5)
    return generate_explanation(question, "\n\n".join(passages), resources)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

In [11]:
import torch
import gc

def clean_memory():
    """Release memory that is no longer referenced.

    Runs the Python garbage collector first, then returns PyTorch's cached
    CUDA blocks to the driver. The order matters: tensors that are only
    reclaimed by the collector cannot be released from the cache before the
    collector has run. The CUDA step is skipped on machines without a GPU.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [13]:
print(answer_question(input()))



A person lifts a box weighing 100 up by 1.5. How much work is done? 
To answer the question, we need to understand the concept of mechanical work and its definition. Mechanical work is the amount of energy required to move an object from one position to another. In this case, the box is lifted by the person, and the work done is the change in kinetic energy of the box.

To calculate the work done, we need to know the force applied by the person, the distance traveled by the box, and the speed at which it was lifted. We can use the formula Œ¥ Œ¥ W = d ( p o u t V o u t ) ‚àí ‚àí d ( p i n V i n ) + Œ¥ Œ¥ W s h a f t to calculate the work done.

In this case, the force applied by the person is 100 N, the distance traveled by the box is 1.5 m, and the speed at which it was lifted is 1.5 m/s. The formula gives us:

Œ¥ Œ¥ W = 100 N √ó 1.5 m ‚àí 1.5 m + 100 N √ó 1.5 m = 1.782 661 92 √ó 1.5 m = 2.544 996 54 kg .

Therefore, the work done by the person on the box is 2.544 996 54 kg.


In [None]:
clean_memory()


In [None]:
# Launch with public URL
import gradio as gr  # <-- THIS WAS MISSING
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
def api_handler(prompt):
    """Gradio callback: answer ``prompt`` with the notebook's RAG pipeline.

    Args:
        prompt: text typed into the Gradio textbox; may be empty or ``None``.

    Returns:
        str: the answer from ``answer_question``, a short hint when the prompt
        is blank, or ``"Error: ..."`` when answering raised. Errors are
        returned as text so the UI shows them instead of failing the request.
    """
    if not prompt or not prompt.strip():
        return "Please enter a question."
    try:
        # The original called generate_text(), which is not defined anywhere in
        # this notebook, so every request came back as a NameError message.
        return answer_question(prompt)
    except Exception as e:
        return f"Error: {str(e)}"

# Minimal Gradio UI: one textbox in, one textbox out, both wired to api_handler.
# NOTE(review): the cell that installs gradio appears after this one in the
# notebook; it has to run first on a fresh kernel.
demo = gr.Interface(
    fn=api_handler,
    inputs=gr.Textbox(label="Your Prompt"),
    outputs=gr.Textbox(label="LLM Response"),
    title="My LLM API",
    description="Send prompts via HTTP to this endpoint"
)

# THIS CREATES THE PUBLIC URL
# share=True asks Gradio for a publicly reachable link, so anyone who has the
# URL can send prompts to this kernel while the cell is running.
demo.launch(share=True, debug=True)

In [None]:
!pip install -q gradio transformers torch accelerate

# ***Telegram bot (TG)***

In [None]:
from telegram import Update
from telegram.ext import (
    ApplicationBuilder,
    MessageHandler,
    ContextTypes,
    filters
)

# SECURITY: never commit the bot token. The value that used to be hard-coded
# on this line has been published with the notebook and must be revoked via
# @BotFather; supply the replacement through the environment instead.
import os

BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
if not BOT_TOKEN:
    # print() rather than warnings.warn(): this notebook silences all warnings.
    print("TELEGRAM_BOT_TOKEN is not set; define it before starting the bot.")


async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Answer one incoming Telegram text message with the RAG pipeline.

    Sends a "thinking" notice, computes the answer in a worker thread, and
    replies with it. If anything fails, a generic error reply is sent and the
    exception is printed. Memory is cleaned up afterwards in every case.

    Args:
        update: the incoming Telegram update.
        context: handler context supplied by python-telegram-bot (unused).
    """
    import asyncio

    # effective_message also covers edited messages and channel posts, for
    # which update.message is None.
    message = update.effective_message
    if message is None or not message.text:
        return
    user_text = message.text

    await message.reply_text("‚è≥ Thinking...")

    try:
        # LLM inference is slow and blocking; run it off the event loop so the
        # bot keeps polling while the answer is generated.
        answer = await asyncio.to_thread(answer_question, user_text)
        if not answer:
            raise ValueError("model returned an empty answer")
        # Telegram rejects messages longer than 4096 characters; send in parts.
        for start in range(0, len(answer), 4096):
            await message.reply_text(answer[start:start + 4096])
    except Exception as e:
        await message.reply_text("‚ö†Ô∏è Error occurred.")
        print(e)
    finally:
        # Runs even if sending the error reply itself failed.
        clean_memory()


async def main():
    """Build the Telegram application and poll for messages until cancelled.

    ``Application.run_polling()`` is a blocking, synchronous call that manages
    the event loop itself and returns ``None``, so it cannot be awaited from a
    coroutine. Inside an already-running loop (as in a notebook) the
    application is instead driven through its explicit async lifecycle:
    initialise, start, start polling, wait, then stop in reverse order.
    """
    import asyncio

    app = ApplicationBuilder().token(BOT_TOKEN).build()
    app.add_handler(
        MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message)
    )
    print("ü§ñ Physics bot is running...")

    # `async with` runs app.initialize() on entry and app.shutdown() on exit.
    async with app:
        await app.start()
        await app.updater.start_polling()
        try:
            # Park here until the task is cancelled (e.g. kernel interrupt).
            await asyncio.Event().wait()
        finally:
            await app.updater.stop()
            await app.stop()


# nest_asyncio patches the notebook's already-running event loop so that
# nested run_until_complete() calls are allowed.
# NOTE(review): the recorded run of this cell ended in
# "ModuleNotFoundError: No module named 'telegram'"; the cells that install
# python-telegram-bot and nest_asyncio appear further down and must run first.
import nest_asyncio
nest_asyncio.apply()

# Top-level await: only valid in IPython/Jupyter, where a loop is already running.
await main()



ModuleNotFoundError: No module named 'telegram'

In [None]:
!pip install -U python-telegram-bot==20.7


In [None]:
!pip install nest_asyncio
# Alternative, synchronous way to start the bot. It relies on names defined in
# the earlier Telegram cell: ApplicationBuilder, MessageHandler, filters,
# BOT_TOKEN and handle_message.
import nest_asyncio

# Allow run_polling() to drive the event loop that the notebook already runs.
nest_asyncio.apply()

# Same application wiring as main(): one handler for plain, non-command text.
app = ApplicationBuilder().token(BOT_TOKEN).build()
app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

print("Bot is running...")
# Called without await: the cell stays busy here while the bot is polling.
app.run_polling()