In [21]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment
# (the .env file must never be committed).
load_dotenv()

from groq import Groq

# Stop right away when the API key is missing, before any client is built.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
assert GROQ_API_KEY, "環境變數 GROQ_API_KEY 未設定。請先在 .env 或 os.environ 設定。"

# Groq client shared by the cells below.
client = Groq(api_key=GROQ_API_KEY)

from sentence_transformers import SentenceTransformer
import faiss
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [22]:
# Put your PDF under the repo's data/ directory, e.g. data/guide.pdf
PDF_PATH = "../data/Real-Time Sign Language Detection using LSTM.pdf"  # <- replace with your file name
reader = PdfReader(PDF_PATH)

# Extract text page by page. Extraction stays best-effort: a page that raises
# (or yields no text) contributes an empty string so the page order is kept,
# but every failure is reported instead of being silently swallowed.
pages = []
for page_no, page in enumerate(reader.pages, start=1):
    try:
        pages.append(page.extract_text() or "")
    except Exception as exc:
        pages.append("")
        print(f"⚠️ Page {page_no}: text extraction failed ({type(exc).__name__}: {exc})")

raw_text = "\n".join(pages).strip()
if not raw_text:
    # Nothing usable came out of the PDF; the chunking/embedding cells below
    # cannot work with an empty document, so make that visible here.
    print("⚠️ No text could be extracted from", PDF_PATH)
print("Chars:", len(raw_text))
print(raw_text[:1000])


Chars: 23082
Real-Time  Sign  Language  Detection  using  LSTM  
 Chung-Hao  Tuan  School  of  Computer  Science  Oregon  State  University,  Corvallis,  OR  USA  tuanc@oregonstate.edu  
Yun-Hsuan  Chan  School  of  Computer  Science  Oregon  State  University,  Corvallis,  OR  USA  chanyun@oregonstate.edu  
Fen-Yun  Huang  School  of  Computer  Science  Oregon  State  University,  Corvallis,  OR  USA  huanfeny@oregonstate.edu   
 
Abstract
 
    This  paper  proposes  a  real-time  sign  language  detection  system  utilizing  Long  Short-Term  Memory  (LSTM)  networks  combined  with  keypoint-based  feature  extraction.  The  system  leverages  MediaPipe  Holistic  for  extracting  skeletal  landmarks  from  hand,  face,  and  pose  keypoints.  Compared  to  conventional  approaches  like  Hidden  Markov  Models  (HMMs)  and  Convolutional  Neural  Networks  (CNNs),  LSTM  effectively  captures  temporal  dependencies  required  for  recognizing  continuous  gestures.  We  collected

In [23]:
# Split the extracted text into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # tune to the document length (roughly 500-1200)
    chunk_overlap=120,   # overlap so text cut at a chunk boundary is not lost
    # Candidate split points: paragraph break, line break, CJK full stop,
    # CJK comma, then a plain space.
    separators=["\n\n", "\n", "。", "，", " "]
)
chunks = splitter.split_text(raw_text)
if not chunks:
    # Without this guard the preview below dies with a bare IndexError and
    # gives no hint that the real problem is an empty document.
    raise ValueError(
        "No chunks were produced: raw_text is empty "
        "(is the PDF scanned/image-only, or did text extraction fail?)"
    )
print("Chunks:", len(chunks))
print(chunks[0][:300])


Chunks: 32
Real-Time  Sign  Language  Detection  using  LSTM  
 Chung-Hao  Tuan  School  of  Computer  Science  Oregon  State  University,  Corvallis,  OR  USA  tuanc@oregonstate.edu  
Yun-Hsuan  Chan  School  of  Computer  Science  Oregon  State  University,  Corvallis,  OR  USA  chanyun@oregonstate.edu  
Fen


In [24]:
import numpy as np

# Sentence-embedding model used to vectorise the chunks.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(embed_model_name)

# Encode every chunk in one call, asking the encoder for normalised vectors.
embeddings = embedder.encode(
    chunks,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Keep a float32 array of the vectors; its second axis is the embedding size.
emb = np.array(embeddings).astype("float32")
dim = emb.shape[1]
print(f"Embedding shape: {emb.shape} dim: {dim}")


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.33it/s]

Embedding shape: (32, 384) dim: 384





In [25]:
# Inner-product index: with normalised vectors, inner product is equivalent
# to cosine similarity.
index = faiss.IndexFlatIP(dim)
# Add every chunk vector to the index.
index.add(emb)
print(f"Indexed vectors: {index.ntotal}")


Indexed vectors: 32


In [26]:
# Persist the artifacts so a later notebook (Day4) can load them directly.
import json

DATA_DIR = "../data"
CHUNKS_PATH = DATA_DIR + "/chunks.json"
EMB_PATH = DATA_DIR + "/embeddings.npy"
INDEX_PATH = DATA_DIR + "/index.faiss"

# Chunk texts as UTF-8 JSON; non-ASCII characters are written unescaped.
with open(CHUNKS_PATH, "w", encoding="utf-8") as chunks_file:
    chunks_file.write(json.dumps(chunks, ensure_ascii=False))

# Embedding matrix and the FAISS index built from it.
np.save(EMB_PATH, emb)
faiss.write_index(index, INDEX_PATH)

print("✅ Saved chunks, embeddings, and index for reuse")

✅ Saved chunks, embeddings, and index for reuse


In [27]:
# ——— Load the persisted Day2 artifacts (if present)
import json
from pathlib import Path

import faiss

have_persist = all(Path(p).exists() for p in (CHUNKS_PATH, EMB_PATH, INDEX_PATH))

if have_persist:
    with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
        chunks = json.load(f)
    emb = np.load(EMB_PATH).astype("float32")
    dim = emb.shape[1]
    index = faiss.read_index(INDEX_PATH)

    # The three files are written separately, so they can drift apart (for
    # example after re-chunking without re-saving everything). A mismatch
    # would map search hits to the wrong chunk text, so fail loudly here.
    if not (len(chunks) == emb.shape[0] == index.ntotal) or index.d != dim:
        raise ValueError(
            "Persisted artifacts are inconsistent: "
            f"{len(chunks)} chunks, embeddings {emb.shape}, "
            f"index with {index.ntotal} vectors of dim {index.d}. "
            "Re-run the Day2 build cells to regenerate them."
        )
    print("Loaded persisted index:", index.ntotal, "vectors")
else:
    # Best-effort: only warn, so the notebook can still be used to rebuild.
    print("⚠️ 找不到持久化索引；請先執行 Day2 建庫流程（或在本 Notebook 開頭複製 Day2 的建庫 cells）")

# Groq client (requires GROQ_API_KEY in .env or the environment)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
assert GROQ_API_KEY, "請先在 .env 或環境變數設定 GROQ_API_KEY"
client = Groq(api_key=GROQ_API_KEY)

Loaded persisted index: 32 vectors


In [28]:
# Exploratory check: list the files in the notebook's working directory.
# NOTE(review): the recorded output shows a huggingface/tokenizers warning
# ("process just got forked ... Disabling parallelism") when this runs after
# the embedder was used; the warning itself suggests setting
# TOKENIZERS_PARALLELISM explicitly to avoid it.
%ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


day1_running_llms.ipynb        day3_conversational_rag.ipynb
day2_vector_storage.ipynb


In [None]:
# Basics
import os
import numpy as np

# FAISS + Embedding
import faiss
from sentence_transformers import SentenceTransformer

# Groq API
from groq import Groq

# Load the data you built in Day2 (chunks, emb, index, embedder, client)
# NOTE(review): these lines import project helpers rather than the data
# objects themselves — presumably the helpers operate on those Day2 objects;
# confirm against src/rag.
# NOTE(review): the recorded run of this cell failed with ImportError because
# `summarize_history_with_groq` could not be imported from src/rag/memory.py;
# since the statement aborts as a whole, `ConversationMemory` is not bound
# either. Confirm the function name or add it to that module.
from src.rag.retriever import search_with_meta
from src.rag.rag_pipeline import rag_answer
from src.rag.memory import ConversationMemory, summarize_history_with_groq


ImportError: cannot import name 'summarize_history_with_groq' from 'src.rag.memory' (/Users/duan/llm-engineer-roadmap/src/rag/memory.py)

In [30]:
def _ask_and_show(turn, question):
    """Send one question through rag_chat, print its answer and the indices
    of the sources it used (labelled with the turn number), and return the
    full response."""
    reply = rag_chat(question)
    print(f"A{turn}:", reply["answer"])
    print(f"SRC{turn}:", [src["idx"] for src in reply["sources"]])
    return reply


# Three consecutive turns, asked in order.
r1 = _ask_and_show(1, "What methodology does the paper use?")

r2 = _ask_and_show(2, "And how does it do Keypoint Extraction specifically?")

r3 = _ask_and_show(3, "Summarize the method in 2 bullet points.")

NameError: name 'embedder' is not defined