In [None]:
import getpass
import os
from datetime import datetime, timedelta

import pandas as pd


# Directory that receives the generated project description files.
# exist_ok=True keeps makedirs from raising when the directory already exists.
output_path = "/content/sample_data/doc_qa"
os.makedirs(output_path, exist_ok=True)

# Maps each project name suffix to (start date as "YYYY-MM-DD", description).
# The description refers to the project by the same name that is written to the
# "Project Name" line of the generated file ("Project <suffix>"), so a document
# never calls one project by two different names.
suffixes_and_descriptions = {
    "Alpha": (
        "2026-04-01",
        "Project Alpha develops new speaking assessment tasks that simulate real-life conversations and classroom interactions. "
        "It emphasizes cultural relevance, spontaneous speech prompts, and rubric alignment with WIDA standards. "
        "The project also investigates audio delivery options and scoring consistency for oral language features."
    ),
    "Beta": (
        "2026-07-15",
        "Project Beta focuses on accessibility evaluation and enhancement of test items for English learners with diverse needs. "
        "It includes reviewing digital formats, simplifying complex language, and embedding accommodations such as screen readers and translations. "
        "The project also partners with specialists to ensure universal design compliance."
    ),
    "Gamma": (
        "2026-10-01",
        "Project Gamma creates a centralized reporting and analytics framework for schools and districts using English language assessments. "
        "It includes data pipeline integration, real-time dashboard development, and stakeholder-focused report templates. "
        "The project aims to streamline decision-making and improve communication with parents and educators."
    )
}

# Work plan shared by every project, one row per task:
# (task name, members, first day offset, last day offset, resources).
# Offsets are counted in days from the project's start date.
TASK_PLAN = [
    ("Speaking Task Development", ["Alice", "Xin"], 0, 20, ["Speaking rubrics", "Audio scripts"]),
    ("Accessibility Review", ["Shujing"], 21, 40, ["Language guidelines", "Universal design checklist"]),
    ("Data Reporting Design", ["Charlie", "George"], 41, 65, ["Excel dashboards", "Power BI templates"]),
    ("Documentation and Handoff", ["Karen"], 66, 90, ["User manuals", "Finalized project files"]),
]
DATE_FORMAT = "%Y-%m-%d"
PROJECT_LENGTH_DAYS = 90

# Write one text file per project: project_1.txt, project_2.txt, ... in dict order.
for i, (suffix, (start_date_str, description)) in enumerate(suffixes_and_descriptions.items(), start=1):
    start_date = datetime.strptime(start_date_str, DATE_FORMAT)
    end_date = start_date + timedelta(days=PROJECT_LENGTH_DAYS)

    # Turn the day offsets of the shared plan into concrete dates for this project.
    breakdown = [
        {
            "task": task_name,
            "members": list(members),
            "start": (start_date + timedelta(days=first_day)).strftime(DATE_FORMAT),
            "end": (start_date + timedelta(days=last_day)).strftime(DATE_FORMAT),
            "resources": list(resources),
        }
        for task_name, members, first_day, last_day, resources in TASK_PLAN
    ]

    project = {
        "project_name": f"Project {suffix}",
        "manager": "Frank",
        "date_range": (start_date.strftime(DATE_FORMAT), end_date.strftime(DATE_FORMAT)),
        "description": description,
        "breakdown": breakdown
    }

    file_path = os.path.join(output_path, f"project_{i}.txt")
    # Explicit encoding so the file bytes do not depend on the platform's locale.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(f"Project Name: {project['project_name']}\n")
        f.write(f"Manager: {project['manager']}\n")
        f.write(f"Date Range: {project['date_range'][0]} to {project['date_range'][1]}\n")
        f.write(f"Description: {project['description']}\n\n")
        f.write("Project Breakdown:\n")
        for task in project["breakdown"]:
            f.write(f"- Task: {task['task']}\n")
            f.write(f"  Members: {', '.join(task['members'])}\n")
            f.write(f"  Date Range: {task['start']} to {task['end']}\n")
            f.write(f"  Resources Needed: {', '.join(task['resources'])}\n\n")


# Show the generated project files in a stable order. os.listdir returns entries
# in arbitrary order and would also list anything else stored in this directory,
# so restrict the listing to "project_*.txt" and sort it.
pd.DataFrame(
    sorted(
        name
        for name in os.listdir(output_path)
        if name.startswith("project_") and name.endswith(".txt")
    ),
    columns=["Project Files"],
)


Unnamed: 0,Project Files
0,project_3.txt
1,project_2.txt
2,project_1.txt


In [None]:
!pip install together langchain langchain_together langchain-community langchain-text-splitters faiss-cpu sentence-transformers --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/90.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load and combine documents from project files
project_dir = "/content/sample_data/doc_qa"
# Keep only regular "project_*.txt" files, in sorted order. The FAISS index that
# this cell saves to "<project_dir>/project_faiss_index" also matches the
# "project_" prefix, so a prefix-only filter would pass that directory to
# TextLoader when the cell is run again.
filenames = sorted(
    os.path.join(project_dir, f)
    for f in os.listdir(project_dir)
    if f.startswith("project_")
    and f.endswith(".txt")
    and os.path.isfile(os.path.join(project_dir, f))
)
documents = []

for file in filenames:
    loader = TextLoader(file)
    documents.extend(loader.load())

# Split the loaded documents into chunks of up to 1000 characters, with 100 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents)

# Embedding model: a small open-source sentence-transformers model (MiniLM)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Build the FAISS vector store from the chunks
vectorstore = FAISS.from_documents(docs, embedding_model)

# Persist the index to disk
db_path = "/content/sample_data/doc_qa/project_faiss_index"
vectorstore.save_local(db_path)

f"✅ Vectorstore created and saved at: {db_path}"


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

'✅ Vectorstore created and saved at: /content/sample_data/doc_qa/project_faiss_index'

In [None]:
# Report how many vectors the FAISS index holds and how many dimensions each has.
total_vectors, dimensions = vectorstore.index.ntotal, vectorstore.index.d
print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vector store")

There are 6 vectors with 384 dimensions in the vector store


In [None]:
from together import Together
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_together import Together

# Read the Together API key from the environment instead of hardcoding it in
# the notebook; when TOGETHER_API_KEY is not set, prompt for it without echo.
together_api_key = os.environ.get("TOGETHER_API_KEY") or getpass.getpass("Together API key: ")

# `Together` is the LangChain wrapper here: the langchain_together import above
# is the last one to bind the name.
llm = Together(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    api_key=together_api_key,
    temperature=0.1,
    max_tokens=512
)

# Prompt template for RAG, with {context} and {question} placeholders
template = """You are a helpful assistant at the Center for Applied Linguistics.
Use the following context to answer the user's question.

Context:
{context}

Question: {question}
Answer:"""

rag_prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# RetrievalQA chain: the vector store retriever feeds the LLM through the custom prompt
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": rag_prompt},
)



In [None]:
query = "What is Project Beta about, its timeframe and who is responsible for item writing?"
# Chain.run is deprecated in LangChain in favor of invoke. RetrievalQA takes the
# question under the "query" key and returns the answer under "result".
response = rag_chain.invoke({"query": query})["result"]

print(response)


  response = rag_chain.run(query)


 Project Beta focuses on accessibility evaluation and enhancement of test items for English learners with diverse needs. It runs from 2026-07-15 to 2026-10-13. There is no specific mention of "item writing" in the context provided, but the task closest to this description is "Speaking Task Development," which is handled by Alice and Xin.
