# Loading Data

In [1]:
import os
from docx import Document
from PyPDF2 import PdfReader
import pandas as pd

# Loading .doc files
def read_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text).strip()

# Loading .pdf files
def read_pdf(file_path):
    reader = PdfReader(file_path)
    text_by_page = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            text_by_page.append((i + 1, text.strip()))
    return text_by_page  # list of (page_number, text)

# Loading .xls files
def read_excel(file_path):
    excel_data = pd.read_excel(file_path, sheet_name=None) 
    text = []
    for sheet, df in excel_data.items():
        text.append(f'--- Sheet: {sheet} ---\n')
        text.append(df.to_string(index=False))
    return '\n'.join(text).strip()

# Loading .csv files
def read_csv(file_path):
    df = pd.read_csv(file_path)
    return df.to_string(index=False).strip()

# Extracting Text in chunks from the files
def extract_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.docx':
        return {'filename': os.path.basename(file_path), 'text': read_docx(file_path)}
    elif ext == '.pdf':
        pages = read_pdf(file_path)
        return [{'filename': os.path.basename(file_path), 'page': p, 'text': t} for p, t in pages]
    elif ext in ['.xlsx', '.xls', '.xlsm']:
        return {'filename': os.path.basename(file_path), 'text': read_excel(file_path)}
    elif ext == '.csv':
        return {'filename': os.path.basename(file_path), 'text': read_csv(file_path)}
    else:
        print(f"Unsupported file type: {ext}")
        return None

folder_path = 'C:/Users/User/Downloads/Dr.X Files/Dataset'
all_text_chunks = []

for fname in os.listdir(folder_path):
    fpath = os.path.join(folder_path, fname)
    result = extract_text(fpath)
    
    if isinstance(result, list): 
        all_text_chunks.extend(result)
    elif result:
        all_text_chunks.append(result)


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
  warn(f"Print area cannot be set to Defined name: {defn.value}.")


# Tokenization

In [2]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")

In [3]:
def chunk_text(text, max_tokens=300, overlap=50):
    tokens = tokenizer.encode(text)
    chunks = []

    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i:i+max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append(chunk_text.strip())
        i += max_tokens - overlap  # Slide window
    return chunks

def chunk_documents(parsed_docs, max_tokens=300):
    chunked_data = []

    for doc in parsed_docs:
        filename = doc['filename']
        page = doc.get('page', None)
        text = doc['text']

        chunks = chunk_text(text, max_tokens=max_tokens)
        for idx, chunk in enumerate(chunks):
            chunked_data.append({
                'filename': filename,
                'page': page,
                'chunk_number': idx + 1,
                'text': chunk
            })

    return chunked_data

chunked_docs = chunk_documents(all_text_chunks, max_tokens=300)

# Printed sample to check chunked doc
for entry in chunked_docs[:2]:
    print(f"File: {entry['filename']}, Page: {entry.get('page', 'N/A')}, Chunk: {entry['chunk_number']}")
    print(entry['text'][:200], '...\n')


File: Dataset summaries and citations.docx, Page: None, Chunk: 1
Table 1. Description of studies included in the meta-analysis. Full article citations are listed after the table.






 
Citation List
Acuña E., A. A., Pastenes V., C., & Villalobos G., L. (2017). Ca ...

File: Dataset summaries and citations.docx, Page: None, Chunk: 2
https://doi.org/10.3390/f5030425
Carley, D. S., Goodman, D., Sermons, S., Shi, W., Bowman, D., Miller, G., & Rufty, T. (2011). Soil Organic Matter Accumulation in Creeping Bentgrass Greens: A Chronose ...



# Vector Database

![alt text](image.png)

In [4]:
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
def embed_texts_local(texts):
    return embed_model.encode(texts, convert_to_numpy=True)

texts = [doc['text'] for doc in chunked_docs]
embeddings = embed_texts_local(texts)

  from .autonotebook import tqdm as notebook_tqdm





In [5]:
import faiss
import numpy as np

def build_faiss_index(chunks, dim=384):  # 384 is for MiniLM, we can adjust it for other models
    texts = [chunk['text'] for chunk in chunks]
    embeddings = embed_texts_local(texts)
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings).astype('float32'))
    metadata = [
        {
            'filename': chunk['filename'],
            'page': chunk.get('page', 'N/A'),
            'chunk_number': chunk['chunk_number'],
            'text': chunk['text']
        }
        for chunk in chunks
    ]
    return index, embeddings, metadata

In [6]:
import pickle

def save_index(index, metadata, path='Database/vector_index'):
    faiss.write_index(index, f'{path}.index')
    with open(f'{path}_meta.pkl', 'wb') as f:
        pickle.dump(metadata, f)

In [7]:
def load_index(path='Database/vector_index'):
    index = faiss.read_index(f'{path}.index')
    with open(f'{path}_meta.pkl', 'rb') as f:
        metadata = pickle.load(f)
    return index, metadata

In [8]:
index, embeddings, metadata = build_faiss_index(chunked_docs)
save_index(index, metadata)

In [9]:
faiss.write_index(index, 'Database/index.index')
with open('Database/index_meta.pkl', 'wb') as f:
    pickle.dump(metadata, f)

In [10]:
def search_index(query, index, metadata, top_k=5):
    query_embedding = embed_texts_local([query])[0]
    D, I = index.search(np.array([query_embedding]).astype('float32'), top_k)

    results = []
    for idx in I[0]:
        results.append(metadata[idx])
    return results

# RAG Q&A System

In [11]:
def build_prompt(question, chunks):
    context = "\n\n".join([f"Source [{c['filename']}, page {c.get('page', 'N/A')}, chunk {c['chunk_number']}]:\n{c['text']}" for c in chunks])
    prompt = f"""You are an intelligent assistant helping with scientific research.

Answer the following question using only the information provided in the sources below.

Question: {question}

Sources:
{context}

Answer:"""
    return prompt


In [12]:
from llama_cpp import Llama
# Adjust path to your GGUF model
llm = Llama(model_path="llama-2-7b-chat.Q2_K.gguf", n_ctx=2048, n_threads=8)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32

In [13]:
def answer_question(question, index, metadata, top_k=5):
    top_chunks = search_index(question, index, metadata, top_k=top_k)
    prompt = build_prompt(question, top_chunks)

    output = llm(prompt, max_tokens=300, stop=["User:", "Question:"], echo=False)
    return output['choices'][0]['text'].strip()

In [14]:
index, metadata = load_index()
query = "What is the total photography budget in a party?"
response = answer_question(query, index, metadata)
print("Answer:", response)

llama_perf_context_print:        load time =  191328.92 ms
llama_perf_context_print: prompt eval time =  191327.12 ms /  1834 tokens (  104.32 ms per token,     9.59 tokens per second)
llama_perf_context_print:        eval time =    2994.34 ms /    16 runs   (  187.15 ms per token,     5.34 tokens per second)
llama_perf_context_print:       total time =  194332.52 ms /  1850 tokens


Answer: The total photography budget in a party is $2950.


In [15]:
index, metadata = load_index()
query = "What are the categories of amino acids with regard to carbon metabolism?"
response = answer_question(query, index, metadata)
print("Answer:", response)

Llama.generate: 33 prefix-match hit, remaining 1727 prompt tokens to eval
llama_perf_context_print:        load time =  191328.92 ms
llama_perf_context_print: prompt eval time =  185752.89 ms /  1727 tokens (  107.56 ms per token,     9.30 tokens per second)
llama_perf_context_print:        eval time =   24138.57 ms /   132 runs   (  182.87 ms per token,     5.47 tokens per second)
llama_perf_context_print:       total time =  209996.09 ms /  1859 tokens


Answer: Amino acids can be classified into two categories with regard to carbon metabolism: essential and non-essential. Essential amino acids are those that cannot be synthesized by an organism from materials normally available to the cells at a speed adequate with the demands for normal growth, while non-essential amino acids are those that can be synthesized from a variety of precursors in the Krebs cycle and other metabolic pathways. The categories are based on the degree of δ13C fractionation between the diet and consumer during nitrogen metabolism.


# Translators

In [16]:
from transformers import MarianMTModel, MarianTokenizer
from langdetect import detect

def load_translation_model(src_lang='ar', tgt_lang='en'):
    model_name = f"opus-mt-{src_lang}-{tgt_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return model, tokenizer

def translate_text(text, model, tokenizer):
    batch = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
    generated = model.generate(**batch)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

def translate_chunks(chunks, model, tokenizer):
    translated_chunks = []
    for c in chunks:
        translated_text = translate_text(c['text'], model, tokenizer)
        translated_chunks.append({**c, 'translated_text': translated_text})
    return translated_chunks

def translate_document_chunks_auto(chunks):
    results = []
    for chunk in chunks:
        result = auto_translate(chunk['text'])
        results.append({**chunk, 'translated_text': result['translated_text']})
    return results

# For auto language detection
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"
    
def load_model_by_lang(lang_code):
    # Arabic → English
    if lang_code == 'ar':
        model_name = "opus-mt-ar-en"
    # English → Arabic
    elif lang_code == 'en':
        model_name = "opus-mt-en-ar"
    else:
        raise ValueError(f"Unsupported language: {lang_code}")
    
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer
    
def auto_translate(text):
    lang = detect_language(text)
    
    if lang == 'ar':
        target_lang = 'en'
    elif lang == 'en':
        target_lang = 'ar'
    else:
        raise ValueError(f"Language {lang} not supported for auto-translation.")

    model, tokenizer = load_model_by_lang(lang)

    batch = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
    generated = model.generate(**batch)
    translated = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    return {
        'original_lang': lang,
        'target_lang': target_lang,
        'translated_text': translated
    }

In [17]:
# Testing Translators
test1 = "Dr. X was a well-known researcher."
test2 = "الدكتور إكس كان باحثًا مشهورًا."

print(auto_translate(test1))  
print(auto_translate(test2))  

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



{'original_lang': 'en', 'target_lang': 'ar', 'translated_text': 'د. (إكس) كان باحثاً مشهوراً'}
{'original_lang': 'ar', 'target_lang': 'en', 'translated_text': 'Dr. X was a famous researcher.'}


# ROUGE metric

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [19]:
from transformers import pipeline
from tqdm import tqdm  
from collections import defaultdict

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def summarize_text_t5(text, max_length=130, min_length=30):
    prompt = "summarize: " + text
    return summarizer(prompt, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']

def summarize_document_chunks(chunks):
    summarized = []

    for chunk in tqdm(chunks, desc="Summarizing chunks"):
        text = chunk['text']

        try:
            summary = summarize_text_t5(text)
        except Exception as e:
            summary = "[Summary failed]"
            print(f"Error on chunk {chunk['chunk_number']}: {e}")

        summarized.append({
            **chunk,
            'summary': summary
        })

    return summarized

def group_summaries_by_file(summarized_chunks):
    file_summary_map = defaultdict(list)
    for chunk in summarized_chunks:
        file_summary_map[chunk['filename']].append(chunk['summary'])
    return file_summary_map

def generate_super_summary(summaries_list, max_length=150, min_length=40):
    combined_summary = " ".join(summaries_list)
    return summarize_text_t5(combined_summary, max_length=max_length, min_length=min_length)

def generate_super_summaries_per_file(summarized_chunks):
    file_summaries = group_summaries_by_file(summarized_chunks)
    super_summaries = {}

    for file, summaries in tqdm(file_summaries.items(), desc="Generating super summaries"):
        try:
            super_summary = generate_super_summary(summaries)
        except Exception as e:
            print(f"Error summarizing file {file}: {e}")
            super_summary = "[Super-summary failed]"
        
        super_summaries[file] = super_summary

    return super_summaries

Device set to use cuda:0


In [20]:
long_text = "Dr. X was a mysterious scientist who disappeared after publishing numerous groundbreaking papers..."
summary = summarize_text_t5(long_text)
print("Summary:\n", summary)

Your max_length is set to 130, but your input_length is only 22. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Summary:
 X was a mysterious scientist who disappeared after publishing numerous groundbreaking papers . he was the subject of a series of groundbreaking papers that have been published .


In [21]:
summarized_chunks = summarize_document_chunks(chunked_docs)

# Print a few samples
for s in summarized_chunks[:3]:
    print(f"\nFile: {s['filename']} | Page: {s.get('page')} | Chunk: {s['chunk_number']}")
    print("🔹 Original:\n", s['text'][:200], "...")
    print("Summary:\n", s['summary'], "\n" + "-"*50)

Summarizing chunks:   1%|          | 7/783 [00:08<15:38,  1.21s/it]Your max_length is set to 130, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Summarizing chunks:   1%|          | 9/783 [00:10<14:29,  1.12s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Summarizing chunks:   2%|▏         | 12/783 [00:16<20:54,  1.63s/it]Your max_length is set to 130, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Summarizing chunks:   4%|▎         | 28/783 [00:47<22:21,  1.78s/it]Your max_length is set to 130, but your input_length is only 31. Since this is a summarization task, where outputs shorter 


File: Dataset summaries and citations.docx | Page: None | Chunk: 1
🔹 Original:
 Table 1. Description of studies included in the meta-analysis. Full article citations are listed after the table.






 
Citation List
Acuña E., A. A., Pastenes V., C., & Villalobos G., L. (2017). Ca ...
Summary:
 full article citations are listed after the table . Acua E., A. A., Pastenes V., C., & Villalobos G. (2018). carbon sequestration and photosynthesis in newly established Turfgrass Cover in central Chile . 
--------------------------------------------------

File: Dataset summaries and citations.docx | Page: None | Chunk: 2
🔹 Original:
 https://doi.org/10.3390/f5030425
Carley, D. S., Goodman, D., Sermons, S., Shi, W., Bowman, D., Miller, G., & Rufty, T. (2011). Soil Organic Matter Accumulation in Creeping Bentgrass Greens: A Chronose ...
Summary:
 soil organic matter Accumulation in Creeping Bentgrass Greens: a Chronosequence with Implications for management and carbon sequestration . Agronomy Jo




In [None]:
super_summaries = generate_super_summaries_per_file(summarized_chunks)

for filename, summary in super_summaries.items():
    print(f"\n📁 {filename}")
    print("🧠 Super-Summary:\n", summary)

Generating super summaries:  20%|██        | 2/10 [00:02<00:10,  1.27s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1421 > 512). Running this sequence through the model will result in indexing errors
Generating super summaries:  50%|█████     | 5/10 [00:17<00:21,  4.38s/it]

In [None]:
all_summaries = [chunk['summary'] for chunk in summarized_chunks]
mega_summary = generate_super_summary(all_summaries)

print("🌐 Final Mega-Summary Across All Docs:\n", mega_summary)

In [None]:
import json
import pandas as pd

# Saving
with open("summarized_chunks.json", "w", encoding="utf-8") as f:
    json.dump(summarized_chunks, f, ensure_ascii=False, indent=2)

# Loading
df = pd.DataFrame(summarized_chunks)
df.to_csv("summarized_chunks.csv", index=False)


In [None]:
df

Unnamed: 0,filename,page,chunk_number,text,summary
0,Dataset summaries and citations.docx,,1,Table 1. Description of studies included in th...,full article citations are listed after the ta...
1,Dataset summaries and citations.docx,,2,"https://doi.org/10.3390/f5030425\nCarley, D. S...",soil organic matter Accumulation in Creeping B...
2,Dataset summaries and citations.docx,,3,"bock, Texas. Agronomy Journal, 112(1), 148–157...",urbanization increases Grassland carbon pools:...
3,Dataset summaries and citations.docx,,4,"20(1), 87–96. https://doi.org/10.1007/s11252-0...","urban ecosystems, 17(1), 205–219. https://doi...."
4,Dataset summaries and citations.docx,,5,2010). Soil Organic Carbon Input from Urban Tu...,Soil Organic Carbon Input from Urban Turfgrass...
...,...,...,...,...,...
778,The_Plan_of_the_Giza_Pyramids.pdf,15.0,3,"successors, and the mathematical nature of th...",the real scientist will recognise the impossib...
779,The_Plan_of_the_Giza_Pyramids.pdf,16.0,1,"The Plan of the Giza Pyramids 16 \nIf, as the...",the plan of the Giza pyramids 16 is the bed-ro...
780,The_Plan_of_the_Giza_Pyramids.pdf,16.0,2,masonry of the Great Pyramid in no way invalid...,king who wished to claim ownership of the monu...
781,The_Plan_of_the_Giza_Pyramids.pdf,16.0,3,concerning the delivery of stones to the site ...,leading Egyptianologists have long recognised ...


In [None]:
import os
import faiss
import json
import numpy as np
import gradio as gr
from tqdm import tqdm
from langdetect import detect
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    MarianMTModel,
    MarianTokenizer,
    pipeline
)
from sentence_transformers import SentenceTransformer
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document
import tiktoken

from llama_cpp import Llama

# Load your local LLaMA model (adjust path & params)
llama_model = Llama(
    model_path="llama-2-7b-chat.Q2_K.gguf",  # path to your .gguf file
    n_ctx=2048,
    n_threads=8  # adjust based on your CPU
)


class DrXResearchAssistant:
    def __init__(self):
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.embed_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.index = None
        self.metadata = []
        self.chunked_docs = []
        self.translation_models = {}

        # Summarization model
        model_path = "google-t5/t5-small"
        self.summarizer = pipeline(
    "summarization",
    model=model_path,
    tokenizer=model_path,
    # device=-1  # 👈 this ensures it runs on CPU
)


    # ---------- FILE READING ----------
    def read_pdf(self, file_path):
        reader = PdfReader(file_path)
        return [{'filename': os.path.basename(file_path), 'page': i + 1, 'text': p.extract_text()} for i, p in enumerate(reader.pages) if p.extract_text()]

    def read_docx(self, file_path):
        doc = Document(file_path)
        text = '\n'.join([p.text for p in doc.paragraphs])
        return [{'filename': os.path.basename(file_path), 'text': text}]

    def read_excel(self, file_path):
        data = pd.read_excel(file_path, sheet_name=None)
        text = '\n'.join([df.to_string(index=False) for df in data.values()])
        return [{'filename': os.path.basename(file_path), 'text': text}]

    def read_csv(self, file_path):
        df = pd.read_csv(file_path)
        return [{'filename': os.path.basename(file_path), 'text': df.to_string(index=False)}]

    def extract_text(self, file_path):
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.pdf':
            return self.read_pdf(file_path)
        elif ext == '.docx':
            return self.read_docx(file_path)
        elif ext in ['.xlsx', '.xls', '.xlsm']:
            return self.read_excel(file_path)
        elif ext == '.csv':
            return self.read_csv(file_path)
        return []

    # ---------- CHUNKING ----------
    def chunk_text(self, text, max_tokens=300, overlap=50):
        tokens = self.tokenizer.encode(text)
        chunks = []
        i = 0
        while i < len(tokens):
            chunk = self.tokenizer.decode(tokens[i:i + max_tokens])
            chunks.append(chunk.strip())
            i += max_tokens - overlap
        return chunks

    def chunk_documents(self, parsed_docs):
        self.chunked_docs.clear()
        for doc in parsed_docs:
            text = doc['text']
            chunks = self.chunk_text(text)
            for idx, chunk in enumerate(chunks):
                self.chunked_docs.append({
                    'filename': doc['filename'],
                    'page': doc.get('page', 'N/A'),
                    'chunk_number': idx + 1,
                    'text': chunk
                })

    # ---------- EMBEDDINGS + INDEX ----------
    def embed_chunks(self):
        texts = [c['text'] for c in self.chunked_docs]
        embeddings = self.embed_model.encode(texts, convert_to_numpy=True)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(np.array(embeddings).astype('float32'))
        self.metadata = self.chunked_docs.copy()

    def search_index(self, query, top_k=5):
        q_embed = self.embed_model.encode([query], convert_to_numpy=True)
        D, I = self.index.search(np.array(q_embed).astype('float32'), top_k)
        return [self.metadata[i] for i in I[0]]

    # ---------- Q&A ----------
    def build_prompt(self, question, chunks):
        context = "\n\n".join([f"[{c['filename']} - page {c.get('page')}]:\n{c['text']}" for c in chunks])
        return f"""You are an assistant helping analyze research papers.

Question: {question}

Sources:
{context}

Answer:"""

    def answer_question(self, question):
        if not self.index:
            return "Please upload and process documents first."

        top_chunks = self.search_index(question, top_k=5)
        prompt = self.build_prompt(question, top_chunks)

        # Actual LLaMA response
        output = llama_model(prompt, max_tokens=300, stop=["Question:", "User:"], echo=False)
        return output['choices'][0]['text'].strip()


    # ---------- TRANSLATION ----------
    def detect_lang(self, text):
        try:
            return detect(text)
        except:
            return 'unknown'

    def load_translation_model(self, src, tgt):
        key = f"{src}-{tgt}"
        if key not in self.translation_models:
            model_name = f"opus-mt-{src}-{tgt}"
            model = MarianMTModel.from_pretrained(model_name)
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            self.translation_models[key] = (model, tokenizer)
        return self.translation_models[key]

    def auto_translate(self, text):
        lang = self.detect_lang(text)
        if lang == 'ar':
            src, tgt = 'ar', 'en'
        elif lang == 'en':
            src, tgt = 'en', 'ar'
        else:
            return f"Unsupported language detected: {lang}"

        model, tokenizer = self.load_translation_model(src, tgt)
        inputs = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt")
        output = model.generate(**inputs)
        translated = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        return f"[{src} → {tgt}]\n{translated}"

    # ---------- SUMMARIZATION ----------
    def summarize_chunk(self, text, max_length=100, min_length=30):
        prompt = "summarize: " + text
        return self.summarizer(prompt, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']

    def summarize_chunks(self):
        summaries = []
        for chunk in tqdm(self.chunked_docs, desc="Summarizing"):
            try:
                s = self.summarize_chunk(chunk['text'])
            except:
                s = "[Failed to summarize]"
            summaries.append(s)
        return summaries

    def generate_super_summary(self):
        if not self.chunked_docs:
            return "No data to summarize."
        summaries = self.summarize_chunks()
        return self.summarize_chunk(" ".join(summaries), max_length=150, min_length=50)


# ---------- GRADIO UI ----------
assistant = DrXResearchAssistant()

with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dr. X Research Assistant")
    gr.Markdown("Upload research files, ask questions, translate, and summarize — all offline.")

    with gr.Row():
        file_input = gr.File(file_types=[".pdf", ".docx", ".xlsx", ".xls", ".csv"], file_count="multiple", label="📁 Upload Files")
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox(label="Upload Log")

    with gr.Tab("❓ Ask a Question"):
        q_input = gr.Textbox(label="Enter your question")
        q_btn = gr.Button("Answer")
        q_output = gr.Textbox(label="Answer")

    with gr.Tab("🌍 Translate"):
        t_input = gr.Textbox(label="Enter text to translate")
        t_btn = gr.Button("Translate")
        t_output = gr.Textbox(label="Translated Text")

    with gr.Tab("📝 Super Summary"):
        s_btn = gr.Button("Generate Super Summary")
        s_output = gr.Textbox(label="Summary", lines=10)

    def process_files(files):
        all_chunks = []
        for f in files:
            chunks = assistant.extract_text(f.name)
            all_chunks.extend(chunks)
        assistant.chunk_documents(all_chunks)
        assistant.embed_chunks()
        return f"✅ Processed {len(assistant.chunked_docs)} chunks from {len(files)} file(s)."

    upload_btn.click(fn=process_files, inputs=[file_input], outputs=[upload_output])
    q_btn.click(fn=assistant.answer_question, inputs=[q_input], outputs=[q_output])
    t_btn.click(fn=assistant.auto_translate, inputs=[t_input], outputs=[t_output])
    s_btn.click(fn=assistant.generate_super_summary, outputs=[s_output])

demo.launch()


  from .autonotebook import tqdm as notebook_tqdm





llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
  warn(f"Print area cannot be set to Defined name: {defn.value}.")
Summarizing:   1%|          | 7/783 [00:09<16:20,  1.26s/it]Your max_length is set to 100, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Summarizing:   1%|▏         | 10/783 [00:15<22:00,  1.71s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Summarizing:   2%|▏         | 12/783 [00:18<22:42,  1.77s/it]Your max_length is set to 100, but your input_length is only 63. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
