In [9]:
# Step 0: Install Dependencies (only once)
!pip install gradio pymupdf faiss-cpu sentence-transformers transformers bertopic umap-learn hdbscan



In [15]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     -------------------------------------- 981.5/981.5 kB 2.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993251 sha256=3cb4b3c588d01a8279bab4eef31bade5cf8e454202f902a5ebe01d66239112c3
  Stored in directory: c:\users\intel\appdata\local\pip\cache\wheels\c1\67\88\e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [10]:
# Step 1: Imports & Paths Setup
import os
import sys
import fitz
import faiss
import numpy as np
import pandas as pd
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from transformers import MarianMTModel, MarianTokenizer

# Set base directory.
# The project root can be overridden with the RAG_BASE_DIR environment
# variable; when it is not set, the original local path is used.
BASE_DIR = os.environ.get("RAG_BASE_DIR", r"C:\Users\intel\Desktop\draft RAG")

# Make the project's "app" folder importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
_APP_DIR = os.path.join(BASE_DIR, "app")
if _APP_DIR not in sys.path:
    sys.path.append(_APP_DIR)

# Import citation generator
from citation_utils import generate_apa_citation

In [11]:
# Step 2: Load FAISS Index, Data & Models

# Paper metadata: read the CSV, replace missing cells with empty strings, then
# derive a single text column from title, abstract and keywords.
df = pd.read_csv(
    os.path.join(BASE_DIR, "data", "combined_final_papers.csv")
).fillna("")
df = df.assign(
    Combined_Text=lambda frame: frame["Title"] + ". " + frame["Abstract"] + ". " + frame["Keyword"]
)

# FAISS index read from the project's models folder.
index = faiss.read_index(
    os.path.join(BASE_DIR, "models", "semantic_index.faiss")
)

# Sentence encoder used to embed queries, and the summarization pipeline.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use cpu


In [20]:
# Step 3: Core Functions

# 3.1 🧠 Semantic Search
def semantic_search(query, top_k=5):
    """Return the papers closest to ``query`` as one Markdown string.

    The query is embedded with ``embedding_model`` and looked up in the FAISS
    ``index``. Each hit is read from ``df`` by row position and rendered with
    its title, authors, publication date, PDF link, APA citation and abstract.

    Args:
        query: Search text.
        top_k: Number of neighbours requested from the index.

    Returns:
        The rendered entries joined by blank lines; an empty string when the
        index yields no usable hit.
    """
    query_embedding = embedding_model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        # FAISS fills the result with -1 when it finds fewer than top_k
        # neighbours; df.iloc[-1] would silently return the last paper.
        # Positions beyond the end of df (index and CSV out of sync) are
        # skipped too instead of raising IndexError.
        if idx < 0 or idx >= len(df):
            continue
        paper = df.iloc[idx]
        citation = generate_apa_citation(paper['Title'], paper['Authors'], paper['Published Date'], paper['PDF Link'])
        results.append(f"📄 **{paper['Title']}**\n\n🧑‍🔬 *{paper['Authors']}* | 🗓️ {paper['Published Date']}\n🔗 [PDF Link]({paper['PDF Link']})\n\n🧾 **Citation**: {citation}\n\n📝 **Abstract:**\n{paper['Abstract']}\n\n---")
    return "\n\n".join(results)


# 3.2 📄 PDF + Text Summarizer
def chunk_text(text, max_tokens=500):
    """Split ``text`` into pieces of at most ``max_tokens`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated), and each
    piece is re-joined with single spaces.

    Args:
        text: Text to split.
        max_tokens: Maximum number of whitespace-separated words per piece.

    Returns:
        List of pieces in original order; empty when ``text`` has no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_tokens):
        window = tokens[start:start + max_tokens]
        pieces.append(' '.join(window))
    return pieces

def summarize_long_text(text):
    """Summarize ``text`` chunk by chunk and join the partial results.

    The text is split with ``chunk_text``. Chunks longer than 30 words are
    passed to the ``summarizer`` pipeline. Shorter chunks are kept verbatim
    instead of being dropped, so short input (or a short final chunk) no
    longer disappears from the output.

    Args:
        text: Text to summarize.

    Returns:
        The per-chunk results joined by blank lines; an empty string when
        ``text`` contains no words.
    """
    pieces = []
    for chunk in chunk_text(text):
        if len(chunk.split()) > 30:
            # truncation=True asks the pipeline to cut a token-dense chunk to
            # the model's input limit rather than fail on over-long input.
            piece = summarizer(
                chunk,
                max_length=130,
                min_length=30,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
        else:
            # Already no longer than the requested minimum summary length.
            piece = chunk
        pieces.append(piece)
    return "\n\n".join(pieces)

def _open_pdf(pdf):
    """Open ``pdf`` with PyMuPDF from a path, raw bytes or a file-like object."""
    if isinstance(pdf, (str, os.PathLike)):
        return fitz.open(os.fspath(pdf))
    if isinstance(pdf, (bytes, bytearray)):
        return fitz.open(stream=bytes(pdf), filetype="pdf")
    if hasattr(pdf, "read"):
        return fitz.open(stream=pdf.read(), filetype="pdf")
    # Wrapper objects that only expose the location of a temporary file.
    path = getattr(pdf, "name", None)
    if path is None:
        raise TypeError(f"Unsupported PDF input: {type(pdf).__name__}")
    return fitz.open(path)


def summarize_pdf(pdf):
    """Extract the text of an uploaded PDF and summarize it.

    Depending on the Gradio version and the ``gr.File`` settings, the callback
    may receive a file path or a file-like object; both are accepted, as are
    raw bytes. The document is closed once its text has been read.

    Args:
        pdf: Path, bytes, file-like object, or ``None`` when nothing was
            uploaded.

    Returns:
        The output of ``summarize_long_text`` for the PDF text, or a short
        message when no file was provided.
    """
    if pdf is None:
        return "❌ No PDF uploaded."

    # The context manager closes the document even if text extraction fails.
    with _open_pdf(pdf) as doc:
        text = "".join(page.get_text() for page in doc)
    return summarize_long_text(text)


# 3.3 🌍 Multilingual Search
from langdetect import detect

# Loaded (model, tokenizer) pairs keyed by (src, tgt), so each translation
# direction is loaded at most once until this cell is re-run.
_TRANSLATION_MODELS = {}


def load_translation_model(src, tgt):
    """Load (or reuse) the MarianMT model and tokenizer for ``src`` -> ``tgt``.

    Args:
        src: Source language code, e.g. ``"fr"``.
        tgt: Target language code, e.g. ``"en"``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    key = (src, tgt)
    if key not in _TRANSLATION_MODELS:
        # Checkpoint naming used by the Helsinki-NLP OPUS-MT models.
        model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _TRANSLATION_MODELS[key] = (model, tokenizer)
    return _TRANSLATION_MODELS[key]


def translate(text, model, tokenizer, batch_size=8):
    """Translate ``text`` line by line with a MarianMT model.

    Working per line keeps each input short and preserves the line layout of
    the search results. Lines without any letters (blank lines, ``---``
    separators) are left untouched.

    Args:
        text: Text to translate.
        model: Translation model returned by ``load_translation_model``.
        tokenizer: Matching tokenizer.
        batch_size: Number of lines translated per ``generate`` call.

    Returns:
        The translated text with the original line breaks.
    """
    lines = text.split("\n")
    pending = [
        (pos, line)
        for pos, line in enumerate(lines)
        if any(ch.isalpha() for ch in line)
    ]
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        encoded = tokenizer(
            [line for _, line in batch],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        generated = model.generate(**encoded)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        for (pos, _), translated_line in zip(batch, decoded):
            lines[pos] = translated_line
    return "\n".join(lines)


def multilingual_search(query, lang):
    """Run a semantic search for a non-English query.

    The query is translated to English, searched with ``semantic_search``, and
    the results are translated back into the selected language.

    Args:
        query: Query text written in the language named by ``lang``.
        lang: One of "French", "German", "Spanish", "Hindi", "Chinese".

    Returns:
        The translated search results, or an error message when ``lang`` is
        not supported.
    """
    lang_codes = {
        "French": "fr",
        "German": "de",
        "Spanish": "es",
        "Hindi": "hi",
        "Chinese": "zh",
    }

    if lang not in lang_codes:
        return "❌ Language not supported."

    code = lang_codes[lang]

    model_to_en, tokenizer_to_en = load_translation_model(code, "en")
    model_back, tokenizer_back = load_translation_model("en", code)

    translated_query = translate(query, model_to_en, tokenizer_to_en)
    english_results = semantic_search(translated_query)
    return translate(english_results, model_back, tokenizer_back)

# 3.4 📎 Citation Generator
def citation_generator(title, authors, date, pdf_link):
    """Build a citation from the four citation-form fields.

    Thin wrapper that forwards its arguments, in order, to
    ``generate_apa_citation`` and returns the result unchanged.
    """
    apa_citation = generate_apa_citation(title, authors, date, pdf_link)
    return apa_citation

# 3.5 📊 Trend Visualization (graph)
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image

def show_trend_plot():
    """Render a horizontal bar chart of up to ten topics from the saved model.

    Loads the BERTopic model from the project's models folder, takes the
    first ten non-outlier rows of its topic table and plots their counts.

    Returns:
        A PIL image of the chart, read from an in-memory PNG.
    """
    topic_model = BERTopic.load(os.path.join(BASE_DIR, "models", "bertopic_model"))
    topic_info = topic_model.get_topic_info()
    # Drop the outlier topic (-1) by id rather than by row position, so a
    # model without an outlier row does not lose its first real topic.
    top_topics = topic_info[topic_info["Topic"] != -1].head(10)

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.barh(top_topics['Name'], top_topics['Count'], color='skyblue')
        ax.set_xlabel("Paper Count")
        ax.set_title("Top 10 Research Topics")
        fig.tight_layout()

        buf = BytesIO()
        # Save this figure explicitly instead of pyplot's current figure.
        fig.savefig(buf, format='png')
    finally:
        # Without this, every call leaves another open figure in memory.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)

In [21]:
# Step 4: Gradio Interface
import gradio as gr

# 🔍 English Search tab: free-text query in, Markdown-rendered results out.
search_interface = gr.Interface(
    title="📚 Semantic Search",
    description="Search academic papers using semantic understanding.",
    fn=semantic_search,
    inputs=gr.Textbox(label="🔍 Enter Research Query (English)"),
    outputs=gr.Markdown(label="📄 Top Matching Papers"),
)

# 🌍 Multilingual Search tab: query text plus the language it is written in.
multilingual_interface = gr.Interface(
    title="🌍 Multilingual Semantic Search",
    description="Enter your research query in a different language. Results will be translated back to your language.",
    fn=multilingual_search,
    inputs=[
        gr.Textbox(label="🔍 Enter your query"),
        gr.Dropdown(
            choices=["French", "German", "Spanish", "Hindi", "Chinese"],
            label="🌐 Select Language",
        ),
    ],
    outputs=gr.Textbox(label="📑 Translated Results"),
)

# 📄 PDF Summarizer tab: uploaded file in, plain-text summary out.
pdf_interface = gr.Interface(
    title="🧾 PDF Summarizer",
    description="Upload a research PDF to summarize it.",
    fn=summarize_pdf,
    inputs=gr.File(label="📄 Upload PDF"),
    outputs=gr.Textbox(label="📝 Summary"),
)

# 📝 Long Text Summarizer tab: pasted text in, plain-text summary out.
text_interface = gr.Interface(
    title="🧠 Text Summarizer",
    description="Paste long research text for summarization.",
    fn=summarize_long_text,
    inputs=gr.Textbox(lines=15, label="📝 Paste Long Text or Abstract"),
    outputs=gr.Textbox(label="📌 Summary"),
)

# 📎 Citation Generator tab: four text fields, passed to citation_generator in
# the order listed.
citation_interface = gr.Interface(
    title="🧾 Citation Generator",
    description="Auto-generate APA-style citations for your papers.",
    fn=citation_generator,
    inputs=[
        gr.Textbox(label="📘 Title"),
        gr.Textbox(label="✍️ Authors (comma-separated)"),
        gr.Textbox(label="📅 Published Date (YYYY-MM-DD)"),
        gr.Textbox(label="🔗 PDF Link"),
    ],
    outputs=gr.Textbox(label="📎 APA Citation"),
)

# 📊 Trend Plot tab: no inputs; the callback returns the chart image.
trend_interface = gr.Interface(
    title="📈 Research Trends",
    description="Visualizes the top 10 most common research topics.",
    fn=show_trend_plot,
    inputs=[],
    outputs=gr.Image(label="📊 Topic Trend Graph"),
)

# 🚀 Launch the complete app: one tab per interface, in the order listed here.
_tabs = [
    ("🔍 Search", search_interface),
    ("🌍 Multilingual", multilingual_interface),
    ("📄 PDF Summarizer", pdf_interface),
    ("📝 Text Summarizer", text_interface),
    ("📎 Citation Generator", citation_interface),
    ("📊 Trend Graph", trend_interface),
]
gr.TabbedInterface(
    [tab_interface for _tab_label, tab_interface in _tabs],
    [tab_label for tab_label, _tab_interface in _tabs],
).launch()

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


