In [3]:
!pip install sentence-transformers faiss-cpu nltk



In [1]:
import pandas as pd
import numpy as np
import faiss
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import os

# Work from the project root so that the relative data\ and models\ paths
# used later in this cell resolve.
os.chdir(r"C:\Users\intel\Desktop\draft RAG")

# Fetch the NLTK resources the preprocessing step relies on. force=True asks
# NLTK to download each package again even if a copy is already installed.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package, force=True)

# =========================
# STEP 1: Load Raw Data
# =========================
# Read the paper table, then blank out missing cells so every column holds a
# value (empty string) instead of NaN.
csv_path = r"data\combined_final_papers.csv"
df = pd.read_csv(csv_path)
df = df.fillna("")

# =========================
# STEP 2: Preprocess Text
# =========================

# Module-level helpers read by preprocess_text(): the English stop words
# (held in a set for membership tests) and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise free text before it is embedded.

    Every character that is not an ASCII letter or whitespace is replaced by
    a space, the text is lower-cased and tokenised, tokens found in the
    module-level ``stop_words`` are dropped, and each remaining token is
    passed through the module-level ``lemmatizer``.

    Punctuation is replaced by a space rather than deleted so that it still
    separates words: "GAN-based" becomes "gan based" instead of the fused
    token "ganbased".

    Args:
        text: Raw text. Any value that is not a ``str`` is treated as missing.

    Returns:
        str: The surviving tokens joined by single spaces; "" when ``text``
        is not a string or no token survives.
    """
    if not isinstance(text, str):
        return ""
    # Substitute a space (not "") so removed punctuation keeps acting as a
    # word boundary.
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", text)
    tokens = word_tokenize(letters_only.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

# Store a cleaned copy of each text field in a "Preprocessed <field>" column.
for source_column in ("Title", "Abstract", "Keyword"):
    df[f"Preprocessed {source_column}"] = df[source_column].apply(preprocess_text)

# =========================
# STEP 3: Combine for Embeddings
# =========================

# One string per paper: cleaned title, abstract and keyword joined by ". ".
title_part = df["Preprocessed Title"]
abstract_part = df["Preprocessed Abstract"]
keyword_part = df["Preprocessed Keyword"]
df["Combined_Text"] = title_part + ". " + abstract_part + ". " + keyword_part

# =========================
# STEP 4: Generate Embeddings
# =========================

print("🔄 Generating embeddings...")

# Encode the combined text of every paper with the all-MiniLM-L6-v2 model and
# keep the vectors as a float32 NumPy array.
model = SentenceTransformer('all-MiniLM-L6-v2')
combined_texts = df["Combined_Text"].tolist()
corpus_embeddings = np.array(
    model.encode(combined_texts, show_progress_bar=True)
).astype('float32')

# =========================
# STEP 5: Build FAISS Index
# =========================

# Flat L2 index as wide as one embedding vector (second axis of the matrix),
# filled with all corpus vectors in DataFrame row order.
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# =========================
# STEP 6: Save Index for Reuse
# =========================

# Persist the populated index under the models folder.
index_path = r"models\semantic_index.faiss"
faiss.write_index(index, index_path)

print("✅ Semantic index saved successfully!")

# =========================
# STEP 7: Semantic Search Function
# =========================

def semantic_search(query, top_k=10):
    """Return the papers whose embeddings are closest to ``query``.

    The query is encoded with the module-level ``model`` and looked up in the
    module-level FAISS ``index``; each returned label is mapped back to a row
    of the module-level ``df`` by position.

    NOTE(review): the corpus vectors are built from ``preprocess_text`` output
    while the query is embedded as typed — confirm this asymmetry is intended.

    Args:
        query: Search text.
        top_k: Maximum number of papers to return.

    Returns:
        list[dict]: One dict per hit, in the order the index returns them,
        with the keys "Title", "Authors", "Published Date", "PDF Link",
        "Abstract" and "Keyword". The list is shorter than ``top_k`` when the
        index yields fewer valid neighbours, and empty when ``top_k`` is not
        positive.
    """
    if top_k <= 0:
        return []

    query_embedding = model.encode([query]).astype('float32')
    _distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        # FAISS fills the label row with -1 when it finds fewer than top_k
        # neighbours; df.iloc[-1] would silently return the last paper.
        if idx < 0:
            continue
        paper = df.iloc[idx]
        results.append({
            "Title": paper['Title'],
            "Authors": paper['Authors'],
            "Published Date": paper['Published Date'],
            "PDF Link": paper['PDF Link'],
            "Abstract": paper['Abstract'],
            "Keyword": paper['Keyword']
        })
    return results

# =========================
# STEP 8: Try It Out!
# =========================

# Interactive demo: read one query from the user and print each matching
# paper returned by semantic_search().
query = input("🔍 Enter your research query: ")
results = semantic_search(query)

# Number of abstract characters shown per result.
ABSTRACT_PREVIEW_CHARS = 300

for i, paper in enumerate(results, 1):
    abstract_preview = paper["Abstract"]
    # Add the ellipsis only when the abstract really was cut short, so short
    # abstracts are not shown as if text were missing.
    if len(abstract_preview) > ABSTRACT_PREVIEW_CHARS:
        abstract_preview = abstract_preview[:ABSTRACT_PREVIEW_CHARS] + "..."

    print(f"\n📄 Result {i}")
    print("Title:", paper["Title"])
    print("Authors:", paper["Authors"])
    print("Date:", paper["Published Date"])
    print("Keyword:", paper["Keyword"])
    print("Link:", paper["PDF Link"])
    print("Abstract:", abstract_preview)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\intel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\intel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\intel\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\intel\AppData\Roaming\nltk_data...


🔄 Generating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Semantic index saved successfully!


🔍 Enter your research query:  deepfake



📄 Result 1
Title: DeePhy: On Deepfake Phylogeny
Authors: Kartik Narayan, Harsh Agarwal, Kartik Thakral, Surbhi Mittal, Mayank Vatsa, Richa Singh
Date: 2022-09-19
Keyword: deepfakes
Link: http://arxiv.org/pdf/2209.09111v1
Abstract: Deepfake refers to tailored and synthetically generated videos which are now
prevalent and spreading on a large scale, threatening the trustworthiness of
the information available online. While existing datasets contain different
kinds of deepfakes which vary in their generation technique, they do n...

📄 Result 2
Title: Behind the Deepfake: 8% Create; 90% Concerned. Surveying public exposure to and perceptions of deepfakes in the UK
Authors: Tvesha Sippy, Florence Enock, Jonathan Bright, Helen Z. Margetts
Date: 2024-07-08
Keyword: deepfakes
Link: http://arxiv.org/pdf/2407.05529v1
Abstract: This article examines public exposure to and perceptions of deepfakes based
on insights from a nationally representative survey of 1403 UK adults. The
survey is one of th

In [2]:
# Source text of app/citation_utils.py. It is kept in a string so the notebook
# can (re)generate the module file below.
citation_utils_code = '''
def generate_apa_citation(title, authors, date, pdf_link):
    """Build an APA-style reference line for a paper.

    Args:
        title: Paper title.
        authors: Comma-separated author names. Blank entries (empty input,
            stray or trailing commas) are ignored.
        date: Publication date; its first four characters are used as the
            year. A falsy value is rendered as "n.d.".
        pdf_link: URL placed after "Retrieved from".

    Returns:
        str: "<authors> (<year>). *<title>*. Retrieved from <pdf_link>", or
        "*<title>* (<year>). Retrieved from <pdf_link>" when no author name
        is available.
    """
    # Format authors: "First A., Second B., & Third C."
    # Blank fragments are dropped so "A, " does not become "A & ".
    author_list = [
        name.strip()
        for name in str(authors or "").split(",")
        if name.strip()
    ]
    if not author_list:
        formatted_authors = ""
    elif len(author_list) == 1:
        formatted_authors = author_list[0]
    elif len(author_list) == 2:
        formatted_authors = f"{author_list[0]} & {author_list[1]}"
    else:
        formatted_authors = ", ".join(author_list[:-1]) + f", & {author_list[-1]}"

    # Extract year only
    year = str(date)[:4] if date else "n.d."

    if formatted_authors:
        citation = f"{formatted_authors} ({year}). *{title}*. Retrieved from {pdf_link}"
    else:
        # No author: lead with the title instead of an empty author slot.
        citation = f"*{title}* ({year}). Retrieved from {pdf_link}"
    return citation
'''

# Write the module into the project's app folder (UTF-8 encoded).
with open(r"C:\Users\intel\Desktop\draft RAG\app\citation_utils.py", "w", encoding='utf-8') as f:
    f.write(citation_utils_code)

print("✅ citation_utils.py created successfully!")

✅ citation_utils.py created successfully!


## **Citation Generation**

In [3]:
# Add the project root to Python's module path (only once, so re-running this
# cell does not keep appending duplicates).
import importlib
import sys

PROJECT_ROOT = r"C:\Users\intel\Desktop\draft RAG"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# app/citation_utils.py is written by this notebook while the kernel is
# running. Refresh the import system's caches so a newly created file is
# found, and reload the module if it was imported before so the current file
# contents are used rather than a stale copy.
importlib.invalidate_caches()
_module_name = "app.citation_utils"
if _module_name in sys.modules:
    citation_utils = importlib.reload(sys.modules[_module_name])
else:
    citation_utils = importlib.import_module(_module_name)
generate_apa_citation = citation_utils.generate_apa_citation

# Sample test data
title = "Transformers for NLP"
authors = "Jane Doe, John Smith, Alan Turing"
date = "2020-05-15"
pdf_link = "https://arxiv.org/pdf/1234.5678"

# Generate citation
citation = generate_apa_citation(title, authors, date, pdf_link)
print("✅ APA Citation:\n", citation)

✅ APA Citation:
 Jane Doe, John Smith, & Alan Turing (2020). *Transformers for NLP*. Retrieved from https://arxiv.org/pdf/1234.5678
