# Thesisaurus: A Virtual Assistant for Curating and Synthesizing Thesis Research

**CMPE 259 Project**  
**Author:** Khush Naidu  
**SJSU ID:** 015798328

## Project Overview

This notebook implements a Virtual Assistant (VA) that helps researchers organize, synthesize, and query research papers. The VA uses:
- **Large Model:** Llama-3.3-70B-Instruct
- **Small Model:** Llama-3.1-8B-Instruct
- **Tools:** Database queries, Vector search (RAG), PDF extraction, Web search
- **Advanced Prompting:** Prompt chaining, Meta-prompting, Self-reflection
- **Security:** Prompt injection detection and prevention


In [None]:
# Install required packages
%pip install -q transformers torch accelerate bitsandbytes
%pip install -q faiss-cpu sentence-transformers
%pip install -q pypdf2 pdfplumber
%pip install -q requests beautifulsoup4 lxml
%pip install -q python-dotenv
%pip install -q datasets


In [None]:
import os
import sqlite3
import json
import re
import time
from datetime import datetime
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')

# LLM and ML libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer

# Vector store
import faiss
import numpy as np

# PDF processing
import pdfplumber
from PyPDF2 import PdfReader

# Web scraping
import requests
from bs4 import BeautifulSoup

# Report that the imports resolved, along with the PyTorch build and
# whether a CUDA device is visible to it.
print(
    "All imports successful!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)


## 2. Configuration


In [None]:
@dataclass
class Config:
    """Configuration for the VA system.

    Every setting is an annotated dataclass field with a default, so
    ``Config()`` yields the stock configuration, individual values can be
    overridden per instance (``Config(TOP_K=3)``), and ``repr``/``==``
    reflect the actual values. The defaults also remain readable as class
    attributes (``Config.TOP_K``).
    """

    # Model configurations (model identifiers)
    LARGE_MODEL: str = "meta-llama/Llama-3.3-70B-Instruct"
    SMALL_MODEL: str = "meta-llama/Llama-3.1-8B-Instruct"
    EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"

    # Database
    DB_PATH: str = "papers.db"

    # Vector store
    FAISS_INDEX_PATH: str = "faiss_index.bin"
    # NOTE(review): units of the chunk size/overlap (characters vs. tokens)
    # are decided by the chunking code, which is not in this cell - confirm.
    CHUNK_SIZE: int = 512
    CHUNK_OVERLAP: int = 50
    TOP_K: int = 5

    # Generation parameters
    MAX_NEW_TOKENS: int = 512
    TEMPERATURE: float = 0.7
    TOP_P: float = 0.9

    # Caching
    ENABLE_CACHE: bool = True
    CACHE_SIZE: int = 100

    # Security
    ENABLE_SECURITY: bool = True
    
# Module-wide settings object, followed by a short confirmation banner
# naming the two configured models.
config = Config()
print(
    "Configuration loaded successfully!",
    f"Large Model: {config.LARGE_MODEL}",
    f"Small Model: {config.SMALL_MODEL}",
    sep="\n",
)


## 3. Database Tool - Paper Metadata Storage


In [None]:
class DatabaseTool:
    """Tool for querying structured paper metadata from SQLite database.

    Opening the tool connects to ``db_path`` and creates the ``papers``,
    ``datasets``, ``models``, ``experiments`` and ``limitations`` tables if
    they do not exist yet. The instance can be used as a context manager,
    in which case the connection is closed on exit.

    NOTE(review): the child tables declare FOREIGN KEY clauses, but this
    class never issues ``PRAGMA foreign_keys = ON``, so SQLite will not
    enforce them on this connection.
    """

    def __init__(self, db_path: str):
        """Connect to ``db_path`` and make sure the schema exists."""
        self.db_path = db_path
        self.conn = None
        self._init_db()

    def __enter__(self):
        """Return the tool itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; exceptions from the block are not suppressed."""
        self.close()

    def _init_db(self):
        """Initialize database with schema"""
        self.conn = sqlite3.connect(self.db_path)
        cursor = self.conn.cursor()

        # Papers table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS papers (
                paper_id TEXT PRIMARY KEY,
                title TEXT NOT NULL,
                authors TEXT,
                year INTEGER,
                venue TEXT,
                arxiv_id TEXT,
                pdf_path TEXT,
                abstract TEXT,
                keywords TEXT
            )
        ''')

        # Datasets table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS datasets (
                dataset_id INTEGER PRIMARY KEY AUTOINCREMENT,
                paper_id TEXT,
                dataset_name TEXT,
                dataset_type TEXT,
                FOREIGN KEY (paper_id) REFERENCES papers (paper_id)
            )
        ''')

        # Models table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS models (
                model_id INTEGER PRIMARY KEY AUTOINCREMENT,
                paper_id TEXT,
                model_name TEXT,
                model_type TEXT,
                FOREIGN KEY (paper_id) REFERENCES papers (paper_id)
            )
        ''')

        # Experimental setups table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS experiments (
                exp_id INTEGER PRIMARY KEY AUTOINCREMENT,
                paper_id TEXT,
                optimizer TEXT,
                learning_rate REAL,
                batch_size INTEGER,
                epochs INTEGER,
                augmentations TEXT,
                pretrained_weights TEXT,
                FOREIGN KEY (paper_id) REFERENCES papers (paper_id)
            )
        ''')

        # Limitations table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS limitations (
                limitation_id INTEGER PRIMARY KEY AUTOINCREMENT,
                paper_id TEXT,
                limitation TEXT,
                FOREIGN KEY (paper_id) REFERENCES papers (paper_id)
            )
        ''')

        self.conn.commit()
        print("Database initialized successfully!")

    def query(self, sql: str, params: tuple = ()) -> List[Dict]:
        """Execute a SQL statement and return its rows as dictionaries.

        Values passed through ``params`` are bound by SQLite rather than
        spliced into the statement text, which is what protects against
        injection; the ``sql`` text itself is executed as given.

        Args:
            sql: Statement to run, using ``?`` placeholders for values.
            params: Values bound to the placeholders, in order.

        Returns:
            One ``{column_name: value}`` dict per result row. Statements
            that produce no result set, and statements that fail, return
            an empty list (failures are also reported on stdout).
        """
        try:
            cursor = self.conn.cursor()
            cursor.execute(sql, params)
            if cursor.description is None:
                # No result set means the statement was a write or DDL;
                # commit so the change is not silently lost on close().
                self.conn.commit()
                return []
            columns = [desc[0] for desc in cursor.description]
            return [dict(zip(columns, row)) for row in cursor.fetchall()]
        except Exception as e:
            # Deliberately best-effort: callers get an empty result instead
            # of an exception, with the reason printed for diagnosis.
            print(f"Database query error: {e}")
            return []

    def insert_paper(self, paper_data: Dict):
        """Insert a paper into the database, replacing any row with the same id.

        Args:
            paper_data: Mapping that may contain ``paper_id``, ``title``,
                ``authors``, ``year``, ``venue``, ``arxiv_id``, ``pdf_path``,
                ``abstract`` and ``keywords``; missing keys are stored as NULL.

        Raises:
            sqlite3.Error: If the insert fails (for example a missing
                ``title``, which the schema declares NOT NULL). The
                transaction is rolled back before the error propagates.
        """
        row = (
            paper_data.get('paper_id'),
            paper_data.get('title'),
            paper_data.get('authors'),
            paper_data.get('year'),
            paper_data.get('venue'),
            paper_data.get('arxiv_id'),
            paper_data.get('pdf_path'),
            paper_data.get('abstract'),
            paper_data.get('keywords'),
        )
        # The connection context manager commits on success and rolls back
        # on error, so a failed insert does not leave a transaction open.
        with self.conn:
            self.conn.execute('''
                INSERT OR REPLACE INTO papers
                (paper_id, title, authors, year, venue, arxiv_id, pdf_path, abstract, keywords)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', row)

    def get_all_datasets(self) -> List[str]:
        """Get all unique, non-NULL dataset names from corpus, sorted by name"""
        results = self.query(
            "SELECT DISTINCT dataset_name FROM datasets "
            "WHERE dataset_name IS NOT NULL ORDER BY dataset_name"
        )
        return [r['dataset_name'] for r in results]

    def get_all_models(self) -> List[str]:
        """Get all unique, non-NULL model names from corpus, sorted by name"""
        results = self.query(
            "SELECT DISTINCT model_name FROM models "
            "WHERE model_name IS NOT NULL ORDER BY model_name"
        )
        return [r['model_name'] for r in results]

    def get_papers_by_year(self) -> List[Dict]:
        """Get every paper's id, title, year and venue, newest year first.

        Rows are ordered by year descending and then by title; they are
        returned as a flat list, not grouped.
        """
        return self.query("""
            SELECT paper_id, title, year, venue 
            FROM papers 
            ORDER BY year DESC, title
        """)

    def close(self):
        """Close database connection"""
        if self.conn:
            self.conn.close()

# Open (or create) the paper-metadata database at the configured path and
# keep the tool as a module-level handle.
db_tool = DatabaseTool(db_path=config.DB_PATH)
print("Database tool initialized!")
