In [None]:
# Notebook cell: reinstall the CPU build of FAISS and smoke-test the stack.
# Lines starting with "!" are IPython shell escapes, so this cell only runs
# inside a notebook kernel, not as a plain Python script.

# Start from a clean environment: remove every FAISS distribution that may
# already be installed (CPU build, GPU build, and the bare "faiss" name).
!pip uninstall -y faiss-cpu faiss-gpu
!pip uninstall -y faiss

# Install the python3-dev system package, then the CPU-only FAISS wheel.
# --no-cache-dir tells pip not to use its local cache for this install.
!apt-get update && apt-get install -y python3-dev
!pip install faiss-cpu --no-cache-dir

# Import FAISS to confirm the freshly installed package is importable.
# NOTE(review): if a different faiss was already imported in this kernel,
# a kernel restart may be required before this import picks up the new build.
import faiss
import numpy as np

# Smoke test: build a flat L2 index over 100 random 64-dimensional float32
# vectors. Any exception here means the FAISS install is not usable.
print("\nTesting FAISS installation:")
dimension = 64
nb = 100
xb = np.random.random((nb, dimension)).astype('float32')
index = faiss.IndexFlatL2(dimension)
index.add(xb)
print("FAISS test successful!")

# Import the remaining libraries used by the next cell so that a missing
# package fails here, before any pipeline work starts.
print("\nVerifying all components:")
import torch
import transformers
import sentence_transformers
import pandas as pd

# Report the installed version of each component for reproducibility.
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Sentence-transformers version: {sentence_transformers.__version__}")
print(f"FAISS version: {faiss.__version__}")
print(f"Pandas version: {pd.__version__}")

In [None]:
import os
import pandas as pd
import numpy as np
import logging
from typing import List, Dict
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import tqdm
import time

# Configure root logging once: emit INFO and above, with every record
# prefixed by its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

class RobustRAG:
    """Retrieval pipeline over a CSV of records, built in verified stages.

    The stages are: verify FAISS, load the CSV, load a sentence-embedding
    model, embed every description, build a FAISS L2 index, and load an LLM.
    Each stage method returns ``True`` on success and ``False`` on failure
    (failures are logged, not raised) and records its outcome in
    ``self.status``.

    ``query`` only needs the data, the embedding model and the FAISS index;
    the LLM and tokenizer are loaded by ``setup_llm`` but are not used by any
    method of this class.
    """

    def __init__(self, data_path: str):
        """Store the dataset path and initialise every component as unset.

        Args:
            data_path: Path of the CSV file read by ``load_data``.
        """
        self.data_path = data_path
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Using device: %s", self.device)

        # Components are created lazily by the setup_* / load_* methods.
        self.data = None
        self.embedding_model = None
        self.embeddings = None
        self.index = None
        self.tokenizer = None
        self.llm_model = None

        # One flag per pipeline stage; flipped to True by the matching method.
        self.status = {
            'data_loaded': False,
            'embedding_model_loaded': False,
            'embeddings_generated': False,
            'faiss_initialized': False,
            'llm_loaded': False
        }

    def verify_faiss(self):
        """Check that FAISS imports and can index a few random vectors.

        Returns:
            True if a throwaway ``IndexFlatL2`` could be built and filled,
            False if importing or using FAISS raised.
        """
        try:
            import faiss
            # Tiny random dataset: enough to exercise index creation and add.
            dimension = 64
            nb = 10
            xb = np.random.random((nb, dimension)).astype('float32')
            index = faiss.IndexFlatL2(dimension)
            index.add(xb)
            logger.info("FAISS verification successful")
            return True
        except Exception as e:
            logger.error("FAISS verification failed: %s", e)
            return False

    def load_data(self):
        """Read the CSV and normalise its text column to ``'Description'``.

        The first of ``'Description'``, ``'Abstract'``, ``'Summary'`` present
        in the file is copied into a ``'Description'`` column, with missing
        values replaced by ``'No description available'``. The whole load is
        attempted up to three times, sleeping one second between attempts.

        Returns:
            True once the data is loaded, False if every attempt failed.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                # Read CSV file (default comma separator).
                self.data = pd.read_csv(self.data_path)
                # Log the available columns to help debug schema mismatches.
                logger.info("Available columns: %s", list(self.data.columns))

                # Pick the first candidate column that exists in the file.
                possible_description_columns = ['Description', 'Abstract', 'Summary']
                description_column = next(
                    (col for col in possible_description_columns
                     if col in self.data.columns),
                    None,
                )
                if not description_column:
                    raise KeyError("No description column found in the dataset. Checked columns: 'Description', 'Abstract', 'Summary'.")

                # Fill missing descriptions
                self.data['Description'] = self.data[description_column].fillna('No description available')
                logger.info("Loaded dataset with %d entries", len(self.data))
                self.status['data_loaded'] = True
                return True
            except Exception as e:
                logger.warning("Data loading attempt %d failed: %s", attempt + 1, e)
                # Only report columns when a frame actually exists; the
                # handler itself must never raise on self.data being None.
                if isinstance(e, KeyError) and self.data is not None:
                    logger.warning("Available columns are: %s", list(self.data.columns))
                if attempt < max_retries - 1:
                    time.sleep(1)  # Wait before retrying
                else:
                    logger.error("All data loading attempts failed")
                    return False
        return False

    def setup_embedding_model(self):
        """Load the MiniLM sentence-embedding model and run a test encode.

        Returns:
            True if the model loaded and produced a non-empty embedding,
            False otherwise.
        """
        try:
            self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

            # Verify the model works; an explicit check is used instead of
            # assert so the verification survives ``python -O``.
            test_text = "Test sentence for verification."
            test_embedding = self.embedding_model.encode([test_text])
            if not test_embedding.shape[1] > 0:
                raise RuntimeError("embedding model returned an empty embedding")

            logger.info("Embedding model setup successful")
            self.status['embedding_model_loaded'] = True
            return True
        except Exception as e:
            logger.error("Embedding model setup failed: %s", e)
            return False

    def generate_embeddings(self):
        """Embed every description in batches of 32.

        A batch that fails to encode is logged and replaced by zero vectors
        so that row positions in the embedding matrix keep matching row
        positions in ``self.data``.

        Returns:
            True if the embedding matrix was built, False if prerequisites
            are missing or generation failed.
        """
        if not self.status['data_loaded'] or not self.status['embedding_model_loaded']:
            logger.error("Cannot generate embeddings: prerequisites not met")
            return False

        try:
            descriptions = self.data['Description'].tolist()
            chunks = []
            batch_size = 32

            for i in tqdm(range(0, len(descriptions), batch_size), desc="Generating embeddings"):
                batch = descriptions[i:i + batch_size]
                try:
                    batch_embeddings = self.embedding_model.encode(
                        batch,
                        convert_to_numpy=True,
                        show_progress_bar=False
                    )
                except Exception as e:
                    logger.warning("Failed to process batch %d: %s", i // batch_size, e)
                    # Zero placeholders, created as float32 so one failed
                    # batch does not promote the stacked matrix to float64.
                    batch_embeddings = np.zeros(
                        (len(batch), self.embedding_model.get_sentence_embedding_dimension()),
                        dtype='float32',
                    )
                chunks.append(batch_embeddings)

            # np.vstack raises on an empty list (empty dataset); that is
            # caught below and reported as a failure.
            self.embeddings = np.vstack(chunks)
            self.status['embeddings_generated'] = True
            logger.info("Generated embeddings of shape %s", (self.embeddings.shape,))
            return True
        except Exception as e:
            logger.error("Embedding generation failed: %s", e)
            return False

    def setup_faiss(self):
        """Build a flat L2 FAISS index over the embeddings and self-test it.

        Returns:
            True if the index was built and a one-neighbour search on the
            first vector returned a (1, 1) result, False otherwise.
        """
        if not self.status['embeddings_generated']:
            logger.error("Cannot setup FAISS: embeddings not generated")
            return False

        try:
            import faiss
            # Cast once so both add() and the test search get float32 input.
            vectors = np.ascontiguousarray(self.embeddings, dtype='float32')
            embedding_dim = vectors.shape[1]
            self.index = faiss.IndexFlatL2(embedding_dim)
            self.index.add(vectors)

            # Verify index with an explicit check (assert is stripped by -O).
            test_query = vectors[0:1]
            distances, indices = self.index.search(test_query, 1)
            if indices.shape != (1, 1):
                raise RuntimeError(f"unexpected search result shape {indices.shape}")

            self.status['faiss_initialized'] = True
            logger.info("FAISS index created and verified")
            return True
        except Exception as e:
            logger.error("FAISS setup failed: %s", e)
            return False

    def setup_llm(self):
        """Load the Llama-2 7B tokenizer and model in float16.

        Returns:
            True if both loaded and the tokenizer produced output for a test
            string, False otherwise.
        """
        try:
            self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
            self.llm_model = AutoModelForCausalLM.from_pretrained(
                "meta-llama/Llama-2-7b-hf",
                device_map="auto",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )

            # Verify the tokenizer responds; only tokenization is exercised,
            # no generation is run here.
            test_input = "Test input."
            tokens = self.tokenizer(test_input, return_tensors="pt")
            if tokens is None:
                raise RuntimeError("tokenizer returned no output")

            self.status['llm_loaded'] = True
            logger.info("LLM setup successful")
            return True
        except Exception as e:
            logger.error("LLM setup failed: %s", e)
            return False

    def setup(self):
        """Run every setup stage in order.

        All stages are attempted even if an earlier one fails; later stages
        guard themselves through ``self.status``.

        Returns:
            True only if every stage succeeded.
            NOTE(review): a failed LLM load makes this return False even
            though ``query`` does not use the LLM.
        """
        steps = [
            (self.verify_faiss, "FAISS verification"),
            (self.load_data, "Data loading"),
            (self.setup_embedding_model, "Embedding model setup"),
            (self.generate_embeddings, "Embedding generation"),
            (self.setup_faiss, "FAISS setup"),
            (self.setup_llm, "LLM setup")
        ]

        success = True
        for step_func, step_name in steps:
            logger.info("Starting %s...", step_name)
            if not step_func():
                success = False
                logger.warning("%s failed or partially completed", step_name)
            else:
                logger.info("%s completed successfully", step_name)

        return success

    def query(self, text: str, top_k: int = 3):
        """Return the ``top_k`` records whose descriptions are nearest to ``text``.

        Args:
            text: Free-text query to embed and search for.
            top_k: Maximum number of hits to return; must be at least 1.

        Returns:
            A dict ``{"error": None | str, "results": list}``. Each result has
            ``'text'``, ``'title'``, ``'accession_id'`` and ``'distance'``
            keys; title and accession fall back to placeholder strings when
            the dataset lacks those columns. Fewer than ``top_k`` results are
            returned when the index holds fewer vectors.
        """
        if not all([self.status['embeddings_generated'], self.status['faiss_initialized']]):
            return {"error": "System not fully initialized", "results": []}

        try:
            if top_k < 1:
                raise ValueError("top_k must be at least 1")

            query_embedding = self.embedding_model.encode([text])
            distances, indices = self.index.search(query_embedding.astype('float32'), top_k)

            # Column presence does not change per hit, so check it once.
            has_title = 'Title' in self.data.columns
            has_accession = 'Accession ID' in self.data.columns
            row_count = len(self.data)

            results = []
            for idx, distance in zip(indices[0], distances[0]):
                idx = int(idx)
                # FAISS pads missing neighbours with -1 when top_k exceeds
                # the number of indexed vectors; iloc[-1] would silently
                # return the last row, so drop those placeholders.
                if idx < 0 or idx >= row_count:
                    continue
                results.append({
                    'text': self.data['Description'].iloc[idx],
                    'title': self.data['Title'].iloc[idx] if has_title else 'No title available',
                    'accession_id': self.data['Accession ID'].iloc[idx] if has_accession else 'No accession ID available',
                    'distance': float(distance)
                })

            return {"error": None, "results": results}
        except Exception as e:
            logger.error("Query failed: %s", e)
            return {"error": str(e), "results": []}

def main():
    """Build the RAG system over the hard-coded CSV and print one demo query.

    Logs an error and returns early if setup does not fully succeed; otherwise
    runs a single query and prints each hit, or the error the query reported.
    """
    system = RobustRAG('/content/bioproject_Dengue_Human_sequencing.csv')

    # Bail out early when any setup stage failed.
    if not system.setup():
        logger.error("System setup incomplete. Check logs for details.")
        return

    logger.info("System setup complete!")

    # Demo query (targets PRJNA1063364). Other queries tried against the
    # same dataset:
    #   "Macrophage differentiation and polarization using iPSCs"
    #   "Flow cytometry and transcriptomics in macrophages"
    #   "Chromatin accessibility profiling in virus-infected macrophages"
    #   "RNA secondary structure and gene expression in single cells."
    question = "RNA-seq and ATAC-seq analysis of iPSC macrophages"
    response = system.query(question)

    if response["error"] is not None:
        print(f"Query failed: {response['error']}")
        return

    print("\nQuery results:")
    for hit in response["results"]:
        print(f"\nAccession ID: {hit['accession_id']}")
        print(f"Title: {hit['title']}")
        print(f"Distance: {hit['distance']:.4f}")
        # Show only the first 200 characters of the description.
        print(f"Text: {hit['text'][:200]}...")

# Run the demo only when this module is the entry point (script or notebook
# cell), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
