In [1]:
!pip install pandas googlesearch-python requests beautifulsoup4 google-generativeai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [17]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from googlesearch import search
import time
import google.generativeai as genai
import os

# --- Configure the Gemini API ---
# The API key is read from the GOOGLE_API_KEY environment variable so the secret
# is never stored in (or committed with) the notebook. Set it before starting
# Jupyter, e.g. `export GOOGLE_API_KEY=...`.
# NOTE(review): any key that was ever hardcoded in this cell should be treated
# as leaked and revoked in the Google AI console.
try:
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
    if not GOOGLE_API_KEY:
        raise ValueError("the GOOGLE_API_KEY environment variable is not set")
    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel('gemma-3-27b-it')
    print("✅ Gemini API configured successfully.")
except Exception as e:
    # Best-effort: report the problem instead of raising, as before. `model`
    # stays undefined in this case, so fix the key and re-run this cell.
    print(f"🔴 Error configuring Gemini API: {e}")
    print("Please set the GOOGLE_API_KEY environment variable to a valid key and re-run this cell.")

✅ Gemini API configured successfully.


In [18]:
# --- Load Data and Set Up for Resumable Processing ---

# Define filenames
input_filename = 'fpkm_counts_with_annotations.csv'
output_filename = 'fpkm_with_metabolic_summaries.csv'

# Check if a progress file already exists
if os.path.exists(output_filename):
    print(f"🔄 Resuming from previously saved progress file: '{output_filename}'")
    df = pd.read_csv(output_filename)
    # Defensive: a progress file without the results column is treated as
    # entirely unprocessed.
    if 'Metabolic_Summary' not in df.columns:
        df['Metabolic_Summary'] = None
else:
    print(f"🚀 Starting a new run. Loading data from: '{input_filename}'")
    try:
        df = pd.read_csv(input_filename)
    except FileNotFoundError:
        print(f"🔴 ERROR: The input file '{input_filename}' was not found.")
        # Re-raise so the cell actually stops here. Otherwise the code below
        # would fail with a NameError, or silently reuse a stale `df` left in
        # the kernel by an earlier run.
        raise
    # Add the new column for the results, initialized as empty
    df['Metabolic_Summary'] = None

# When every summary in a saved file is empty, read_csv infers a float column.
# Cast to object so string summaries can be assigned to it later.
df['Metabolic_Summary'] = df['Metabolic_Summary'].astype(object)

# Display the status of your data
print(f"\nTotal rows in DataFrame: {len(df)}")
rows_processed = df['Metabolic_Summary'].notna().sum()
print(f"Rows already processed: {rows_processed}")
print(f"Rows remaining: {len(df) - rows_processed}")

🚀 Starting a new run. Loading data from: 'fpkm_counts_with_annotations.csv'

Total rows in DataFrame: 6575
Rows already processed: 0
Rows remaining: 6575


In [19]:
def _build_search_query(gene_identifier, kegg_info, jgi_info):
    """Builds the web search query, preferring the most specific annotation available."""
    # A usable KEGG annotation wins; 'no KO assigned' marks a missing one.
    if pd.notna(kegg_info) and 'no KO assigned' not in str(kegg_info):
        return f"{kegg_info} metabolic pathway function"
    if pd.notna(jgi_info):
        return f"{gene_identifier} {jgi_info} metabolic function"
    return f"{gene_identifier} metabolic synthesis role"


def get_web_context(gene_identifier, kegg_info, jgi_info):
    """Searches for a gene and scrapes the text content from the top result.

    Args:
        gene_identifier: Gene ID used in the query when no KEGG annotation is usable.
        kegg_info: KEGG annotation text; ignored when missing or when it
            contains 'no KO assigned'.
        jgi_info: JGI annotation text; used together with the gene ID when
            present and no KEGG annotation is usable.

    Returns:
        A ``(context, error)`` tuple. On success ``context`` is up to 8000
        characters of visible page text and ``error`` is None. On failure
        ``context`` is None and ``error`` is a "Skipped (...)" message.
        Exceptions from searching, downloading or parsing are caught and
        reported through ``error`` rather than raised.
    """
    query = _build_search_query(gene_identifier, kegg_info, jgi_info)

    try:
        # search() is called without keyword arguments because their names
        # differ between the packages that provide `googlesearch`.
        print(f"  Searching for: \"{query}\"")
        search_results = search(query)
        time.sleep(2.0) # Manual delay to prevent being blocked

        # Works whether search() returns a generator or a list, and skips
        # entries that are not absolute URLs (requests cannot fetch those).
        first_url = next(
            (url for url in search_results
             if str(url).startswith(('http://', 'https://'))),
            None,
        )

        if not first_url:
            return None, "Skipped (No search results found)"

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(first_url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop script/style elements so only visible text remains.
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()

        page_text = soup.get_text(separator=' ', strip=True)[:8000]
        if not page_text:
            # An empty page gives the model nothing to summarise; report it as
            # skipped rather than returning an empty context.
            return None, "Skipped (No text content found on page)"
        return page_text, None
    except Exception as e:
        return None, f"Skipped (Web scraping error: {str(e)})"

def get_summary_from_gemini(context, gene_id):
    """Asks the configured Gemini model for a one-sentence metabolic summary.

    Args:
        context: Scraped page text the model is told to base its answer on.
        gene_id: Gene identifier inserted into the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped, or a
        "Skipped (Gemini API Error: ...)" message if the request or reading
        the reply raises.
    """
    prompt = f"""
    Analyze the provided text about the gene '{gene_id}' to answer the following question.

    Question: What does this gene synthesize or what metabolic pathway is it a part of?

    Based your answer only on the text below. Provide a concise, one-sentence summary. If the text does not contain information about metabolic synthesis or pathways, respond with 'No specific metabolic synthesis information found.'

    --- Text ---
    {context}
    --- End of Text ---

    Summary:
    """

    try:
        reply = model.generate_content(prompt)
        # Pause briefly after each successful request to respect API rate limits.
        time.sleep(1)
        summary = reply.text.strip()
    except Exception as api_error:
        return f"Skipped (Gemini API Error: {api_error})"
    return summary

print("✅ Core functions defined.")

✅ Core functions defined.


In [None]:
# --- Main Processing Loop ---

# Define the size of each batch
batch_size = 5

# Find the indices of rows that have not been processed yet.
# isna() matches None/NaN only: rows that already hold a "Skipped (...)" message
# count as processed and are not retried on a later run.
unprocessed_indices = df[df['Metabolic_Summary'].isna()].index

print(f"Starting processing for {len(unprocessed_indices)} remaining rows...")

for i in range(0, len(unprocessed_indices), batch_size):
    # Get the next batch of indices to process (never empty, given the range above)
    batch_indices = unprocessed_indices[i:i + batch_size]

    print(f"\n--- Processing Batch {i//batch_size + 1} (Rows {batch_indices[0]} to {batch_indices[-1]}) ---")

    try:
        for index in batch_indices:
            row = df.loc[index]
            gene_id = row['Geneid']

            print(f"  Processing Gene ID: {gene_id}...")

            # 1. Get context from the web
            context, error = get_web_context(gene_id, row.get('KEGG_annotation'), row.get('JGI_annotation'))

            if error:
                summary = error
            else:
                # 2. Get summary from Gemini
                summary = get_summary_from_gemini(context, gene_id)

            print(f"    -> Summary: {summary}")

            # 3. Add the summary to the DataFrame
            df.at[index, 'Metabolic_Summary'] = summary
    finally:
        # 4. Save progress after each batch. Running this in `finally` also
        #    keeps rows finished before an interrupt or unexpected error.
        try:
            # Write to a temporary file and swap it in, so an interruption
            # during the write cannot leave a truncated progress file behind.
            temp_filename = f"{output_filename}.tmp"
            df.to_csv(temp_filename, index=False)
            os.replace(temp_filename, output_filename)
            print(f"💾 Progress saved to '{output_filename}'")
        except Exception as e:
            print(f"🔴 CRITICAL: Could not save progress to file. Error: {e}")

print("\n--- ✅ All processing complete! ---")

Starting processing for 6575 remaining rows...

--- Processing Batch 1 (Rows 0 to 4) ---
  Processing Gene ID: jgi|Sacce1|1000|YDL018C...
  Searching for: "jgi|Sacce1|1000|YDL018C metabolic synthesis role"
    -> Summary: The text describes the JGI Metabolomics Program as focused on understanding the roles of small molecules (metabolites) in biological systems, linking them with genomic data to study processes like nutrient cycling, plant-microbe interactions, and environmental responses, and identifying enzymatic transformations and metabolic pathways.
  Processing Gene ID: jgi|Sacce1|1001|YDL017W...
  Searching for: "K02214 cell division control protein 7 [EC:2.7.11.1] | (RefSeq) CDC7, LSD6, SAS1; serine/threonine protein kinase CDC7 metabolic pathway function"
    -> Summary: This gene encodes a cell division cycle protein with kinase activity that is critical for the G1/S transition and essential for initiation of DNA replication.
  Processing Gene ID: jgi|Sacce1|1002|YDL016C...
  