In [1]:
!pip install google-genai

Collecting google-genai
  Using cached google_genai-1.26.0-py3-none-any.whl.metadata (42 kB)
Collecting anyio<5.0.0,>=4.8.0 (from google-genai)
  Using cached anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting google-auth<3.0.0,>=2.14.1 (from google-genai)
  Using cached google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting httpx<1.0.0,>=0.28.1 (from google-genai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic<3.0.0,>=2.0.0 (from google-genai)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting tenacity<9.0.0,>=8.2.3 (from google-genai)
  Using cached tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting websockets<15.1.0,>=13.0.0 (from google-genai)
  Downloading websockets-15.0.1-cp311-cp311-win_amd64.whl.metadata (7.0 kB)
Collecting sniffio>=1.1 (from anyio<5.0.0,>=4.8.0->google-genai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting cachetools<6.0,>=2.0.0 (from google-


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\hy608\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
from google import genai

client = genai.Client(api_key="AIzaSyCY3GbJOrSKN_ttTN9HtCi-cgEXf_K6zEM")

my_file = client.files.upload(file="this-day-in-history-07-19-1799-rosetta-stone-found.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[my_file, "Please only give the name of the object and the link to its wikipedia page(if it has one). Nothing else."],
)

print(response.text)

FileNotFoundError: this-day-in-history-07-19-1799-rosetta-stone-found.jpg is not a valid file path.

In [11]:
def simple_parse(response_text):
    """Simple parsing for consistent format responses"""
    lines = response_text.strip().split('\n')
    
    title = lines[0].strip() if len(lines) > 0 else "Unknown"
    url = lines[1].strip() if len(lines) > 1 else None
    
    # Validate URL
    if url and not url.startswith('http'):
        url = None
    
    return {
        'title': title,
        'url': url
    }

# Use the simple parser
result = simple_parse(response.text)
print(f"Title: {result['title']}")
print(f"URL: {result['url']}")

Title: Rosetta Stone
URL: https://en.wikipedia.org/wiki/Rosetta_Stone


In [12]:
import requests
from bs4 import BeautifulSoup
import re
import time

class WikipediaScraper:
    """
    Scraper for Wikipedia that retrieves information about objects identified in images.
    """
    
    def __init__(self):
        self.base_url = "https://en.wikipedia.org/wiki/"
        self.headers = {
            'User-Agent': 'Auto-Museum/1.0 (Educational Project; contact@automuseum.example.com)'
        }
    
    def clean_text(self, text):
        """Remove references, citations and other wiki markup"""
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'\(listen\)', '', text)
        return text.strip()
    
    def get_article_content(self, title, url):
        """
        Retrieve and parse Wikipedia article content for a given topic
        
        Args:
            url (str): The Wikipedia article link
            
        Returns:
            dict: Dictionary containing article sections and content
        """
        
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract title
            # title = soup.find(id="firstHeading").text
            
            # Get the main content
            content_div = soup.find(id="mw-content-text")
            
            # Extract the content
            paragraphs = []
            for p in content_div.find_all('p'):
                if p.text.strip():
                    paragraphs.append(self.clean_text(p.text))
            
            return {
                'title': title,
                'url': url,
                'paragraphs': paragraphs
            }
            
        except Exception as e:
            print(f"Error retrieving Wikipedia content for url: {e}")
            return {
                'title': None,
                'url': url,
                'paragraphs': [f"Could not retrieve information about url."]
            }

# Example usage
scraper = WikipediaScraper()
fossil_info = scraper.get_article_content("Rosetta Stone", "https://en.wikipedia.org/wiki/Rosetta_Stone")
print(f"Title: {fossil_info['title']}")
print(f"Introduction: {fossil_info['paragraphs'][0]}...")

Title: Rosetta Stone
Introduction: The Rosetta Stone is a stele of granodiorite inscribed with three versions of a decree issued in 196 BC during the Ptolemaic dynasty of Egypt, on behalf of King Ptolemy V Epiphanes. The top and middle texts are in Ancient Egyptian using hieroglyphic and Demotic scripts, respectively, while the bottom is in Ancient Greek. The decree has only minor differences across the three versions, making the Rosetta Stone key to deciphering the Egyptian scripts....


In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import json
import ollama
import os

class RAGPipeline:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Load the sentence transformer model for creating embeddings
        self.model = SentenceTransformer(model_name)

        # Initialize FAISS index
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatL2(self.dimension)

        # Storage for our text chunks and metadata
        self.documents = []
        self.metadata = {}

    def add_wikipedia_content(self, wiki_data):
        """Process Wikipedia content and add to the vector database"""
        # Process introduction
        self.documents = wiki_data['paragraphs']
        self.metadata = {'title': wiki_data['title'], 'url': wiki_data['url']}
        self.object_name = wiki_data['title']

        self._update_index()

    def _update_index(self):
        """Update the FAISS index with the current documents"""
        if not self.documents:
            return

        embeddings = self.model.encode(self.documents)
        embeddings = np.array([embedding for embedding in embeddings]).astype('float32')

        self.index = faiss.IndexFlatL2(self.dimension)
        self.index.add(embeddings)

    def retrieve(self, query, top_k=3):
        """Retrieve relevant context for a query"""
        query_embedding = self.model.encode([query])[0].reshape(1, -1).astype('float32')
        distances, indices = self.index.search(query_embedding, min(top_k, len(self.documents)))

        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.documents):
                results.append({
                    'text': self.documents[idx],
                    'metadata': self.metadata,
                    'distance': float(distances[0][i])
                })

        return results

    def generate_museum_description(self, model="llama3"):
        """Generate a museum-style description using the LLM and retrieved context"""
        context = self.retrieve(f"Information about {self.object_name}", top_k=5)
        print(len(context))

        context_text = ""
        # context_text += f"Source: {context[0]['metadata']['url']}\n"
        for item in context:
            context_text += item['text'] + "\n\n"

        prompt = f'''
        You are a museum curator writing an informative and engaging description plaque for an exhibit.

        Object: {self.object_name}

        Based on the following information, write a museum-style description plaque for this object.
        The description should be informative, educational, and engaging for museum visitors.
        Write in a professional tone similar to what would be found in a prestigious museum.

        CONTEXTUAL INFORMATION:
        {context_text}

        REQUIREMENTS:
        1. Begin with a catchy title (max 10 words)
        2. The main description should be 150-200 words
        3. Include key historical or scientific information
        4. Make it accessible to general audience (grade 10 level)
        5. Include 2-3 interesting facts that would surprise visitors
        '''

        try:
            if len(context) == 1:
                return {
                    'title': f"Exhibit: {self.object_name}",
                    'description': f"Information about this {object_name} is currently being curated.",
                    'object_name': self.object_name,
                    'sources': []
                }


            # Generate description using Ollama
            response = ollama.generate(model=model, prompt=prompt)
            description = response['response'].strip()

            parts = description.split('\n', 1)
            title = parts[0].strip()
            body = parts[1].strip() if len(parts) > 1 else ""

            return {
                'title': title,
                'description': body,
                'object_name': self.object_name,
                'sources': self.metadata['url']
            }

        except Exception as e:
            print(f"Error generating museum description: {e}")
            return {
                'title': f"Exhibit: {self.object_name}",
                'description': f"Information about this {self.object_name} is currently being curated.",
                'object_name': self.object_name,
                'sources': []
            }


In [16]:

scraper = WikipediaScraper()
fossil_info = scraper.get_article_content(result['title'], result['url'])
rag = RAGPipeline()
rag.add_wikipedia_content(fossil_info)
output = rag.generate_museum_description()
print(json.dumps(output, indent=2))

5
{
  "title": "**\"Unlocking the Secrets of the Ancient World: The Rosetta Stone\"**",
  "description": "Discover one of history's most significant archaeological finds, the Rosetta Stone! This ancient stele holds the key to deciphering Egyptian hieroglyphics and is a testament to human ingenuity.\n\nIn 196 BC, during the Ptolemaic dynasty of Egypt, King Ptolemy V Epiphanes issued a decree inscribed on this granodiorite stone. The top register features ancient Egyptian hieroglyphs, while the middle text is written in Demotic script, and the bottom inscription is in Ancient Greek. This unique trilingual artifact allows us to understand the evolution of language and writing systems.\n\nInterestingly, the Rosetta Stone's inscriptions were only slightly modified across its three versions, making it a crucial tool for deciphering Egyptian scripts. The stone itself measures 112.3 cm high, 75.7 cm wide, and 28.4 cm thick, weighing approximately 760 kilograms.\n\nDid you know that the Rosetta