In [1]:
# Import necessary libraries
import os
from dotenv import load_dotenv
import re
import sqlite3
from firecrawl import FirecrawlApp

# Load environment variables from a .env file
# Create a .env file in the root of your project and add your API keys there
# Example .env file:
# FIRECRAWL_API_KEY=your_firecrawl_api_key
# GEMINI_API_KEY=your_gemini_api_key
load_dotenv()

# Get API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")

# Instantiate firecrawl app
app = FirecrawlApp(api_key=firecrawl_api_key)

# File path for sitemap.txt
sitemap_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'scripts', 'sitemap.txt'))

# Database path for chat.db
db_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'chat.db'))


In [None]:
import re
import sqlite3

# Regular expression to match the [title](url) pattern
pattern = re.compile(r'\[(.*?)\]\((.*?)\)') # (r'\[(.*?)\]\((.*?)\))')

# Connect to the SQLite database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Read the sitemap.txt file
try:
    with open(sitemap_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = pattern.match(line.strip())
            if match:
                title = match.group(1) # matches \[(.*?)\]
                url = match.group(2) # matches \((.*?)\)
                # Insert data into the textbook table
                try:
                    cursor.execute("INSERT INTO textbook (title, url) VALUES (?, ?)", (title, url))
                    print(f"Inserted: {title} - {url}")
                except sqlite3.Error as e:
                    print(f"Database error: {e}")
                    print(f"Could not insert: {title} - {url}")

    # Commit the changes to the database
    conn.commit()
    print("Data successfully inserted into the database.")

except FileNotFoundError:
    print(f"Error: {sitemap_path} not found.")
except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the database connection
    if conn:
        conn.close()
        print("Database connection closed.")


In [2]:
# Install with pip install firecrawl-py
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=firecrawl_api_key)

def textbookScrape():
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    try:
        # Select all entries from the textbook table that have a URL but no text
        cursor.execute("SELECT url FROM textbook WHERE url IS NOT NULL AND (text IS NULL OR text = '') LIMIT 10")
        entries = cursor.fetchall()

        print(f"Found {len(entries)} entries to scrape.")

        for entry in entries:
            url = entry[0]
            if url:
                try:
                    print(f"Scraping {url}...")
                    # Scrape the URL and get the markdown content
                    scrape_result = app.scrape_url(url, formats=['markdown'], only_main_content=True, max_age=14400000)
                    markdown_content = scrape_result.markdown
                    
                    # Update the text field in the textbook table
                    cursor.execute("UPDATE textbook SET text = ? WHERE url = ?", (markdown_content, url))
                    print(f"Successfully scraped and updated text for {url}")

                except Exception as e:
                    print(f"Failed to scrape or update for url {url}. Error: {e}")

        # Commit the changes to the database
        conn.commit()
        print("Finished scraping and updating the database.")

    except sqlite3.Error as e:
        print(f"Database error: {e}")

    finally:
        # Close the database connection
        if conn:
            conn.close()
            print("Database connection closed.")

Where I am right now:
- Every url is scraped, but has extra text on top
- For every row in db, remove header.txt
- The goal is to read loop through the urls and input their content into the database
- After that is accopmlished, I can use Gemini to provide a summary for each one

For now, I am waiting for a FireScrape help ticket.

In [3]:
import time

i = 0

while i < 7:
    start_time = time.time()

    textbookScrape()

    elapsed_time = time.time() - start_time

    wait_time = 60 - elapsed_time

    if wait_time > 0:
        time.sleep(wait_time)

    i+=1


Found 10 entries to scrape.
Scraping https://books.byui.edu/science_of_learning/moderate_concept_71_retrieval_practice...
Successfully scraped and updated text for https://books.byui.edu/science_of_learning/moderate_concept_71_retrieval_practice
Scraping https://books.byui.edu/science_of_learning/moderate_concept_83_attention...
Successfully scraped and updated text for https://books.byui.edu/science_of_learning/moderate_concept_71_retrieval_practice
Scraping https://books.byui.edu/science_of_learning/moderate_concept_83_attention...
Successfully scraped and updated text for https://books.byui.edu/science_of_learning/moderate_concept_83_attention
Scraping https://books.byui.edu/science_of_learning/week_8...
Successfully scraped and updated text for https://books.byui.edu/science_of_learning/moderate_concept_83_attention
Scraping https://books.byui.edu/science_of_learning/week_8...
Successfully scraped and updated text for https://books.byui.edu/science_of_learning/week_8
Scraping https

Remove the header

In [None]:
with open('header.txt') as h:
    header = h.read()

    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

     # Select all entries from the textbook table that have a URL but no text
    cursor.execute("SELECT text FROM textbook WHERE cleanText IS NULL")
    entries = cursor.fetchall()

    # print(header)

    for entry in entries:
        try:
            text = entry[0].replace("â€™", "'")
            text = text.split(header)[1]
            text = text.split("This content was previously published as follows:")[0]

            cursor.execute("UPDATE textbook SET cleanText = ? WHERE text = ?", (text, entry[0]))
        except:
            print("Didn't work")
    
    conn.commit()
    conn.close()


Didn't work
Didn't work


Remove the footer

In [94]:
# Connect to the SQLite database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

try:
    # Select all entries from the textbook table that have a URL but no text
    cursor.execute("SELECT cleanText FROM textbook WHERE cleanText IS NOT NULL")
    entries = cursor.fetchall()

    try:
        for entry in entries:
            original = entry[0]
            
            text = original.replace("\n\n![](https://books.byui.edu/images/licenses/cc.svg)![](https://books.byui.edu/images/licenses/by.svg)![](https://books.byui.edu/images/licenses/sa.svg)This work is released under a CC BY-SA license, which means that\nyou are free to do with it as you please as long as you (1) properly\nattribute it and (2) share any derivative works under an open\nlicense.\n\n","")

            cursor.execute("UPDATE textbook SET cleanText = ? WHERE cleanText = ?", [text, original])
    except Exception as e:
        print(e)

except Exception as e:
    print(e)

conn.commit()
conn.close()

In [74]:
from google import genai

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)

print(response.text)

ImportError: cannot import name 'genai' from 'google' (unknown location)