In [None]:
import datetime
import pytz
import schedule
import sqlite3
import time

# All scheduling dates in this cell are expressed in the GMT (UTC+0) zone
timezone = pytz.timezone("GMT")

# "HH:MM" time for code 1 (not referenced elsewhere in the visible cells)
# and the number of minutes code 2 is offset from the start date
execution_time_code1 = "10:30"
delta_minutes_code2 = 10

# Function to execute code 1
def execute_code1():
    """Placeholder job for code 1: announces the run on stdout."""
    message = "Executing code 1..."
    print(message)
    # The real work for code 1 should be invoked from here

# Function to execute code 2
def execute_code2():
    """Placeholder job for code 2: announces the run on stdout."""
    message = "Executing code 2..."
    print(message)
    # The real work for code 2 should be invoked from here

# Connect to the SQLite database.
# sqlite3.connect() takes a filesystem path; the SQLAlchemy-style URL
# "sqlite:///wiki_data.db" is not a valid path for it, so point directly at
# the database file.
connection = sqlite3.connect('wiki_data.db')
cursor = connection.cursor()

# Fallback start date: July 8, 2023, at 10:30 in the GMT 0 time zone.
# localize() is the pytz-recommended way to attach a zone to a naive datetime.
default_start_date = timezone.localize(datetime.datetime(2023, 7, 8, hour=10, minute=30))

# Check if the "next_execution" table exists
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='next_execution';")
table_exists = cursor.fetchone()

start_date = default_start_date
if table_exists:
    # Check if there is a date and time record in the table
    cursor.execute("SELECT * FROM next_execution LIMIT 1;")
    record = cursor.fetchone()

    if record is not None and record[0]:
        stored_value = str(record[0])
        try:
            # Preferred format: timestamp with an explicit UTC offset
            start_date = datetime.datetime.strptime(stored_value, '%Y-%m-%d %H:%M:%S%z')
        except ValueError:
            # Values written from a naive datetime carry no offset
            # (e.g. "2023-07-09 10:30:00"); parse them as ISO timestamps.
            start_date = datetime.datetime.fromisoformat(stored_value)

        if start_date.tzinfo is None:
            # Keep start_date timezone-aware so it can be compared with
            # datetime.now(timezone) without raising TypeError.
            start_date = timezone.localize(start_date)

# Calculate the start date for code 2
start_date_code2 = start_date + datetime.timedelta(minutes=delta_minutes_code2)

# Function to update the next execution date
def update_next_execution():
    """Persist tomorrow's execution timestamp in the ``next_execution`` table.

    The timestamp is tomorrow's date (in ``timezone``) combined with the
    time-of-day of the module-level ``start_date``. It is stored as text in
    the ``'%Y-%m-%d %H:%M:%S%z'`` format, i.e. with its UTC offset, so it can
    be parsed back into a timezone-aware datetime.

    The table always ends up holding exactly one row. It is created on demand,
    so the function works no matter how many times it is called in a session.
    """
    # Calculate the next execution date
    next_execution_date = datetime.datetime.now(timezone).date() + datetime.timedelta(days=1)
    # timetz() keeps start_date's tzinfo, so the combined value stays aware
    next_execution_datetime = datetime.datetime.combine(next_execution_date, start_date.timetz())
    # Serialize explicitly instead of relying on sqlite3's implicit datetime adapter
    serialized = next_execution_datetime.strftime('%Y-%m-%d %H:%M:%S%z')

    # Do not rely on the module-level `table_exists` snapshot: it is computed
    # once at start-up and goes stale as soon as the table is created here.
    cursor.execute("CREATE TABLE IF NOT EXISTS next_execution (data DATETIME);")
    # Replace whatever is stored; a plain UPDATE would be a no-op on an empty table
    cursor.execute("DELETE FROM next_execution;")
    cursor.execute("INSERT INTO next_execution (data) VALUES (?)", (serialized,))

    # Save the changes to the database
    connection.commit()

# Schedule the execution of the codes
# NOTE(review): the "HH:MM" strings come from GMT-based datetimes; confirm
# that the `schedule` library interprets them in the intended time zone.
code1_time = start_date.strftime('%H:%M')
code2_time = start_date_code2.strftime('%H:%M')

schedule.every().day.at(code1_time).do(execute_code1)
schedule.every().day.at(code2_time).do(execute_code2)
# The original referenced `execution_time_code2`, which is never defined and
# raised NameError. The next-execution bookkeeping is scheduled at code 2's
# time instead; it is registered after execute_code2 so it is expected to run
# after it (assumes same-time jobs run in registration order — TODO confirm).
schedule.every().day.at(code2_time).do(update_next_execution)

# Main loop
# NOTE: this loop never terminates, so cells below it will not run until the
# kernel is interrupted.
while True:
    # Check if the current date is equal to or later than the start date
    if datetime.datetime.now(timezone) >= start_date:
        # Execute the scheduled tasks
        schedule.run_pending()

    # Wait 1 second before checking again
    time.sleep(1)


In [None]:
# TODO: record the time at which each fact was collected
# TODO: check whether a fact's content already exists; if so, do not store it again

In [None]:
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import relationship
from sqlalchemy import ForeignKey
import os
import logging
from datetime import datetime

In [None]:
def scrape_wikipedia():
    """Scrape Wikipedia's "Did you know..." section and store it in SQLite.

    Steps:
      1. Download the English Wikipedia main page.
      2. Collect every ``<li>`` inside the ``div#mp-dyk`` block: its text, the
         links it contains and, for the first item only, the featured image
         link (when the section's first anchor has the ``image`` class).
      3. Keep only the items that come before the first one whose text
         contains 'Archive'.
      4. Persist the kept items to ``wiki_data.db`` through SQLAlchemy.

    Progress is logged to ``app.log``.

    Raises:
        requests.HTTPError: if the main page request returns an error status.
        RuntimeError: if the "Did you know..." section is not found.
    """
    # Local import: only this function needs it
    from urllib.parse import urljoin

    base_url = "https://en.wikipedia.org"

    # Configure the logger (has no effect if logging is already configured)
    logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

    # Log the start of the web scraping process
    logging.info('Starting web scraping...')

    # Send GET request to Wikipedia's main page; the timeout keeps a stalled
    # connection from hanging the notebook, and error statuses fail loudly.
    response = requests.get(f"{base_url}/wiki/Main_Page", timeout=30)
    response.raise_for_status()

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the "Did you know..." section
    did_you_know_section = soup.find("div", {"id": "mp-dyk"})
    if did_you_know_section is None:
        logging.error('"Did you know..." section (div#mp-dyk) not found.')
        raise RuntimeError('Could not find the "Did you know..." section (div#mp-dyk) on the main page.')

    # Featured image (if available): taken from the first anchor of the
    # section and attached to the first fact only.
    featured_image = None
    first_anchor = did_you_know_section.find("a")
    if first_anchor is not None and "image" in (first_anchor.get("class") or []):
        image_href = first_anchor.get("href")
        if image_href:
            featured_image = {
                "url": urljoin(base_url, image_href),
                "caption": first_anchor.get("title"),
            }

    # Extract individual facts and their preview links
    facts = []
    for index, fact_item in enumerate(did_you_know_section.find_all("li")):
        preview_links = []
        for link in fact_item.find_all("a"):
            href = link.get("href")
            if not href:
                # Anchors without an href cannot be turned into a URL
                continue
            # urljoin resolves site-relative paths and leaves absolute URLs intact
            preview_links.append({"content": link.get("title"), "url": urljoin(base_url, href)})

        facts.append({
            "content": fact_item.text.strip(),
            "preview_links": preview_links,
            "featured_image": featured_image if index == 0 else None,
        })

    # Log the number of facts found
    logging.info(f"Number of facts found: {len(facts)}")

    # Exclude items starting from the one with 'Archive' in the content
    facts_data = []
    for fact in facts:
        if 'Archive' in fact['content']:
            break
        facts_data.append(fact)

    # Database struct
    Base = declarative_base()

    class Fact(Base):
        """One "Did you know..." entry."""
        __tablename__ = "facts"

        id = Column(Integer, primary_key=True)
        content = Column(String)

        preview_links = relationship("PreviewLink", back_populates="fact")
        featured_image = relationship("FeaturedImage", uselist=False, back_populates="fact")

    class PreviewLink(Base):
        """A link found inside a fact."""
        __tablename__ = "preview_links"

        id = Column(Integer, primary_key=True)
        url = Column(String)

        fact_id = Column(Integer, ForeignKey("facts.id"))
        fact = relationship("Fact", back_populates="preview_links")

    class FeaturedImage(Base):
        """The featured image attached to a fact."""
        __tablename__ = "featured_images"

        id = Column(Integer, primary_key=True)
        image_url = Column(String)
        caption = Column(String)

        fact_id = Column(Integer, ForeignKey("facts.id"))
        fact = relationship("Fact", back_populates="featured_image")

    # Create the database engine
    database_file = "wiki_data.db"
    database_exists = os.path.isfile(database_file)
    engine = create_engine(f"sqlite:///{database_file}")

    # Always make sure the tables exist: the file may already be present
    # without them. create_all() skips tables that already exist.
    Base.metadata.create_all(engine)
    if not database_exists:
        logging.info('Database created.')

    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Store the data in the database
        for fact_data in facts_data:
            fact = Fact(content=fact_data["content"])

            # Link preview links to the fact
            for preview_link_data in fact_data["preview_links"]:
                fact.preview_links.append(PreviewLink(url=preview_link_data["url"]))

            # Attach the featured image when the fact has one
            featured_image_data = fact_data["featured_image"]
            if featured_image_data:
                fact.featured_image = FeaturedImage(
                    image_url=featured_image_data["url"],
                    caption=featured_image_data["caption"],
                )

            # Save the fact to the database
            session.add(fact)

        # Commit the changes to the database
        session.commit()
    except Exception:
        # Leave the database untouched if anything fails mid-way
        session.rollback()
        logging.exception('Failed to store facts in the database.')
        raise
    finally:
        # Release the connection even when an error occurred
        session.close()
        engine.dispose()

    # Log the completion of the web scraping process
    logging.info('Web scraping completed.')

In [None]:
# Run the scraper once: fetch the main page and store the facts in wiki_data.db
scrape_wikipedia()

In [None]:
# Open an SQLAlchemy engine on the SQLite file for inspection in the cells below.
# `database_exists` records whether the file is present on disk at this point.
database_file = "wiki_data.db"
database_exists = os.path.isfile(database_file)
engine = create_engine(f"sqlite:///{database_file}")    

In [None]:
import pandas as pd
from sqlalchemy import inspect

# Create the inspector
# Build an inspector bound to the engine
inspector = inspect(engine)

# List the names of the tables present in the database
table_names = inspector.get_table_names()

In [None]:
# Display the table names returned by the inspector
table_names

In [None]:
# Read every table into a DataFrame, keep them in `lista_dfs`
# (same order as `table_names`) and echo each one.
lista_dfs = []
for table_name in table_names:
    df = pd.read_sql_table(table_name, engine)
    lista_dfs.append(df)

    # Header line followed by the table contents
    print(f"Table: {table_name}")
    print(df)