In [2]:
import requests as r
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3 as sql
import numpy as np
import datetime as dt
from sqlalchemy import create_engine


In [3]:
def log_progress(stage):
    with open("code_log.txt", "a") as log_file:
        log_file.write(f"Progress at stage: {stage}\n")

In [4]:
def extract(url, table_attribs):
    ''' This function extracts the required
    information from the website and saves it to a dataframe. The
    function returns the dataframe for further processing. '''
    log_progress("Starting extraction")
    
    # Send a GET request to the URL
    response = r.get(url)
    log_progress("Received response from URL")
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    log_progress("Parsed HTML content")
    
    # Find the table with the specified attributes
    table = soup.find('table', attrs=table_attribs)
    log_progress("Found the table in HTML content")
    
    # Extract the table headers
    headers = [header.text.strip() for header in table.find_all('th')]
    
    # Extract the table rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cells = row.find_all('td')
        row_data = [cell.text.strip() for cell in cells]
        rows.append(row_data)
    
    # Create a DataFrame from the extracted data
    df = pd.DataFrame(rows, columns=headers)
    log_progress("Created DataFrame from extracted data")
    
    log_progress("Extraction completed")
    return df


def transform(df, exchange_rates_path):
    ''' This function adds columns for Market Capitalization in GBP, EUR, and INR
    based on the exchange rate information provided in a CSV file. The function
    returns the transformed dataframe.'''
    log_progress("Starting transformation")
    
    # Load exchange rates from CSV file
    exchange_rates = pd.read_csv(exchange_rates_path)
    
    # Extract exchange rates
    usd_to_gbp = exchange_rates.loc[exchange_rates['Currency'] == 'GBP', 'Rate'].values[0]
    usd_to_eur = exchange_rates.loc[exchange_rates['Currency'] == 'EUR', 'Rate'].values[0]
    usd_to_inr = exchange_rates.loc[exchange_rates['Currency'] == 'INR', 'Rate'].values[0]
    
    # Convert Market Capitalization to float
    df['Market Capitalization (USD)'] = df['Market Capitalization (USD)'].replace('[\$,]', '', regex=True).astype(float)
    
    # Add new columns for Market Capitalization in GBP, EUR, and INR
    df['Market Capitalization (GBP)'] = (df['Market Capitalization (USD)'] * usd_to_gbp).round(2)
    df['Market Capitalization (EUR)'] = (df['Market Capitalization (USD)'] * usd_to_eur).round(2)
    df['Market Capitalization (INR)'] = (df['Market Capitalization (USD)'] * usd_to_inr).round(2)
    
    log_progress("Transformation completed")
    return df

def load_to_csv(df, csv_path):
    ''' This function saves the final dataframe as a `CSV` file 
    in the provided path. Function returns nothing.'''
    log_progress("Starting CSV load")
    df.to_csv(csv_path, index=False)
    log_progress("CSV load completed")

def load_to_db(df, sql_connection, table_name):
    ''' This function saves the final dataframe as a database table
    with the provided name. Function returns nothing.'''
    log_progress("Starting DB load")
    engine = create_engine(sql_connection)
    df.to_sql(table_name, engine, if_exists='replace', index=False)
    log_progress("DB load completed")
    
def run_query(query_statement, sql_connection):
    ''' This function runs the stated query on the database table and
    prints the output on the terminal. Function returns nothing. '''
    log_progress("Starting query execution")
    engine = create_engine(sql_connection)
    with engine.connect() as connection:
        result = connection.execute(query_statement)
        for row in result:
            print(row)
    log_progress("Query execution completed")
    
def verify_log_entries(log_file_path):
    ''' This function reads and prints the contents of the log file to verify the log entries. '''
    with open(log_file_path, 'r') as log_file:
        log_contents = log_file.read()
        print(log_contents)


  df['Market Capitalization (USD)'] = df['Market Capitalization (USD)'].replace('[\$,]', '', regex=True).astype(float)


In [None]:
url = 'https://web.archive.org/web/20230908091635 /https://en.wikipedia.org/wiki/List_of_largest_banks'
table_attribs = {'class': 'wikitable'}  
exchange_rates_path = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-PY0221EN-Coursera/labs/v2/exchange_rate.csv'  # Path to the exchange rates CSV file
csv_path = 'Largest_banks_data.csv'
sql_connection = 'sqlite:///Banks.db'  
table_name = 'Largest_banks'
query_statement = 'SELECT * FROM market_capitalization'
log_file_path = 'code_log.txt'

# Extract data
df = extract(url, table_attribs)

# Transform data
df_transformed = transform(df, exchange_rates_path)

# Load to CSV
load_to_csv(df_transformed, csv_path)

# Load to DB
load_to_db(df_transformed, sql_connection, table_name)

# Run query
run_query(query_statement, sql_connection)

# Verify log entries
verify_log_entries(log_file_path)

SyntaxError: unterminated string literal (detected at line 1) (2207838075.py, line 1)