In [None]:
import logging
import os
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import pytz
from sqlalchemy import create_engine, text

In [None]:
# Destination file for this notebook's ETL log records.
log_file = 'etl_to_staging_area_logs.txt'

# Send INFO-and-above records to the log file, each line stamped with its time.
# No filemode is passed, so basicConfig's default append mode applies and the
# records written by earlier runs are kept.
log_settings = {
    'filename': log_file,
    'level': logging.INFO,
    'format': 'execution time: %(asctime)s - Details: %(message)s',
}
logging.basicConfig(**log_settings)




In [None]:
# Directory holding the source CSV extracts.
# It can be redirected with the MCD_DATA_DIR environment variable; the fallback
# is the original hard-coded location, so an existing setup keeps working.
data_dir = os.environ.get('MCD_DATA_DIR', r"C:\Users\kmakh\Downloads\McD_Data\data")

# One path per source file. The *TB names are read by the cell that loads the
# CSVs into DataFrames, so they are kept unchanged.
countryTB = os.path.join(data_dir, "Country.csv")
cityTB = os.path.join(data_dir, "City.csv")
provinceTB = os.path.join(data_dir, "Province.csv")
franchiseeTB = os.path.join(data_dir, "Franchisee.csv")
dayDateTB = os.path.join(data_dir, "DayDate.csv")
dayPartTB = os.path.join(data_dir, "DayPart.csv")
dailySalesProductTB = os.path.join(data_dir, "DailySalesProducts.csv")
orderTypeTB = os.path.join(data_dir, "OrderType.csv")
ownerOperatorTB = os.path.join(data_dir, "OwnerOperator.csv")
productCategoryTB = os.path.join(data_dir, "ProductCategory.csv")
productGroupLevel1TB = os.path.join(data_dir, "ProductGroupLevel1.csv")
productGroupLevel2TB = os.path.join(data_dir, "ProductGroupLevel2.csv")
productGroupLevel3TB = os.path.join(data_dir, "ProductGroupLevel3.csv")
productGroupLevel4TB = os.path.join(data_dir, "ProductGroupLevel4.csv")
salesTypeTB = os.path.join(data_dir, "SalesType.csv")
storesTB = os.path.join(data_dir, "Stores.csv")
tradingHoursTB = os.path.join(data_dir, "TradingHours.csv")
volumnBandTB = os.path.join(data_dir, "VolumnBand.csv")

# Assigning strings cannot fail, so the former try/except never fired and the
# "success" message was logged without any file being touched. Check that the
# files are actually present and log what was found instead.
csv_paths = [
    countryTB, cityTB, provinceTB, franchiseeTB, dayDateTB, dayPartTB,
    dailySalesProductTB, orderTypeTB, ownerOperatorTB, productCategoryTB,
    productGroupLevel1TB, productGroupLevel2TB, productGroupLevel3TB,
    productGroupLevel4TB, salesTypeTB, storesTB, tradingHoursTB, volumnBandTB,
]
missing_csvs = [path for path in csv_paths if not os.path.isfile(path)]
if missing_csvs:
    logging.warning("CSV source files not found: %s", ", ".join(missing_csvs))
else:
    logging.info("All %d CSV source files were located in %s", len(csv_paths), data_dir)

In [None]:
# Connection parameters for the staging-area database.
# Every value can be supplied through an environment variable so credentials do
# not have to live in the notebook; the fallbacks are the original local values.
# FIXME: remove the password fallback once STAGING_DB_PASSWORD is set wherever
# this notebook runs - a password committed to source should be rotated.
username = os.environ.get('STAGING_DB_USER', 'postgres')
password = os.environ.get('STAGING_DB_PASSWORD', 'makhubela')
host = os.environ.get('STAGING_DB_HOST', 'localhost')
port = os.environ.get('STAGING_DB_PORT', '5432')
database = os.environ.get('STAGING_DB_NAME', 'DW_staging_area')

# Percent-encode the user name and password so characters such as '@', ':' or
# '/' cannot corrupt the URL structure. With the fallback values the resulting
# string is identical to the one built before.
connection_string_staging = (
    f'postgresql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}'
)

# Create the SQLAlchemy engine used by every later cell that talks to staging.
engine_staging = create_engine(connection_string_staging)

In [None]:
# Load every source CSV into its own DataFrame.
#
# A failure is logged and then re-raised. Swallowing it would let the notebook
# carry on with missing DataFrames (NameError further down) or, in a kernel
# that already ran once, with stale DataFrames left over from the earlier run.
try:
    df_country = pd.read_csv(countryTB)
    df_city = pd.read_csv(cityTB)
    df_province = pd.read_csv(provinceTB)
    df_franchisee = pd.read_csv(franchiseeTB)
    df_dayDate = pd.read_csv(dayDateTB)
    df_dayPart = pd.read_csv(dayPartTB)
    df_dailySalesProduct = pd.read_csv(dailySalesProductTB)
    df_productCategory = pd.read_csv(productCategoryTB)
    df_productGroupLevel1 = pd.read_csv(productGroupLevel1TB)
    df_productGroupLevel2 = pd.read_csv(productGroupLevel2TB)
    df_productGroupLevel3 = pd.read_csv(productGroupLevel3TB)
    df_productGroupLevel4 = pd.read_csv(productGroupLevel4TB)
    df_salesType = pd.read_csv(salesTypeTB)
    df_stores = pd.read_csv(storesTB)
    df_tradingHours = pd.read_csv(tradingHoursTB)
    df_volumnBand = pd.read_csv(volumnBandTB)
    df_ownerOperator = pd.read_csv(ownerOperatorTB)
    df_orderType = pd.read_csv(orderTypeTB)
    logging.info("18 tables from csv's have been converted to dataframes successfully!!")
except Exception:
    # Record the traceback in the log file, then stop the run.
    logging.error("error occured: error loading tables as dataframes", exc_info=True)
    raise

In [None]:
# Map each staging table name to the DataFrame that feeds it.
# The keys are the table names handed to to_sql by the load loop, and the
# insertion order is the order in which that loop walks the tables.
dataframes = dict(
    city=df_city,
    country=df_country,
    province=df_province,
    day_part=df_dayPart,
    day_date=df_dayDate,
    franchisee=df_franchisee,
    product_category=df_productCategory,
    stores=df_stores,
    product_group_level1=df_productGroupLevel1,
    product_group_level2=df_productGroupLevel2,
    product_group_level3=df_productGroupLevel3,
    product_group_level4=df_productGroupLevel4,
    sales_type=df_salesType,
    trading_hours=df_tradingHours,
    volumn_band=df_volumnBand,
    owner_operator=df_ownerOperator,
    order_type=df_orderType,
    daily_sales_products=df_dailySalesProduct,
)

In [None]:
# Open a new batch: append one row holding the load start time to staging_batch.
#
# A failure is logged and re-raised. If the insert were skipped silently, the
# later "latest sbID" lookup would return the id of a previous batch and this
# run's rows would be stamped with it.
try:
    sa_tz = pytz.timezone('Africa/Johannesburg')
    start_time = datetime.now(sa_tz)
    print(f'start time :{start_time}')
    new_date = [start_time]  # single-row column holding the start timestamp
    start_date = {'start_load_date': new_date}  # key is the target column name
    df_start_date = pd.DataFrame(start_date)
    # Append the row to the batch table; only start_load_date is supplied here.
    df_start_date.to_sql('staging_batch', engine_staging, if_exists='append', index=False)
    logging.info("start time successfully loaded into staging_batch table")
    print("start time successfully loaded into staging_batch table")
except Exception:
    logging.error("error occured: start time not loaded in the staging_batch table", exc_info=True)
    raise
   

In [None]:
# Read back the id of the most recent staging_batch row; it is used to stamp
# every table loaded in this run.
#
# Errors are logged and re-raised so that later cells never run with an
# undefined or stale staging_batch_id.
try:
    sql_line = 'SELECT "sbID" FROM staging_batch order by "sbID" desc limit 1'
    df_staging_batch = pd.read_sql_query(sql_line, engine_staging)
    if df_staging_batch.empty:
        # .squeeze() on an empty column would hand back an empty Series instead
        # of a scalar, so fail here with a clear message.
        raise LookupError("staging_batch has no rows; cannot determine the batch id")
    # The query returns exactly one row; take its sbID as a scalar.
    staging_batch_id = df_staging_batch['sbID'].iloc[0]
    logging.info('batch id successfully loaded')
    print(staging_batch_id)
except Exception:
    logging.error("error occured: staging area batch id not assigned to staging_batch table", exc_info=True)
    raise

In [None]:
# Stamp every DataFrame with the current batch id and append it to the staging
# table of the same name. A failure on one table is printed and logged, and the
# loop then moves on to the next table.
for table_name, df in dataframes.items():
    try:
        df['sbID'] = staging_batch_id
        df.to_sql(name=table_name, con=engine_staging, index=False, if_exists='append')
        row_count = df.shape[0]
        logging.info("Table '{}' successfully loaded consists of {} rows.".format(table_name, row_count))
        print("Table '{}' successfully loaded {} .".format(table_name, row_count))
    except Exception as e:
        print("Error loading table '{} ': {}".format(table_name, e))
        logging.warning("error occured: Error loading table '{}': {}".format(table_name, e), exc_info=True)

# Timestamp taken once the loop has gone through every table.
end_time = datetime.now(sa_tz)

In [None]:
# Close the batch: write end_time into end_load_date of this run's
# staging_batch row, but only if that row is still the newest one.
staging_batch_id_int = int(staging_batch_id)  # plain int for the bound SQL parameter

# Fetch the latest staging_batch record.
sql_line = 'SELECT * FROM staging_batch ORDER BY "sbID" DESC LIMIT 1'
df_staging_batch = pd.read_sql_query(sql_line, engine_staging)
latest_batch_id = df_staging_batch["sbID"].iloc[0]

print("Latest staging_batch_id from database:", latest_batch_id)
print(end_time)

if latest_batch_id == staging_batch_id:
    # Parameterised UPDATE; the values are bound at execution time.
    update_sql = text('UPDATE "staging_batch" SET "end_load_date" = :end_time WHERE "sbID" = :staging_batch_id')
    # The context manager closes the connection even when execute() raises.
    # The connection is opened only on this branch, so the mismatch branch no
    # longer reaches a close() call on a connection that was never created.
    with engine_staging.connect() as connection:
        connection.execute(update_sql, {'end_time': end_time, 'staging_batch_id': staging_batch_id_int})
        connection.commit()
    logging.info('end_load_date updated successfully.')
    print("end_load_date updated successfully.")
else:
    print("staging_batch_id does not match the latest record in the database.")
    logging.info('staging_batch_id does not match the latest record in the database.')

In [None]:
# Show when the load finished and how long it took, and log the duration.
print('end time :{}'.format(end_time))
total_time = end_time - start_time
duration_text = str(total_time)
print("Total time taken: " + duration_text)
logging.info('total time taken to load the staging area = ' + duration_text)