In [2]:
import psycopg2
import logging
import os
import pandas as pd

# Base directory for runtime artefacts.
# NOTE: os.getcwd() is the *current working directory* of the process (in a
# notebook there is no __file__), so the logs folder follows wherever the
# notebook/script is launched from, not where the source file lives.
script_dir = os.getcwd()

# All log files for this module live in <cwd>/logs.
logs_dir = os.path.join(script_dir, 'logs')

# exist_ok=True makes this idempotent and avoids the check-then-create race
# of "if not exists: makedirs" when two processes start at the same time.
os.makedirs(logs_dir, exist_ok=True)

# Route every record (DEBUG and above) of the root logger to logs/load.log.
logging.basicConfig(
    filename=os.path.join(logs_dir, 'load.log'),
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


# Connecting to PostgreSQL database
###################################
def connect_db(dbname):
    """
    Open a psycopg2 connection to a PostgreSQL database.

    Connection settings are read from the environment variables DB_NAME,
    DB_USER, DB_PASSWORD, DB_HOST and DB_PORT, each with a fallback default.
    Note that DB_NAME, when set, takes precedence over the ``dbname``
    argument; ``dbname`` is only used when DB_NAME is absent.

    Args:
        dbname: Database name to use when DB_NAME is not set.

    Returns:
        The open connection, or None if connecting failed for any reason
        (the error is logged, never raised).
    """
    password = os.getenv('DB_PASSWORD')
    if password is None:
        # SECURITY: a real credential is committed here as a fallback. It is
        # kept so existing local setups keep working, but it should be moved
        # out of source control; warn so the fallback is never used silently.
        logging.warning(
            "DB_PASSWORD is not set; falling back to the built-in development password"
        )
        password = 'Michel2003'

    try:
        conn = psycopg2.connect(
            dbname=os.getenv('DB_NAME', dbname),
            user=os.getenv('DB_USER', 'postgres'),
            password=password,
            host=os.getenv('DB_HOST', 'localhost'),
            port=os.getenv('DB_PORT', 5432)
        )
    except Exception as error:
        # Best-effort contract: callers test the result against None.
        logging.error(f"Error connecting to the database: {error}")
        return None

    logging.info("Connected to database successfully")
    return conn


# (table name, DDL) pairs in creation order: every dimension first, the fact
# table last because it holds foreign keys to all of them.
# NOTE(review): column `dis_mag_scalle` looks like a typo, and `adm_level INT`
# may not fit values such as '1;2' seen in the staging data. Both are left
# untouched here because renaming/retyping would change the warehouse schema.
_DISASTER_TABLE_DDL = (
    ("dim_disaster_groups", """--sql
            CREATE TABLE IF NOT EXISTS dim_disaster_groups (
                group_id INT PRIMARY KEY,
                group_name VARCHAR,
                parent_group_id INT REFERENCES dim_disaster_groups(group_id)
            );
            """),
    ("dim_disaster_types", """--sql
            CREATE TABLE IF NOT EXISTS dim_disaster_types (
                type_id INT PRIMARY KEY,
                type_name VARCHAR,
                parent_type_id INT REFERENCES dim_disaster_types(type_id)
            );
            """),
    ("dim_disaster_names", """--sql
            CREATE TABLE IF NOT EXISTS dim_disaster_names (
                name_id INT PRIMARY KEY,
                name VARCHAR
            );
            """),
    ("dim_locations", """--sql
            CREATE TABLE IF NOT EXISTS dim_locations (
                location_id INT PRIMARY KEY,
                longitude DOUBLE PRECISION,
                latitude DOUBLE PRECISION,
                country VARCHAR,
                country_code VARCHAR,
                city VARCHAR,
                state VARCHAR
            );
            """),
    ("dim_dates", """--sql
            CREATE TABLE IF NOT EXISTS dim_dates (
                date_id INT PRIMARY KEY,
                disaster_date DATE
            );
            """),
    ("dim_associated_distructions", """--sql
            CREATE TABLE IF NOT EXISTS dim_associated_distructions (
                associated_dis_id INT PRIMARY KEY,
                associated_dis VARCHAR,
                parent_id INT REFERENCES dim_associated_distructions(associated_dis_id)
            );
            """),
    ("dim_ofda_responses", """--sql
            CREATE TABLE IF NOT EXISTS dim_ofda_responses (
                OFDA_resp_id INT PRIMARY KEY,
                OFDA_resp VARCHAR
            );
            """),
    ("dim_appeals", """--sql
            CREATE TABLE IF NOT EXISTS dim_appeals (
                appeal_id INT PRIMARY KEY,
                appeal VARCHAR
            );
            """),
    ("dim_declarations", """--sql
            CREATE TABLE IF NOT EXISTS dim_declarations (
                declaration_id INT PRIMARY KEY,
                declaration VARCHAR
            );
            """),
    ("dim_mag_scales", """--sql
            CREATE TABLE IF NOT EXISTS dim_mag_scales (
                dis_mag_scale_id INT PRIMARY KEY,
                dis_mag_scalle VARCHAR
            );
            """),
    ("dim_adm_levels", """--sql
            CREATE TABLE IF NOT EXISTS dim_adm_levels (
                adm_level_id INT PRIMARY KEY,
                adm_level INT
            );
            """),
    ("fact_disasters", """--sql
            CREATE TABLE IF NOT EXISTS fact_disasters (
                disaster_id INT PRIMARY KEY,
                seq INT,
                glide VARCHAR,
                starting_date_id INT REFERENCES dim_dates(date_id),
                ending_date_id INT REFERENCES dim_dates(date_id),
                group_id INT REFERENCES dim_disaster_groups(group_id),
                type_id INT REFERENCES dim_disaster_types(type_id),
                name_id INT REFERENCES dim_disaster_names(name_id),
                location_id INT REFERENCES dim_locations(location_id),
                duration INT,
                origin VARCHAR,
                associated_dis_id INT REFERENCES dim_associated_distructions(associated_dis_id),
                OFDA_resp_id INT REFERENCES dim_ofda_responses(OFDA_resp_id),
                appeal_id INT REFERENCES dim_appeals(appeal_id),
                declaration_id INT REFERENCES dim_declarations(declaration_id),
                aid_contribution INT,
                dis_mag_value INT,
                dis_mag_scale_id INT REFERENCES dim_mag_scales(dis_mag_scale_id),
                total_deaths INT,
                no_injured INT,
                no_affected INT,
                no_homeless INT,
                total_affected INT,
                insured_damages DOUBLE PRECISION,
                total_damages DOUBLE PRECISION,
                cpi DOUBLE PRECISION,
                adm_level_id INT REFERENCES dim_adm_levels(adm_level_id)
            );
            """),
)


def create_disaster_tables(conn):
    """
    Create the disaster warehouse tables (dimensions, then the fact table).

    All statements run in one transaction: the tables are committed together,
    or, on the first failure, the transaction is rolled back and nothing is
    created. Errors are logged, never raised to the caller.

    Args:
        conn: An open DB-API connection (e.g. from ``connect_db``), or None.
            None is tolerated because ``connect_db`` returns None on failure.

    Returns:
        None. The connection is left open; the cursor is always closed.
    """
    if conn is None:
        logging.error("General error during table creation: no database connection")
        return

    cur = None  # stays None if cursor() itself fails, so `finally` is safe
    try:
        cur = conn.cursor()
        for table_name, ddl in _DISASTER_TABLE_DDL:
            logging.info(f"Creating table: {table_name}")
            try:
                cur.execute(ddl)
            except Exception as e:
                logging.error(f"Error creating {table_name}: {e}")
                # In PostgreSQL a failed statement aborts the transaction and
                # every later statement would fail too, so stop here instead
                # of continuing and reporting a misleading overall success.
                raise
            logging.info(f"Table {table_name} created successfully.")

        conn.commit()
        logging.info("All tables created successfully.")

    except Exception as e:
        logging.error(f"General error during table creation: {e}")
        conn.rollback()

    finally:
        if cur is not None:
            cur.close()

# Function to get data from PostgreSQL and load into a pandas DataFrame
#######################################################################
def get_data_from_db(query,conn):
    """
    Run ``query`` on ``conn`` and return the result set as a pandas DataFrame.

    The connection is consumed: it is closed before returning, whether the
    query succeeded or not. Column names are returned as the database
    reports them.

    Returns None (after logging the reason) when ``conn`` is None or when
    reading the query fails.
    """
    # Guard clause: an upstream connect failure hands us None.
    if conn is None:
        logging.error("Connection to database failed")
        return None

    result = None
    try:
        result = pd.read_sql_query(query, conn)
        logging.info(f"Data fetched successfully for query: {query}")
    except Exception as error:
        logging.error(f"Error fetching data: {error}")
        result = None
    finally:
        # This function owns the connection it was given.
        conn.close()
    return result

if __name__=="__main__":
    # Create the warehouse schema. connect_db returns None on failure, so
    # only attempt table creation with a live connection, and release that
    # connection afterwards (create_disaster_tables does not close it).
    dwh_conn = connect_db('disasters_dwh')
    if dwh_conn is not None:
        try:
            create_disaster_tables(dwh_conn)
        finally:
            dwh_conn.close()

    # Pull the whole staging table; get_data_from_db closes the connection
    # it is handed and yields None if the connection or query failed.
    disasters=get_data_from_db("""--sql
                     SELECT * FROM staging_disasters;""",connect_db('staging_disasters'))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  df = pd.read_sql_query(query, conn)


In [3]:
disasters.columns

Index(['Year', 'Seq', 'Glide', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype',
       'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value',
       'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin',
       'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month',
       'End Day', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless',
       'Total Affected', 'Insured Damages ('000 US$)',
       'Total Damages ('000 US$)', 'CPI', 'Adm Level', 'Admin1 Code',
       'Admin2 Code', 'Geo Locations', 'extraction_time'],
      dtype='object')

In [4]:
# I need to create IDs for:
##### Disaster Group / Disaster Subgroup: create a parent_id column first, then combine both columns into one hierarchy and assign IDs to every entry
##### Do the same for the disaster types, subtypes and sub-subtypes.
##### Create a date table running from the minimum date to the maximum date, assign each date an ID, and then join it to the fact table
##### Get the Associated Dis ID, then combine it with Associated Dis2 and create a parent ID
##### Event name ID
##### Location ID for all the location fields
##### OFDA response ID
##### Appeal ID
##### Declaration ID
##### Dis Mag Scale ID
##### Adm level ID
##### Origin ID

In [5]:
def create_hierarchy(df, level_cols, id_col_name):
    """
    Build a parent/child dimension from nested category columns.

    The levels in ``level_cols`` are processed from outermost to innermost.
    Every distinct value of every level gets the next incremental ID
    (starting at 1). For levels after the first, the parent is looked up from
    the value found in the previous level's column on the first matching row.
    Each row's ``id_col_name`` ends up holding the ID of the deepest level
    whose value matched, because deeper levels overwrite shallower ones.

    The input frame is not modified; a copy is annotated and returned.

    Args:
        df: Source DataFrame containing all of ``level_cols``.
        level_cols: Column names ordered from top level to lowest level.
        id_col_name: Name of the ID column added to the returned frame.

    Returns:
        (hierarchy, df): ``hierarchy`` has columns ``id``, ``name`` and
        ``parent_id``; ``df`` is the copy with ``id_col_name`` added and
        ``level_cols`` dropped.
    """
    # Work on a copy: writing the ID column into the caller's frame is a
    # hidden side effect and raises SettingWithCopyWarning on sliced frames.
    df = df.copy()
    df[id_col_name] = None

    records = []     # hierarchy rows, collected once instead of per-row concat
    # Maps a value to its assigned ID. NOTE: keyed by value only, so a name
    # that reappears at another level replaces the earlier entry.
    parent_ids = {}
    current_id = 1

    for depth, level in enumerate(level_cols):
        parent_col = level_cols[depth - 1] if depth > 0 else None

        for value in df[level].unique():
            # A null `value` matches no row here (NaN/None never compare
            # equal), so it gets a hierarchy entry but no parent and no rows.
            mask = df[level] == value

            parent_id = None
            if parent_col is not None and mask.any():
                parent_value = df.loc[mask, parent_col].values[0]
                parent_id = parent_ids.get(parent_value, None)

            records.append({'id': current_id, 'name': value, 'parent_id': parent_id})
            parent_ids[value] = current_id
            df.loc[mask, id_col_name] = current_id
            current_id += 1

    hierarchy = pd.DataFrame(records, columns=['id', 'name', 'parent_id'])
    df = df.drop(columns=level_cols)
    return hierarchy, df

In [6]:
# Build the three parent/child dimensions. Each call returns the dimension
# table plus a new `disasters` frame in which the listed level columns have
# been replaced by a single ID column, so the calls are chained by rebinding
# `disasters` and their order fixes the order of the new ID columns.

# Type -> Subtype -> Subsubtype, keyed by type_id.
dim_disaster_types, disasters = create_hierarchy(disasters, ['Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype'],'type_id')
# Group -> Subgroup, keyed by group_id.
dim_disaster_groups ,disasters= create_hierarchy(disasters,['Disaster Group', 'Disaster Subgroup'],'group_id')
# NOTE(review): 'Associated Dis2' is treated here as a child level of
# 'Associated Dis'; rows with both values keep only the ID of the second
# one. Confirm this parent/child reading is intended.
dim_associated_distructions, disasters=create_hierarchy(disasters,['Associated Dis', 'Associated Dis2'],'associated_dis_id')

In [7]:
def create_incremental_ids(df, column_names, id_column_name='location_id'):
    """
    Replace a set of columns by a surrogate key.

    Each distinct combination of values in ``column_names`` is numbered
    1, 2, 3, ... in order of first appearance. The numbering is joined back
    onto ``df`` and the original columns are then removed.

    Returns a tuple ``(df, lookup)``: the frame carrying ``id_column_name``
    instead of ``column_names``, and the lookup table holding every distinct
    combination together with its ID.
    """
    # One row per distinct combination, re-indexed 0..n-1.
    lookup = df[column_names].drop_duplicates().reset_index(drop=True)
    # The fresh index is 0-based, so shifting it by one yields IDs 1..n.
    lookup[id_column_name] = lookup.index + 1

    keyed = pd.merge(df, lookup, on=column_names, how='left')
    return keyed.drop(columns=column_names), lookup

In [8]:
# Build the flat (non-hierarchical) dimensions. Each call returns the
# `disasters` frame with the listed columns replaced by one surrogate-key
# column, plus the matching lookup table; `disasters` is rebound every time,
# so the calls must run in sequence.

# NOTE(review): the columns kept in dim_locations here (Country, ISO, Region,
# Continent, Location, Latitude, Longitude) differ from the dim_locations
# table definition (country, country_code, city, state, ...) - confirm the
# mapping before loading.
disasters, dim_locations=create_incremental_ids(disasters,['Country', 'ISO', 'Region', 'Continent', 'Location','Latitude','Longitude'],'location_id')
disasters, dim_disaster_names=create_incremental_ids(disasters,['Event Name'],'name_id')
disasters, dim_ofda_responses=create_incremental_ids(disasters,['OFDA Response'],'OFDA_resp_id')
disasters, dim_appeals=create_incremental_ids(disasters,['Appeal'],'appeal_id')
disasters, dim_declarations=create_incremental_ids(disasters,['Declaration'],'declaration_id')
disasters, dim_mag_scales=create_incremental_ids(disasters,['Dis Mag Scale'],'dis_mag_scale_id')
disasters, dim_adm_levels=create_incremental_ids(disasters,['Adm Level'],'adm_level_id')
# NOTE(review): fact_disasters declares `origin VARCHAR` and has no origin_id
# column, so this key currently has no counterpart in the table definition.
disasters, dim_disasters_origin=create_incremental_ids(disasters,['Origin'],'origin_id')

In [9]:
disasters

Unnamed: 0,Year,Seq,Glide,Aid Contribution,Dis Mag Value,Local Time,River Basin,Start Year,Start Month,Start Day,...,group_id,associated_dis_id,location_id,name_id,OFDA_resp_id,appeal_id,declaration_id,dis_mag_scale_id,adm_level_id,origin_id
0,1902,12,,,8,20:20,,1902,4,18,...,2,1,1,1,1,1,1,1,1,1
1,1902,3,,,,,,1902,4,8,...,2,,2,2,1,1,1,2,1,1
2,1902,10,,,,,,1902,10,24,...,2,,2,2,1,1,1,2,1,1
3,1903,6,,,,,,1903,4,29,...,2,,3,1,1,1,1,2,1,1
4,1903,12,,,,,,1903,,,...,2,,4,3,1,2,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48367,2021,449,FL-2021-000110,,,,,2021,7,16,...,4,,13533,1,1,1,1,4,1,6
48368,2021,75,,,,,,2021,2,1,...,4,,13534,1,1,1,1,4,4,6
48369,2021,599,EP-2021-000138,,,,,2021,9,7,...,5,,13535,1051,1,1,3,5,1,1
48370,2021,20,,,,,,2021,1,11,...,4,,13536,1,1,1,3,4,3,6


In [10]:
def generate_date_ids(df, start_date_col, end_date_col, id_name='date_id'):
    """
    Build a daily date dimension and attach its IDs to a frame.

    Both date columns are parsed with ``pd.to_datetime``. The dimension holds
    one row per calendar day between the earliest and the latest non-null
    date found in either column, numbered from 1. The returned frame gains
    ``<start_date_col>_id`` and ``<end_date_col>_id``; rows whose date is
    missing (NaT) get a null ID.

    The input frame is not modified. When neither column contains a valid
    date, the dimension is empty and every ID is null (instead of failing
    with "Neither `start` nor `end` can be NaT").

    Args:
        df: Source DataFrame.
        start_date_col: Name of the start-date column.
        end_date_col: Name of the end-date column.
        id_name: Name of the ID column in the date dimension.

    Returns:
        (df, date_dimension): the annotated copy of ``df`` and a DataFrame
        with columns ``Date`` and ``id_name``.
    """
    df = df.copy()  # do not convert the caller's columns in place
    df[start_date_col] = pd.to_datetime(df[start_date_col])
    df[end_date_col] = pd.to_datetime(df[end_date_col])

    # Pool both columns and drop NaT so the bounds are always real dates;
    # builtin min()/max() are unreliable when one side is NaT.
    known_dates = pd.concat([df[start_date_col], df[end_date_col]]).dropna()
    if known_dates.empty:
        all_dates = pd.DatetimeIndex([])
    else:
        all_dates = pd.date_range(
            start=known_dates.min(), end=known_dates.max(), freq='D'
        )

    date_dimension = pd.DataFrame({'Date': all_dates})
    date_dimension[id_name] = range(1, len(date_dimension) + 1)

    # Same lookup for both columns: join on the date, keep only the ID.
    for date_col in (start_date_col, end_date_col):
        df = pd.merge(df, date_dimension, left_on=date_col, right_on='Date', how='left')
        df = df.rename(columns={id_name: f'{date_col}_id'}).drop(columns='Date')

    return df, date_dimension

In [11]:
dim_appeals

Unnamed: 0,Appeal,appeal_id
0,,1
1,No,2
2,Yes,3


In [12]:
dim_adm_levels

Unnamed: 0,Adm Level,adm_level_id
0,,1
1,2,2
2,1,3
3,1;2,4


In [13]:
dim_mag_scales

Unnamed: 0,Dis Mag Scale,dis_mag_scale_id
0,Richter,1
1,,2
2,Kph,3
3,Km2,4
4,Vaccinated,5
5,°C,6


In [14]:
dim_ofda_responses

Unnamed: 0,OFDA Response,OFDA_resp_id
0,,1
1,Yes,2


In [15]:
dim_locations

Unnamed: 0,Country,ISO,Region,Continent,Location,Latitude,Longitude,location_id
0,Guatemala,GTM,Central America,Americas,"Quezaltenango, San Marcos",14,-91,1
1,Guatemala,GTM,Central America,Americas,,,,2
2,Canada,CAN,Northern America,Americas,"Frank, Alberta",,,3
3,Comoros (the),COM,Eastern Africa,Africa,,,,4
4,Bangladesh,BGD,Southern Asia,Asia,Chittagong,,,5
...,...,...,...,...,...,...,...,...
13532,Yemen,YEM,Western Asia,Asia,"Dhamar, Amran, Al Mahwit, Marib, Ibb, Sana’a C...",,,13533
13533,South Africa,ZAF,Southern Africa,Africa,"Mpumalanga Province, Free State Province and t...",,,13534
13534,Congo (the Democratic Republic of the),COD,Middle Africa,Africa,Tshopo province,,,13535
13535,Serbia,SRB,Southern Europe,Europe,"Zitoradja (Toplicki), Doljevac (Nisavski) , Di...",,,13536


In [16]:
dim_disaster_groups

Unnamed: 0,id,name,parent_id
0,1,Natural,
1,2,Geophysical,1.0
2,3,Meteorological,1.0
3,4,Hydrological,1.0
4,5,Biological,1.0
5,6,Climatological,1.0
6,7,Extra-terrestrial,1.0


In [17]:
dim_disaster_types

Unnamed: 0,id,name,parent_id
0,1,Earthquake,
1,2,Volcanic activity,
2,3,Mass movement (dry),
3,4,Storm,
4,5,Flood,
5,6,Epidemic,
6,7,Drought,
7,8,Landslide,
8,9,Wildfire,
9,10,Extreme temperature,


In [18]:
dim_disaster_names

Unnamed: 0,Event Name,name_id
0,,1
1,Santa Maria,2
2,Mount Karthala,3
3,Bubonic,4
4,Pneumonic,5
...,...,...
1567,Dixie fire,1568
1568,Caldor fire,1569
1569,Telegraph and Mescal Fires,1570
1570,Bootleg Fire,1571


In [19]:
dim_locations

Unnamed: 0,Country,ISO,Region,Continent,Location,Latitude,Longitude,location_id
0,Guatemala,GTM,Central America,Americas,"Quezaltenango, San Marcos",14,-91,1
1,Guatemala,GTM,Central America,Americas,,,,2
2,Canada,CAN,Northern America,Americas,"Frank, Alberta",,,3
3,Comoros (the),COM,Eastern Africa,Africa,,,,4
4,Bangladesh,BGD,Southern Asia,Asia,Chittagong,,,5
...,...,...,...,...,...,...,...,...
13532,Yemen,YEM,Western Asia,Asia,"Dhamar, Amran, Al Mahwit, Marib, Ibb, Sana’a C...",,,13533
13533,South Africa,ZAF,Southern Africa,Africa,"Mpumalanga Province, Free State Province and t...",,,13534
13534,Congo (the Democratic Republic of the),COD,Middle Africa,Africa,Tshopo province,,,13535
13535,Serbia,SRB,Southern Europe,Europe,"Zitoradja (Toplicki), Doljevac (Nisavski) , Di...",,,13536


In [20]:
dim_associated_distructions

Unnamed: 0,id,name,parent_id
0,1,Tsunami/Tidal wave,
1,2,,
2,3,"Slide (land, mud, snow, rock)",
3,4,Famine,
4,5,Heat wave,
...,...,...,...
57,58,Industrial accidents,47
58,59,Fog,37
59,60,Epidemic,35
60,61,Oil spill,38


In [21]:
dim_ofda_responses

Unnamed: 0,OFDA Response,OFDA_resp_id
0,,1
1,Yes,2


In [22]:
dim_appeals

Unnamed: 0,Appeal,appeal_id
0,,1
1,No,2
2,Yes,3


In [23]:
dim_declarations

Unnamed: 0,Declaration,declaration_id
0,,1
1,No,2
2,Yes,3


In [24]:
dim_mag_scales

Unnamed: 0,Dis Mag Scale,dis_mag_scale_id
0,Richter,1
1,,2
2,Kph,3
3,Km2,4
4,Vaccinated,5
5,°C,6


In [25]:
dim_adm_levels

Unnamed: 0,Adm Level,adm_level_id
0,,1
1,2,2
2,1,3
3,1;2,4


In [26]:
dim_disasters_origin

Unnamed: 0,Origin,origin_id
0,,1
1,Earthquake,2
2,Volacno and landslide,3
3,Late arrival of rain,4
4,Typhoon,5
...,...,...
660,Seismic activities,661
661,Active monsoonal rainfall,662
662,Polar Vortex,663
663,"Heat, drought and winds",664


In [27]:
disasters.columns

Index(['Year', 'Seq', 'Glide', 'Aid Contribution', 'Dis Mag Value',
       'Local Time', 'River Basin', 'Start Year', 'Start Month', 'Start Day',
       'End Year', 'End Month', 'End Day', 'Total Deaths', 'No Injured',
       'No Affected', 'No Homeless', 'Total Affected',
       'Insured Damages ('000 US$)', 'Total Damages ('000 US$)', 'CPI',
       'Admin1 Code', 'Admin2 Code', 'Geo Locations', 'extraction_time',
       'type_id', 'group_id', 'associated_dis_id', 'location_id', 'name_id',
       'OFDA_resp_id', 'appeal_id', 'declaration_id', 'dis_mag_scale_id',
       'adm_level_id', 'origin_id'],
      dtype='object')

In [28]:
import logging
from extract import *
from transform import *
from load import * 

from datetime import datetime

# # Set up logging
# logging.basicConfig(filename='etl_logs.log', 
#                     level=logging.INFO, 
#                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logger = logging.getLogger(__name__)

#if __name__ == "__main__":


###########
# Extract #
###########

# logger.info("Starting extraction process...")

# # Connect to database
# conn = connect_db('staging_disasters')
# if conn:
#     cursor = conn.cursor()

#     # Process CSV file and load data into staging table
#     process_csv_to_staging(cursor, "staging_disasters", r"C:\Users\Legion\Desktop\official_data\1900_2021_DISASTERS.csv")

#     conn.commit()
#     cursor.close()
#     conn.close()

#     logger.info("Extraction completed successfully.")
# else:
#     logger.error("Database connection failed.")



#############
# Transform #
#############

# logger.info("Starting transformation process...")

# Fetch only the staging rows extracted today (compared as YYYYMMDD against
# the local clock of this process).
# NOTE(review): the second argument is a database name, so this presumably
# calls the get_data_from_db star-imported from `load` - confirm.
disasters = get_data_from_db(f""" --sql
        SELECT * FROM staging_disasters 
        WHERE TO_CHAR(extraction_time, 'YYYYMMDD') = '{datetime.now().strftime("%Y%m%d")}';
    """, 'staging_disasters')

columns_to_remove = ['Local Time', 'River Basin', 'Admin1 Code', 'Admin2 Code', 'Geo Locations']

# Reset explicitly so a failed fetch can never fall through to a value left
# over from an earlier run of this cell (or to a NameError on a fresh kernel).
transformed_disasters = None
if disasters is not None:
    transformed_disasters = transform_data(disasters, columns_to_remove)
    print(transformed_disasters.columns)

dwh_conn = connect_db('disasters_dwh')
if not dwh_conn:
    logging.warning("Skipping dimension generation: no data warehouse connection")
elif transformed_disasters is None:
    logging.warning("Skipping dimension generation: no transformed data available")
else:
    # Generate dimensions
    (
        fact_disasters,
        dim_disaster_types,
        dim_disaster_groups,
        dim_associated_distructions,
        dim_locations,
        dim_disaster_names,
        dim_ofda_responses,
        dim_appeals,
        dim_declarations,
        dim_mag_scales,
        dim_adm_levels,
        dim_disasters_origin,
        dim_dates,
    ) = generate_dimensions(transformed_disasters)

    # The fact table keeps the date IDs only, not the raw date columns.
    fact_disasters=remove_columns(fact_disasters,['starting_date','ending_date'],'disasters')



  df = pd.read_sql_query(query, conn)


Index(['year', 'seq', 'glide', 'disaster_group', 'disaster_subgroup',
       'disaster_type', 'disaster_subtype', 'disaster_subsubtype',
       'event_name', 'country', 'iso', 'region', 'continent', 'location',
       'origin', 'associated_dis', 'associated_dis2', 'ofda_response',
       'appeal', 'declaration', 'aid_contribution', 'dis_mag_value',
       'dis_mag_scale', 'latitude', 'longitude', 'local_time', 'river_basin',
       'start_year', 'start_month', 'start_day', 'end_year', 'end_month',
       'end_day', 'total_deaths', 'no_injured', 'no_affected', 'no_homeless',
       'total_affected', 'insured_damages_('000_us$)',
       'total_damages_('000_us$)', 'cpi', 'adm_level', 'admin1_code',
       'admin2_code', 'geo_locations', 'extraction_time', 'starting_date',
       'ending_date', 'duration_days'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[id_col_name] = None


In [29]:
fact_disasters.columns

Index(['id', 'year', 'seq', 'glide', 'aid_contribution', 'dis_mag_value',
       'latitude', 'longitude', 'local_time', 'river_basin', 'start_year',
       'start_month', 'start_day', 'end_year', 'end_month', 'end_day',
       'total_deaths', 'no_injured', 'no_affected', 'no_homeless',
       'total_affected', 'insured_damages_('000_us$)',
       'total_damages_('000_us$)', 'cpi', 'admin1_code', 'admin2_code',
       'geo_locations', 'extraction_time', 'duration_days', 'type_id',
       'group_id', 'associated_dis_id', 'location_id', 'name_id',
       'ofda_resp_id', 'appeal_id', 'declaration_id', 'dis_mag_scale_id',
       'adm_level_id', 'origin_id', 'starting_date_id', 'ending_date_id'],
      dtype='object')

In [30]:
df=fact_disasters.describe()

In [31]:
transformed_disasters

Unnamed: 0,year,seq,glide,disaster_group,disaster_subgroup,disaster_type,disaster_subtype,disaster_subsubtype,event_name,country,...,cpi,adm_level,admin1_code,admin2_code,geo_locations,extraction_time,starting_date,ending_date,duration_days,id
0,1902,12,,Natural,Geophysical,Earthquake,Ground movement,,,Guatemala,...,3.350513162,,,,,2024-09-15 11:36:17.426008,1902-04-18,1902-04-18,0.0,1
1,1902,3,,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Guatemala,...,3.350513162,,,,,2024-09-15 11:36:17.426008,1902-04-08,1902-04-08,0.0,2
2,1902,10,,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Guatemala,...,3.350513162,,,,,2024-09-15 11:36:17.426008,1902-10-24,1902-10-24,0.0,3
3,1903,6,,Natural,Geophysical,Mass movement (dry),Rockfall,,,Canada,...,3.479379053,,,,,2024-09-15 11:36:17.426008,1903-04-29,1903-04-29,0.0,4
4,1903,12,,Natural,Geophysical,Volcanic activity,Ash fall,,Mount Karthala,Comoros (the),...,3.479379053,,,,,2024-09-15 11:36:17.426008,NaT,NaT,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48367,2021,449,FL-2021-000110,Natural,Hydrological,Flood,,,,Yemen,...,,,,,,2024-09-15 14:39:46.392551,2021-07-16,2021-08-07,22.0,48368
48368,2021,75,,Natural,Hydrological,Flood,,,,South Africa,...,,1;2,2707;77311;77312;77313;77315,77364;77367,"Free State, KwaZulu-Natal, Limpopo, Mpumalanga...",2024-09-15 14:39:46.392551,2021-02-01,2021-02-15,14.0,48369
48369,2021,599,EP-2021-000138,Natural,Biological,Epidemic,Viral disease,,Meningitis,Congo (the Democratic Republic of the),...,,,,,,2024-09-15 14:39:46.392551,2021-09-07,2021-09-13,6.0,48370
48370,2021,20,,Natural,Hydrological,Flood,,,,Serbia,...,,1,25374;25378;25379;25380;25383;25386;25397,,"Jablanicki, Kosovski, Kosovsko-mitrovatski, Ko...",2024-09-15 14:39:46.392551,2021-01-11,2021-01-12,1.0,48371


In [32]:
transformed_disasters.describe()

Unnamed: 0,extraction_time,starting_date,ending_date,duration_days,id
count,48372,37488,37695,37155.0,48372.0
mean,2024-09-15 13:37:50.874087936,1999-07-13 00:54:58.079385344,1999-08-11 13:00:47.847194496,9.981429,24186.5
min,2024-09-15 11:36:17.426008,1900-01-06 00:00:00,1900-01-06 00:00:00,0.0,1.0
25%,2024-09-15 11:36:17.426008064,1992-08-28 18:00:00,1992-10-09 00:00:00,0.0,12093.75
50%,2024-09-15 14:37:28.803705088,2003-09-12 00:00:00,2003-09-22 00:00:00,0.0,24186.5
75%,2024-09-15 14:39:46.392550912,2012-08-01 06:00:00,2012-08-29 00:00:00,4.0,36279.25
max,2024-09-15 14:39:46.392551,2021-10-07 00:00:00,2021-10-07 00:00:00,18262.0,48372.0
std,,,,167.574779,13963.937947


In [1]:
        #############
        # Transform #
        #############

import logging
from extract import *
from transform import *
from load import * 

from datetime import datetime

# Fetch only the staging rows extracted today (compared as YYYYMMDD against
# the local clock of this process).
# NOTE(review): the second argument is a database name, so this presumably
# calls the get_data_from_db star-imported from `load` - confirm.
disasters = get_data_from_db(f""" --sql
        SELECT * FROM staging_disasters 
        WHERE TO_CHAR(extraction_time, 'YYYYMMDD') = '{datetime.now().strftime("%Y%m%d")}';
    """, 'staging_disasters')

columns_to_remove = ['year','Year','local_time', 'river_basin', 'admin1_code', 'admin2_code', 'geo_locations',
                        'start_year','start_month', 'start_day', 'end_year', 'end_month', 'end_day','longitude','latitude'
                        ]

# Reset explicitly so a failed fetch can never fall through to a value left
# over from an earlier run of this cell (or to a NameError on a fresh kernel).
transformed_disasters = None
if disasters is not None:
    disasters=remove_columns(disasters,['Year'],'disasters')
    transformed_disasters = transform_data(disasters, columns_to_remove)
    # Strip the unit suffix from the damage columns.
    transformed_disasters=rename_column(transformed_disasters,"insured_damages_('000_us$)",'insured_damages')
    transformed_disasters=rename_column(transformed_disasters,"total_damages_('000_us$)","total_damages")

    print(transformed_disasters.columns)




########
# Load #
########

# Connect to the data warehouse
dwh_conn = connect_db('disasters_dwh')
if not dwh_conn:
    logging.warning("Skipping dimension generation: no data warehouse connection")
elif transformed_disasters is None:
    logging.warning("Skipping dimension generation: no transformed data available")
else:
    (
        fact_disasters,
        dim_disaster_types,
        dim_disaster_groups,
        dim_associated_distructions,
        dim_locations,
        dim_disaster_names,
        dim_ofda_responses,
        dim_appeals,
        dim_declarations,
        dim_mag_scales,
        dim_adm_levels,
        dim_disasters_origin,
        dim_dates,
    ) = generate_dimensions(transformed_disasters)

    # The fact table keeps the date IDs only, not the raw date columns.
    fact_disasters=remove_columns(fact_disasters,['starting_date','ending_date'],'disasters')

  df = pd.read_sql_query(query, conn)
Invalid date encountered: Year=nan, Month=nan, Day=nan
Invalid date encountered: Year=nan, Month=nan, Day=nan
Error during data transformation: Can only use .dt accessor with datetimelike values


Index(['seq', 'glide', 'disaster_group', 'disaster_subgroup', 'disaster_type',
       'disaster_subtype', 'disaster_subsubtype', 'event_name', 'country',
       'iso', 'region', 'continent', 'location', 'origin', 'associated_dis',
       'associated_dis2', 'ofda_response', 'appeal', 'declaration',
       'aid_contribution', 'dis_mag_value', 'dis_mag_scale', 'latitude',
       'longitude', 'local_time', 'river_basin', 'start_year', 'start_month',
       'start_day', 'end_year', 'end_month', 'end_day', 'total_deaths',
       'no_injured', 'no_affected', 'no_homeless', 'total_affected',
       'insured_damages', 'total_damages', 'cpi', 'adm_level', 'admin1_code',
       'admin2_code', 'geo_locations', 'extraction_time', 'starting_date',
       'ending_date'],
      dtype='object')


ValueError: Neither `start` nor `end` can be NaT