In [1]:
import requests
import pandas as pd
import time
import sqlite3

def fetch_data(url, params, timeout=30):
    """Download every page of a paginated listing and return it as a DataFrame.

    The endpoint is requested repeatedly, starting at ``params["page"]`` and
    incrementing the page number until it reaches the ``totalPages`` value
    found under ``pagination`` in the response, or until a request returns a
    non-200 status. Every item gains a ``FullURL`` key built from the
    response's ``resourceBaseUri`` and the item's ``ResourceName``.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query parameters; must contain an integer ``"page"`` entry. A copy is
        used for paging, so the caller's dict is not modified.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per fetched item with duplicates on ``_id`` removed (first
        occurrence kept). Empty when nothing could be fetched.
    """
    # Page through a private copy: bumping "page" on the caller's dict would
    # leak paging state into whatever the caller does with it next.
    query = dict(params)
    data_list = []
    while True:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, params=query, timeout=timeout)
        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}")
            break
        data = response.json()
        for item in data['items']:
            # use 'resourceBaseUri' from data
            item['FullURL'] = data['resourceBaseUri'] + '/' + item['ResourceName']
            data_list.append(item)
        total_pages = data['pagination']['totalPages']
        print(f"Current page: {query['page']}, Total pages: {total_pages}, Total items fetched: {len(data_list)}")
        if query["page"] >= total_pages:
            break
        query["page"] += 1
        time.sleep(1)  # pause between consecutive page requests
    df = pd.DataFrame(data_list)
    if df.empty:
        # No rows means no '_id' column; de-duplicating would raise KeyError.
        return df
    df.drop_duplicates(subset='_id', keep='first', inplace=True)  # Drop duplicate rows based on the '_id' column
    return df


# Endpoints of the two static-report listings queried below.
url1 = "https://reports.semopx.com/api/v1/documents/static-reports/"
url2 = "https://reports.sem-o.com/api/v1/documents/static-reports/"

# Query sent to url1. "DPuG_ID" starts blank and is filled in per report
# before each fetch; "page" is the starting page.
params1 = {
    "page": 1,
    "sort_by": "PublishTime",
    "order_by": "DESC",
    "name": "",
    "DPuG_ID": "",
    "group[]": ["Market Data"],
    "date_from": "",
    "date_to": "",
    "page_size": 120,
}

# Query sent to url2: same fields as params1, only the requested groups differ.
# Overriding "group[]" while unpacking keeps the key in its original position,
# and the result is an independent dict with its own list.
params2 = {
    **params1,
    "group[]": [
        "Capacity Qualification",
        "Capacity Auctions",
        "Forecast Data",
        "Inputs, Commerical and Technical Offer Data",
        "Registration",
        "Settlement Data",
    ],
}

# Report identifiers (DPuG IDs) to download from each listing.
SEMOPX_DPuG_IDs = ["EA-004"]
SEMO_DPuG_IDs = ["BM-010", "BM-013"]

# Fetched listings, keyed by DPuG ID.
dataframes_dict = {}

# Each triple pairs an endpoint with its query dict and the IDs to request.
for endpoint, query, report_ids in (
    (url1, params1, SEMOPX_DPuG_IDs),
    (url2, params2, SEMO_DPuG_IDs),
):
    for DPuG_ID in report_ids:
        query["DPuG_ID"] = DPuG_ID
        query["page"] = 1  # start every report from the first page
        dataframes_dict[DPuG_ID] = fetch_data(endpoint, query)

# Turn list-valued 'Group' cells into comma-separated text
# (presumably so the column can be written to SQLite later).
for df_key, frame in dataframes_dict.items():
    if 'Group' in frame.columns:
        # Join only real sequences. Missing values would make ', '.join raise,
        # and an already-joined string would be split into single characters
        # if this cell were executed a second time.
        frame['Group'] = frame['Group'].apply(
            lambda groups: ', '.join(groups) if isinstance(groups, (list, tuple)) else groups
        )
    else:
        print(f"Dataframe {df_key} does not contain 'Group' column")

# Store every fetched DataFrame in its own table of the SQLite database,
# merging with rows already stored by earlier runs.
conn = sqlite3.connect('SEMOPX.db')
try:
    cursor = conn.cursor()

    # Iterate over all dataframes in the dictionary
    for df_key in dataframes_dict.keys():
        frame = dataframes_dict[df_key]

        # A frame with no rows has nothing to store, and a column-less frame
        # cannot be turned into a table.
        if frame.empty:
            print(f"Dataframe {df_key} is empty, nothing written")
            continue

        # Replace hyphen in the dataframe key with underscore for table name
        table_name = df_key.replace('-', '_')

        # Strip parentheses from the column names. regex=True is required:
        # since pandas 2.0 str.replace treats the pattern as a literal string
        # by default, which would leave the parentheses in place.
        frame.columns = frame.columns.str.replace("[()]", "", regex=True)

        # Check whether the table already exists (value bound as a parameter
        # rather than formatted into the SQL text).
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
            (table_name,),
        )
        table_exists = cursor.fetchone()

        if table_exists:
            # Load the existing data
            df_existing = pd.read_sql_query(f'SELECT * FROM "{table_name}"', conn)

            # Freshly fetched rows come first so they win over stored rows
            # that share the same '_id'.
            df_new = pd.concat([frame, df_existing])
            df_new.drop_duplicates(subset='_id', keep='first', inplace=True)

            # Rewrite the table with the merged set of rows
            df_new.to_sql(table_name, conn, if_exists='replace', index=False)

        # If table does not exist, write the DataFrame
        else:
            frame.to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Close the connection even when one of the steps above raises
    conn.close()


Current page: 1, Total pages: 13, Total items fetched: 120
Current page: 2, Total pages: 13, Total items fetched: 240
Current page: 3, Total pages: 13, Total items fetched: 360
Current page: 4, Total pages: 13, Total items fetched: 480
Current page: 5, Total pages: 13, Total items fetched: 600
Current page: 6, Total pages: 13, Total items fetched: 720
Current page: 7, Total pages: 13, Total items fetched: 840
Current page: 8, Total pages: 13, Total items fetched: 960
Current page: 9, Total pages: 13, Total items fetched: 1080
Current page: 10, Total pages: 13, Total items fetched: 1200
Current page: 11, Total pages: 13, Total items fetched: 1320
Current page: 12, Total pages: 13, Total items fetched: 1440
Current page: 13, Total pages: 13, Total items fetched: 1460
Current page: 1, Total pages: 96, Total items fetched: 120
Current page: 2, Total pages: 96, Total items fetched: 240
Current page: 3, Total pages: 96, Total items fetched: 360
Current page: 4, Total pages: 96, Total items f

In [89]:
# Read the stored listings back from SQLite and collect each one's 'FullURL' values.
conn = sqlite3.connect('SEMOPX.db')
try:
    # Read from the "EA_004" table into a DataFrame
    bidaskcurves_df = pd.read_sql_query("SELECT * FROM EA_004", conn)

    # Read from the "BM_013" table into a DataFrame
    FourDayWindUnitForecast_df = pd.read_sql_query("SELECT * FROM BM_013", conn)

    # Read from the "BM_010" table into a DataFrame
    DailyLoadForecast_df = pd.read_sql_query("SELECT * FROM BM_010", conn)
finally:
    # Release the connection even if one of the tables is missing and a query raises
    conn.close()

# The URL lists only need the loaded frames, not the open connection
bidaskcurves_list = bidaskcurves_df['FullURL'].tolist()
FourDayWindUnitForecast_list = FourDayWindUnitForecast_df['FullURL'].tolist()
DailyLoadForecast_list = DailyLoadForecast_df['FullURL'].tolist()


['https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA3_20230604_20230604151633.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-DA_20230605_20230604131316.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA2_20230604_20230604093537.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA1_20230604_20230603190712.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA3_20230603_20230603151619.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-DA_20230604_20230603131200.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA2_20230603_20230603093525.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA1_20230603_20230602191446.xml', 'https://reports.semopx.com/api/v1/documents/static-reports/BidAskCurves_SEM-IDA3_20230602_20230602151632.x