# Grupo 3 del proyecto de data mining - Análisis de datos

***Authors: H. Kanza, N. Matte, J. Escrihuela, A. Ortiz, ...***

In [None]:
# Imports
import requests
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import os


# I. Import data from the website

First, I download the HTML page where I can find the URLs of every monthly file I want to analyse.

In [3]:
# URL of the OMIE page that lists the monthly curve files.
# (The variable keeps its name `url_csv`, but here it points to an HTML page.)
url_csv = "https://www.omie.es/en/file-access-list?parents%5B0%5D=/&parents%5B1%5D=Day-ahead%20Market&parents%5B2%5D=3.%20Curves&dir=Monthly%20files%20with%20aggregate%20supply%20and%20demand%20curves%20of%20Day-ahead%20market%20including%20bid%20units&realdir=curva_pbc_uof"

# Path where I want to save the page
path = "data/webpage_omie.html"

# Create the target directory if needed; otherwise open() below raises
# FileNotFoundError on a fresh checkout where "data/" does not exist yet.
os.makedirs(os.path.dirname(path), exist_ok=True)

# GET request to try to access URL data.
# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; fail after 30 seconds instead.
response = requests.get(url_csv, timeout=30)

# I make sure the request succeeded
if response.status_code == 200:
    # ... so I write the raw bytes to a local file, at the specified path
    with open(path, "wb") as f:
        f.write(response.content)
    print(f"File saved at {path}")
else:
    print(f"Request error code: {response.status_code}")


File saved at data/webpage_omie.html


Now, from the saved HTML, I need to extract the lines that contain the URLs of all the curve files.

In [4]:
# I read the HTML file; the file only needs to stay open for the read itself
with open(path, "r", encoding="utf-8") as file:
    html_content = file.read()

# Pattern with characters before and after: each match is a whole line that
# contains "curva_pbc_uof_". The tail must be "_.*" (underscore, then the rest
# of the line); "_*." would mean "zero or more underscores, then exactly one
# character" and would cut every match right after the prefix.
pattern = r".*curva_pbc_uof_.*"

# Find occurrences that match the pattern ("." never crosses a newline,
# so there is at most one match per line)
matches = re.findall(pattern, html_content)

if matches:
    print(len(matches), "occurences found on the webpage.")
else:
    print("No occurence found.")

76 occurences found on the webpage.


Now I need to process the matches to extract only the URLs I need.

In [5]:
# TODO: placeholder cell — the step that turns `matches` into the list of
# download URLs is not implemented yet.
...

Ellipsis

Finally, I can download the zip files. As a test, I start with a single file.

In [6]:
# I download the first file, just for a test

url_csv = "https://www.omie.es/en/file-download?parents%5B0%5D=curva_pbc_uof&filename=curva_pbc_uof_202402.zip"

# Name of the file I will save on my computer
file_name = "data/first_file.zip"

# Create the target directory if needed so that open() below cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# GET request to access data of zip files.
# - stream=True defers the body download, so iter_content() below really
#   writes the archive chunk by chunk instead of loading it fully in memory.
# - timeout avoids hanging the kernel forever on an unresponsive server.
# - the `with` block releases the connection once the download is done.
with requests.get(url_csv, stream=True, timeout=30) as response:
    if response.status_code == 200:
        # If the request succeeds, I save the zip file at the specified path
        with open(file_name, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Download complete : {file_name}")
    else:
        print(f"Error while downloading: {response.status_code}")

Download complete : data/first_file.zip


Next, I extract the first file from the zip archive.

In [7]:
# Extract the first file from the zip archive
with zipfile.ZipFile(file_name, 'r') as zip_ref:
    # List all files in the zip archive
    zip_contents = zip_ref.namelist()
    print(f"Files in the zip archive: {zip_contents}")

    # An empty archive would otherwise fail below with a bare IndexError
    if not zip_contents:
        raise ValueError(f"The zip archive {file_name} contains no files")

    # Extract the first file; extract() returns the path it actually wrote,
    # which is safer than rebuilding it by hand from the member name.
    first_file_name = zip_contents[0]
    extracted_file_path = zip_ref.extract(first_file_name, "data")
    print(f"Extracted {first_file_name} to 'data' directory")

# Load the extracted file into a DataFrame if it's a CSV (extension compared
# case-insensitively so ".CSV" is accepted as well).
# NOTE(review): in the last recorded run the archive members end in ".1",
# so this branch was not taken and `df` was never created — decide how these
# files should be parsed (delimiter, encoding, header rows) before relying on `df`.
if extracted_file_path.lower().endswith('.csv'):
    df = pd.read_csv(extracted_file_path)
    print(df.head())
else:
    print(f"The extracted file is not a CSV: {extracted_file_path}")



Files in the zip archive: ['curva_pbc_uof_20240201.1', 'curva_pbc_uof_20240202.1', 'curva_pbc_uof_20240203.1', 'curva_pbc_uof_20240204.1', 'curva_pbc_uof_20240205.1', 'curva_pbc_uof_20240206.1', 'curva_pbc_uof_20240207.1', 'curva_pbc_uof_20240208.1', 'curva_pbc_uof_20240209.1', 'curva_pbc_uof_20240210.1', 'curva_pbc_uof_20240211.1', 'curva_pbc_uof_20240212.1', 'curva_pbc_uof_20240213.1', 'curva_pbc_uof_20240214.1', 'curva_pbc_uof_20240215.1', 'curva_pbc_uof_20240216.1', 'curva_pbc_uof_20240217.1', 'curva_pbc_uof_20240218.1', 'curva_pbc_uof_20240219.1', 'curva_pbc_uof_20240220.1', 'curva_pbc_uof_20240221.1', 'curva_pbc_uof_20240222.1', 'curva_pbc_uof_20240223.1']
Extracted curva_pbc_uof_20240201.1 to 'data' directory
The extracted file is not a CSV: data\curva_pbc_uof_20240201.1
