# 1. Descomprimir los archivos descargados

In [1]:
# Importa librerías
import os
import gzip
import pandas as pd

In [2]:
# Open the first file to explore its contents.
# NOTE: despite its name, `folder_path` points at a single .log.gz file in this cell.
folder_path = r'C:\Users\OY\Documents\The Bridge\2304_dsft_thebridge\0-DataSets\NDXm\NDXm_ASK_2018-06-27_15.log.gz'

date = []
price = []
ask = []

# The context manager closes the handle even if reading or decoding fails.
with gzip.open(folder_path, 'rt') as f:
    f_content = f.read().splitlines()

for line in f_content:
    if not line.strip():
        continue  # a blank line carries no record
    # Each record is expected to hold exactly three comma-separated fields
    # (timestamp, price, size). Unpacking fails loudly on any other field count
    # instead of silently misaligning the columns.
    raw_date, raw_price, raw_ask = line.split(',')
    date.append(raw_date)
    price.append(raw_price)
    ask.append(raw_ask)

# Convert the text fields to numbers before building the frame: passing strings
# to `pd.to_datetime(..., unit='ms')` is deprecated, and numeric price/size
# columns are needed for any later arithmetic.
df_file = pd.DataFrame({
    'date': pd.to_datetime(pd.to_numeric(date), unit='ms'),
    'price': pd.to_numeric(price).astype(float),
    'ask': pd.to_numeric(ask).astype(int),
})
df_file

Unnamed: 0,date,price,ask
0,2018-06-27 15:22:39.579,7102.1,40
1,2018-06-27 15:22:39.666,7101.1,40
2,2018-06-27 15:22:40.371,7100.8,40
3,2018-06-27 15:22:41.073,7101.1,40
4,2018-06-27 15:22:41.775,7101.3,40
...,...,...,...
3041,2018-06-27 15:59:58.218,7094.1,20
3042,2018-06-27 15:59:58.569,7093.8,20
3043,2018-06-27 15:59:58.920,7093.3,20
3044,2018-06-27 15:59:59.271,7093.6,20


In [3]:
# Open every file in the folder to detect invalid (non-gzip or truncated) archives.
folder_path = r'C:\Users\OY\Documents\The Bridge\2304_dsft_thebridge\0-DataSets\NDXm'

for file_name in os.listdir(folder_path):
    # Build the per-file path in its own variable. Re-assigning `folder_path`
    # here would make every later iteration join onto the previous file's path.
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
        continue  # sub-directories cannot be gzip archives
    try:
        with gzip.open(file_path, 'rb') as f:
            # Decompress the whole stream in 1 MiB chunks: this validates the
            # archive without holding its full content in memory.
            while f.read(1024 * 1024):
                pass
    except (gzip.BadGzipFile, EOFError) as e:
        # BadGzipFile: the data is not valid gzip; EOFError: the stream is truncated.
        print(f"Archivo inválido: {file_name} - Error: {e}")


Archivo inválido: NDXm_ASK_2020-04-03_09.log79720336527126253.tmp - Error: Not a gzipped file (b'15')
Archivo inválido: NDXm_BID_2020-04-03_09.log79720336526921150.tmp - Error: Not a gzipped file (b'15')


In [4]:
# Build one DataFrame with the ask quotes and another with the bid quotes.
folder_path = r'C:\Users\OY\Documents\The Bridge\2304_dsft_thebridge\0-DataSets\NDXm'
file_list = os.listdir(folder_path)

# Per-side lists of frames; concatenating once at the end avoids re-copying
# the accumulated data on every iteration.
frames_by_side = {'ask': [], 'bid': []}

for file_name in file_list:
    if not file_name.endswith('.gz'):
        continue

    lowered_name = file_name.lower()
    if 'ask' in lowered_name:
        side = 'ask'
    elif 'bid' in lowered_name:
        side = 'bid'
    else:
        continue  # neither an ask nor a bid file

    # Keep `folder_path` untouched: joining into it would corrupt the path
    # used for every subsequent file.
    file_path = os.path.join(folder_path, file_name)
    df_file = pd.read_csv(file_path, compression='gzip', names=['date', 'price', side])

    df_file['date'] = pd.to_datetime(df_file['date'], unit='ms')
    df_file['price'] = df_file['price'].astype(float)
    df_file[side] = df_file[side].astype(int)
    frames_by_side[side].append(df_file)

# `pd.concat` raises on an empty list, so fall back to an empty frame when no
# file of a given side was found.
df_ask = pd.concat(frames_by_side['ask']) if frames_by_side['ask'] else pd.DataFrame()
df_bid = pd.concat(frames_by_side['bid']) if frames_by_side['bid'] else pd.DataFrame()
            

KeyboardInterrupt: 

In [None]:
# Optimized version: collect the per-file frames in lists, concatenate once,
# and convert the timestamps in a single vectorized pass.
import os
import pandas as pd

folder_path = r'C:\Users\OY\Documents\The Bridge\2304_dsft_thebridge\0-DataSets\NDXm'
file_list = os.listdir(folder_path)
df_ask_list = []
df_bid_list = []

for file_name in file_list:
    if not file_name.endswith('.gz'):
        continue

    file_path = os.path.join(folder_path, file_name)
    # Match the side case-insensitively: a lowercase 'bid' test never matches
    # file names that spell the side in uppercase.
    lowered_name = file_name.lower()

    if 'ask' in lowered_name:
        # `read_csv` cannot parse straight into a datetime dtype, so the
        # timestamp is read as a 64-bit integer and converted after the concat.
        df_file = pd.read_csv(file_path, compression='gzip', names=['date', 'price', 'ask'], dtype={'date': 'int64', 'price': float, 'ask': int})
        df_ask_list.append(df_file)

    elif 'bid' in lowered_name:
        df_file = pd.read_csv(file_path, compression='gzip', names=['date', 'price', 'bid'], dtype={'date': 'int64', 'price': float, 'bid': int})
        df_bid_list.append(df_file)

# NOTE: `pd.concat` raises ValueError if no file of a given side was found.
df_ask = pd.concat(df_ask_list)
df_bid = pd.concat(df_bid_list)

# The raw timestamps are interpreted as milliseconds since the Unix epoch.
df_ask['date'] = pd.to_datetime(df_ask['date'], unit='ms')
df_bid['date'] = pd.to_datetime(df_bid['date'], unit='ms')


In [None]:
# Multithreaded version
import os
import pandas as pd
import concurrent.futures

folder_path = r'C:\Users\OY\Documents\The Bridge\2304_dsft_thebridge\0-DataSets\NDXm'
file_list = os.listdir(folder_path)
df_ask_list = []
df_bid_list = []

def process_file(file_path):
    """Load one gzip-compressed quote file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the file to load. The side is inferred from the file name,
        case-insensitively ('ask' or 'bid').

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``date`` (datetime), ``price`` (float) and either
        ``ask`` or ``bid`` (int); ``None`` when the file is not a ``.gz`` file
        or its name contains neither side.
    """
    if not file_path.endswith('.gz'):
        return None

    file_name = os.path.basename(file_path).lower()
    if 'ask' in file_name:
        size_column = 'ask'
    elif 'bid' in file_name:
        size_column = 'bid'
    else:
        return None

    # `read_csv` cannot parse straight into a datetime dtype, so the timestamp
    # is read as a 64-bit integer and converted afterwards.
    df_file = pd.read_csv(
        file_path,
        compression='gzip',
        names=['date', 'price', size_column],
        dtype={'date': 'int64', 'price': float, size_column: int},
    )
    # The raw timestamps are interpreted as milliseconds since the Unix epoch.
    df_file['date'] = pd.to_datetime(df_file['date'], unit='ms')
    return df_file

with concurrent.futures.ThreadPoolExecutor() as executor:
    file_paths = [os.path.join(folder_path, file_name) for file_name in file_list]

    # `executor.map` yields results in submission order, so the row order of
    # the combined frames does not depend on which thread finishes first.
    for df_file in executor.map(process_file, file_paths):
        if df_file is None:
            continue
        # The side is identified by the lowercase size column set in process_file.
        if 'ask' in df_file.columns:
            df_ask_list.append(df_file)
        elif 'bid' in df_file.columns:
            df_bid_list.append(df_file)

# NOTE: `pd.concat` raises ValueError if no file of a given side was found.
df_ask = pd.concat(df_ask_list)
df_bid = pd.concat(df_bid_list)


In [None]:
# Outer-join the ask and bid quotes on their timestamp so that rows present on
# only one side are kept, then renumber the rows from zero.
df = df_ask.merge(df_bid, on='date', how='outer').reset_index(drop=True)
df