In [2]:
import datetime as dt
import wget
import os
import pandas as pd
from zipfile import ZipFile

In [3]:
def extrai_arquivo(arq):
    """Extract every member of the zip archive ``arq`` into ``./Dados/temp/``.

    Extraction is best-effort: any failure is reported on stdout (together
    with its cause) and swallowed, so the function always returns ``None``.

    Parameters
    ----------
    arq : str
        Path of the ``.zip`` archive to extract.
    """
    try:
        # The context manager closes the archive handle even when extraction fails.
        with ZipFile(arq, 'r') as pacote:
            pacote.extractall('./Dados/temp/')
    except Exception as erro:
        # `Exception` instead of a bare `except` so Ctrl+C / SystemExit still propagate.
        print(f"erro ao extrair {arq}: {erro}")

In [4]:
def baixa_e_concatena(ticker, timeframe, ano_inicial):
    """Download Binance monthly klines and merge them into a single CSV.

    Monthly archives for ``ticker``/``timeframe`` are downloaded into
    ``./Dados/temp/`` (archives already present there are not downloaded
    again), concatenated in chronological order and written to
    ``./Dados/Processados/{ticker}-{timeframe}.csv`` with "Open time" as the
    index and without the "Ignore" column. The processed archives are then
    deleted. If the output CSV already exists nothing is done.

    Each CSV is read straight from its archive, so no extracted temporary
    CSV is left behind in ``./Dados/temp/``.

    Parameters
    ----------
    ticker : str
        Trading pair, e.g. "BTCUSDT". Available tickers:
        https://data.binance.vision/?prefix=data/spot/monthly/klines/
    timeframe : str
        Candle size. Available timeframes:
        12h 15m 1d 1h 1m 1mo 1s 1w 2h 30m 3d 3m 4h 5m 6h 8h
    ano_inicial : int
        First year to download; every year up to the current one is covered.

    Returns
    -------
    None
    """
    dir_temp = "./Dados/temp/"
    dir_saida = "./Dados/Processados/"
    destino = f"{dir_saida}{ticker}-{timeframe}.csv"

    if os.path.exists(destino):
        print(f"{ticker}-{timeframe} já processado")
        return

    # A fresh checkout may not have the data folders yet.
    os.makedirs(dir_temp, exist_ok=True)
    os.makedirs(dir_saida, exist_ok=True)

    hoje = dt.date.today()
    url = "https://data.binance.vision/data/spot/monthly/klines/"
    for ano in range(ano_inicial, hoje.year + 1):
        # Months after the current one cannot exist yet, so do not request them.
        ultimo_mes = hoje.month if ano == hoje.year else 12
        for mes in range(1, ultimo_mes + 1):
            mes = str(mes).zfill(2)
            nome_zip = f"{ticker}-{timeframe}-{ano}-{mes}.zip"
            if os.path.exists(f"{dir_temp}{nome_zip}"):
                print(f"{ano}/{mes} já baixado")
                continue
            endereco = f"{url}{ticker}/{timeframe}/{nome_zip}"
            try:
                wget.download(endereco, out=dir_temp)
            except Exception as erro:
                # Best-effort: a missing month is reported and the loop goes on.
                print(f"\nFalha ao baixar {endereco} ({erro})")

    # Archives of the selected ticker/timeframe only. The trailing "-" keeps
    # "1m" from also matching "1mo" files; sorting the zero-padded names
    # yields chronological order (os.listdir order is arbitrary).
    prefixo = f"{ticker}-{timeframe}-"
    lista_arquivos = sorted(
        x for x in os.listdir(dir_temp)
        if x.startswith(prefixo) and x.endswith(".zip")
    )
    if not lista_arquivos:
        # Writing an empty CSV would mark the pair as processed and block future runs.
        print(f"nenhum arquivo de {ticker}-{timeframe} encontrado em {dir_temp}")
        return

    nomes = ["Open time", "Open", "High", "Low", "Close", "Volume", "Close time",
             "Quote asset volume", "Number of trades", "Taker buy base asset volume",
             "Taker buy quote asset volume", "Ignore"]

    # Read every monthly CSV and concatenate once at the end, instead of
    # growing a DataFrame inside the loop.
    partes = []
    for arq in lista_arquivos:
        caminho_zip = f"{dir_temp}{arq}"
        try:
            with ZipFile(caminho_zip, 'r') as pacote, pacote.open(f"{arq[:-4]}.csv") as conteudo:
                partes.append(pd.read_csv(conteudo, sep=',', decimal='.', encoding='latin1',
                                          names=nomes, header=None))
        except Exception:
            # Name the offending archive, then abort: skipping it would leave
            # a silent gap in the final dataset.
            print(f"erro ao extrair {caminho_zip}")
            raise

    df = pd.concat(partes, ignore_index=True)
    df = df.drop(columns="Ignore").set_index("Open time")
    df.to_csv(destino)

    print(destino)

    # Remove the temporary archives that were just processed.
    for arq in lista_arquivos:
        os.remove(f"{dir_temp}{arq}")

In [5]:
# The 1s timeframe takes very long: its final file weighs 16GB while the 1m one
# weighs 300MB. Still to be decided whether it is really worth it.
ticker = "BTCUSDT"
timeframes = "1s 1m 15m 30m 1h 2h 4h 8h 1d".split()
for timeframe in timeframes:
    baixa_e_concatena(ticker, timeframe, 2017)

BTCUSDT-1s já processado
BTCUSDT-1m já processado
BTCUSDT-15m já processado
BTCUSDT-30m já processado
BTCUSDT-1h já processado
BTCUSDT-2h já processado
BTCUSDT-4h já processado
BTCUSDT-8h já processado
BTCUSDT-1d já processado


In [23]:
# Load the processed 1h candles indexed by "Close time" and convert that
# millisecond-epoch index into datetimes.
df = pd.read_csv("./Dados/Processados/BTCUSDT-1h.csv", index_col="Close time")
df = df.set_axis(pd.to_datetime(df.index, unit="ms"), axis="index")
df

Unnamed: 0_level_0,Open time,Open,High,Low,Close,Volume,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
Close time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-08-17 04:59:59.999,1502942400000,4261.48,4313.62,4261.32,4308.83,47.181009,2.023661e+05,171,35.160503,1.509525e+05
2017-08-17 05:59:59.999,1502946000000,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102,21.448071,9.260828e+04
2017-08-17 06:59:59.999,1502949600000,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36,4.802861,2.079532e+04
2017-08-17 07:59:59.999,1502953200000,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25,2.602292,1.129135e+04
2017-08-17 08:59:59.999,1502956800000,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28,0.814655,3.552747e+03
...,...,...,...,...,...,...,...,...,...,...
2022-11-30 19:59:59.999,1669834800000,16920.45,17147.16,16917.30,17062.85,23959.641700,4.085239e+08,517212,11834.484550,2.017760e+08
2022-11-30 20:59:59.999,1669838400000,17061.14,17103.52,17030.21,17097.19,10883.041700,1.856127e+08,232781,5482.091140,9.350484e+07
2022-11-30 21:59:59.999,1669842000000,17098.37,17111.70,17028.37,17106.65,10824.192860,1.847139e+08,221837,5427.768040,9.262509e+07
2022-11-30 22:59:59.999,1669845600000,17106.64,17249.00,17069.04,17148.29,10013.790860,1.714478e+08,197182,5179.914440,8.870750e+07


In [37]:
# Index label (close time) of the last row of `pedaco`.
# NOTE(review): `pedaco` is not defined in any cell visible here — confirm which cell creates it.
pedaco.index[-1]

Timestamp('2022-11-30 23:59:59.999000')

In [46]:
# Label-based slice of df2 from its 20th-from-last row up to the last close time of `pedaco`.
# NOTE(review): `pedaco` (and this state of `df2`) come from cells not visible here — confirm.
pedaco2 = df2.loc[df2.index[-20]:pedaco.index[-1]]
print(len(pedaco2), "linhas")

20 linhas


Unnamed: 0_level_0,Open time,Open,High,Low,Close,Volume,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
Close time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-11-24 15:59:59.999,1669276800000,16622.38,16642.18,16458.05,16558.11,74073.08771,1226479000.0,1725075,36729.75362,608177700.0
2022-11-24 23:59:59.999,1669305600000,16559.01,16676.0,16517.99,16598.95,53674.4111,890383800.0,1405735,26740.89511,443608500.0
2022-11-25 07:59:59.999,1669334400000,16599.55,16622.64,16342.81,16461.66,63541.26696,1046691000.0,1578783,30964.95897,510068300.0
2022-11-25 15:59:59.999,1669363200000,16462.22,16591.43,16407.72,16513.0,70582.46621,1164554000.0,1680351,34974.84274,577087000.0
2022-11-25 23:59:59.999,1669392000000,16513.0,16666.0,16458.46,16522.14,47965.76216,792307500.0,1137331,24247.75426,400563500.0
2022-11-26 07:59:59.999,1669420800000,16521.35,16701.99,16509.6,16583.75,70373.22902,1168706000.0,1457716,34534.78004,573518700.0
2022-11-26 15:59:59.999,1669449600000,16584.13,16680.57,16483.25,16512.86,64366.98161,1067317000.0,1346738,31980.67252,530326200.0
2022-11-26 23:59:59.999,1669478400000,16512.35,16549.66,16385.0,16458.57,47064.60603,775967400.0,1012676,23130.77989,381375100.0
2022-11-27 07:59:59.999,1669507200000,16457.61,16600.0,16446.59,16562.98,61300.29136,1013348000.0,1165076,30290.9645,500725500.0
2022-11-27 15:59:59.999,1669536000000,16562.98,16593.91,16513.0,16574.28,51981.2657,860235500.0,1163582,25763.11177,426367800.0


# Problemas:
- Se for usar GAF image vai ter uma imagem 20x20 por coluna (supondo que estamos olhando 20 períodos no passado por vez)
- Não está claro se 1D CNN é uma boa alternativa para timeseries.

# Soluções:
#### 1
- Usar GAF de qualquer maneira para tentar replicar os resultados do trabalho do BARRA, usando apenas os preços de fechamento de 4 timeframes e ver se os resultados melhoram com CSVM
- Depois, se eu quiser, adiciono mais dados, mas a imagem vai ficar bem grande (5 colunas de t-20 resultam em uma imagem de 100x100). GAF usando mais dados seria o melhor dos mundos.

#### 2
- Usar 1D CNN com todos os dados disponíveis
- Método completamente diferente do método do BARRA. Não usa GAF images. Provavelmente mais fácil.

### A partir daqui são só testes

In [23]:
# Join every timeframe into a single df, one suffixed column group per timeframe.

timeframes = ["1h", "2h", "4h", "8h", "1d"]

# The first timeframe is the base (left side) of every join; tag its columns with its own suffix.
df = pd.read_csv(f"./Dados/Processados/BTCUSDT-{timeframes[0]}.csv", index_col = "Close time")
#df.index = pd.to_datetime(df.index, unit="ms")
df = df.add_suffix(f"_{timeframes[0]}")

# Left-join the remaining timeframes on the "Close time" index.
# Each frame is suffixed explicitly before joining: merge(..., suffixes=...) only
# renames columns whose names collide, which left the first joined timeframe (2h)
# with bare column names. Iterating over a slice (instead of popping from the
# list) keeps `timeframes` intact, so the cell can be re-run safely.
for timeframe in timeframes[1:]:
    print(timeframe)
    df2 = pd.read_csv(f"./Dados/Processados/BTCUSDT-{timeframe}.csv", index_col = "Close time")
    df = df.join(df2.add_suffix(f"_{timeframe}"), how="left")

#df.to_excel("teste.xlsx")

# Convert the millisecond-epoch index into datetimes.
df.index = pd.to_datetime(df.index, unit="ms")

# Reverse the row order: the windows are built walking from the end to the start.
df = df[::-1]
df

2h
4h
8h
1d


Unnamed: 0_level_0,Open time_1h,Open_1h,High_1h,Low_1h,Close_1h,Volume_1h,Quote asset volume_1h,Number of trades_1h,Taker buy base asset volume_1h,Taker buy quote asset volume_1h,...,Open time_1d,Open_1d,High_1d,Low_1d,Close_1d,Volume_1d,Quote asset volume_1d,Number of trades_1d,Taker buy base asset volume_1d,Taker buy quote asset volume_1d
Close time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-11-30 23:59:59.999,1669849200000,17148.29,17227.23,17109.98,17163.64,15580.621690,2.674903e+08,314108,7684.859570,1.319436e+08,...,1.669766e+12,16442.91,17249.0,16428.3,17163.64,303019.80719,5.123595e+09,6519330.0,151764.57285,2.566042e+09
2022-11-30 22:59:59.999,1669845600000,17106.64,17249.00,17069.04,17148.29,10013.790860,1.714478e+08,197182,5179.914440,8.870750e+07,...,,,,,,,,,,
2022-11-30 21:59:59.999,1669842000000,17098.37,17111.70,17028.37,17106.65,10824.192860,1.847139e+08,221837,5427.768040,9.262509e+07,...,,,,,,,,,,
2022-11-30 20:59:59.999,1669838400000,17061.14,17103.52,17030.21,17097.19,10883.041700,1.856127e+08,232781,5482.091140,9.350484e+07,...,,,,,,,,,,
2022-11-30 19:59:59.999,1669834800000,16920.45,17147.16,16917.30,17062.85,23959.641700,4.085239e+08,517212,11834.484550,2.017760e+08,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-17 08:59:59.999,1502956800000,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28,0.814655,3.552747e+03,...,,,,,,,,,,
2017-08-17 07:59:59.999,1502953200000,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25,2.602292,1.129135e+04,...,,,,,,,,,,
2017-08-17 06:59:59.999,1502949600000,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36,4.802861,2.079532e+04,...,,,,,,,,,,
2017-08-17 05:59:59.999,1502946000000,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102,21.448071,9.260828e+04,...,,,,,,,,,,


In [32]:
# Approach that does not merge everything. IT WORKS. TODO: implement it.

# The close times that need the 20 previous observations are simply df's own index.
lookback = 20

df2 = pd.read_csv("./Dados/Processados/BTCUSDT-8h.csv", index_col="Close time")
df2.index = pd.to_datetime(df2.index, unit="ms")
# Keep only the closing price, in reversed row order.
df2 = df2[["Close"]].iloc[::-1]

linha = 0
# TODO: loop over every row of df instead of a single fixed one.

# Close time of the df row under analysis.
ini = df.index[linha]
# Position of that close time inside the other timeframe's frame.
x = df2.index.get_loc(ini)
print(len(df2.iloc[x:x + lookback]))
df2.iloc[x:x + lookback]
# TODO: for each row, take the first 20 non-NaN values of the chosen column starting at that row.

20


Unnamed: 0_level_0,Close
Close time,Unnamed: 1_level_1
2022-11-30 23:59:59.999,17163.64
2022-11-30 15:59:59.999,16865.64
2022-11-30 07:59:59.999,16884.18
2022-11-29 23:59:59.999,16442.53
2022-11-29 15:59:59.999,16393.48
2022-11-29 07:59:59.999,16463.31
2022-11-28 23:59:59.999,16212.91
2022-11-28 15:59:59.999,16146.26
2022-11-28 07:59:59.999,16222.06
2022-11-27 23:59:59.999,16428.78


In [24]:
# Disabled experiment: load only the first timeframe and convert "Close time" as a column.
#df = pd.read_csv(f"./Dados/Processados/BTCUSDT-{timeframes[0]}.csv")
#df["Close time"] = pd.to_datetime(df["Close time"], unit="ms")
#df.loc[:, "Close time"] = pd.to_datetime(df["Close time"], unit="ms")
#df = df[::-1]
# With the lines above commented out, this cell only displays the current `df`.
df

Unnamed: 0_level_0,Open time_1h,Open_1h,High_1h,Low_1h,Close_1h,Volume_1h,Quote asset volume_1h,Number of trades_1h,Taker buy base asset volume_1h,Taker buy quote asset volume_1h,...,Open time_1d,Open_1d,High_1d,Low_1d,Close_1d,Volume_1d,Quote asset volume_1d,Number of trades_1d,Taker buy base asset volume_1d,Taker buy quote asset volume_1d
Close time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-11-30 23:59:59.999,1669849200000,17148.29,17227.23,17109.98,17163.64,15580.621690,2.674903e+08,314108,7684.859570,1.319436e+08,...,1.669766e+12,16442.91,17249.0,16428.3,17163.64,303019.80719,5.123595e+09,6519330.0,151764.57285,2.566042e+09
2022-11-30 22:59:59.999,1669845600000,17106.64,17249.00,17069.04,17148.29,10013.790860,1.714478e+08,197182,5179.914440,8.870750e+07,...,,,,,,,,,,
2022-11-30 21:59:59.999,1669842000000,17098.37,17111.70,17028.37,17106.65,10824.192860,1.847139e+08,221837,5427.768040,9.262509e+07,...,,,,,,,,,,
2022-11-30 20:59:59.999,1669838400000,17061.14,17103.52,17030.21,17097.19,10883.041700,1.856127e+08,232781,5482.091140,9.350484e+07,...,,,,,,,,,,
2022-11-30 19:59:59.999,1669834800000,16920.45,17147.16,16917.30,17062.85,23959.641700,4.085239e+08,517212,11834.484550,2.017760e+08,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-17 08:59:59.999,1502956800000,4333.32,4377.85,4333.32,4360.69,0.972807,4.239504e+03,28,0.814655,3.552747e+03,...,,,,,,,,,,
2017-08-17 07:59:59.999,1502953200000,4316.62,4349.99,4287.41,4349.99,4.443249,1.924106e+04,25,2.602292,1.129135e+04,...,,,,,,,,,,
2017-08-17 06:59:59.999,1502949600000,4330.29,4345.45,4309.37,4324.35,7.229691,3.128231e+04,36,4.802861,2.079532e+04,...,,,,,,,,,,
2017-08-17 05:59:59.999,1502946000000,4308.83,4328.69,4291.37,4315.32,23.234916,1.003048e+05,102,21.448071,9.260828e+04,...,,,,,,,,,,
