<a href="https://colab.research.google.com/github/laribar/bitcoinprediction/blob/main/bitcoinprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Substitua os dados pelos corretos do seu repositório
url = 'https://raw.githubusercontent.com/laribar/bitcoinprediction/refs/heads/main/Bitcoin_30_11_2024-31_01_2025_historical_data_coinmarketcap.csv'

In [3]:
# Ler o CSV com o delimitador especificado
df = pd.read_csv(url, delimiter=';')  # Use ',' se o CSV for separado por vírgulas

# Exibir as primeiras cinco linhas do DataFrame
print(df.tail())

                    timeOpen                 timeClose  \
55  2024-12-06T00:00:00.000Z  2024-12-06T23:59:59.999Z   
56  2024-12-05T00:00:00.000Z  2024-12-05T23:59:59.999Z   
57  2024-12-04T00:00:00.000Z  2024-12-04T23:59:59.999Z   
58  2024-12-03T00:00:00.000Z  2024-12-03T23:59:59.999Z   
59  2024-12-02T00:00:00.000Z  2024-12-02T23:59:59.999Z   

                    timeHigh                   timeLow  name          open  \
55  2024-12-06T20:43:00.000Z  2024-12-06T00:07:00.000Z  2781  97074.224832   
56  2024-12-05T03:09:00.000Z  2024-12-05T22:29:00.000Z  2781  98741.539382   
57  2024-12-04T20:52:00.000Z  2024-12-04T16:53:00.000Z  2781  95988.528712   
58  2024-12-03T02:42:00.000Z  2024-12-03T14:36:00.000Z  2781  95854.597434   
59  2024-12-02T01:32:00.000Z  2024-12-02T18:00:00.000Z  2781  97276.010117   

             high           low         close        volume     marketCap  \
55  102039.881796  96514.876082  99920.714730  9.453477e+10  1.977631e+12   
56  103900.472147  91998.781

In [4]:
# Renomear a coluna 'último' para 'fechamento'
df.rename(columns={'timeClose': 'Data'}, inplace=True)

# Visualizar as primeiras linhas após renomear as colunas
print("\nColunas após a renomeação:")
print(df.columns)

# Exibir as primeiras linhas do DataFrame para confirmar a alteração
print(df.head())


Colunas após a renomeação:
Index(['timeOpen', 'Data', 'timeHigh', 'timeLow', 'name', 'open', 'high',
       'low', 'close', 'volume', 'marketCap', 'timestamp'],
      dtype='object')
                   timeOpen                      Data  \
0  2025-01-30T00:00:00.000Z  2025-01-30T23:59:59.999Z   
1  2025-01-29T00:00:00.000Z  2025-01-29T23:59:59.999Z   
2  2025-01-28T00:00:00.000Z  2025-01-28T23:59:59.999Z   
3  2025-01-27T00:00:00.000Z  2025-01-27T23:59:59.999Z   
4  2025-01-26T00:00:00.000Z  2025-01-26T23:59:59.999Z   

                   timeHigh                   timeLow  name           open  \
0  2025-01-30T15:13:00.000Z  2025-01-30T00:32:00.000Z  2781  103709.338799   
1  2025-01-29T20:34:00.000Z  2025-01-29T00:01:00.000Z  2781  101317.527253   
2  2025-01-28T15:32:00.000Z  2025-01-28T22:01:00.000Z  2781  102095.417583   
3  2025-01-27T00:11:00.000Z  2025-01-27T07:31:00.000Z  2781  102680.303591   
4  2025-01-26T04:27:00.000Z  2025-01-26T23:56:00.000Z  2781  104713.213230   

    

In [5]:
# Converter coluna timeOpen para tipo datetime
df['Data'] = pd.to_datetime(df['close']).dt.date
# Definindo a 'data' como índice do DataFrame
df.set_index('Data', inplace=True)
# Exibir as primeiras linhas para verificar se a operação foi bem-sucedida
print(df.head())

                            timeOpen                  timeHigh  \
Data                                                             
1970-01-01  2025-01-30T00:00:00.000Z  2025-01-30T15:13:00.000Z   
1970-01-01  2025-01-29T00:00:00.000Z  2025-01-29T20:34:00.000Z   
1970-01-01  2025-01-28T00:00:00.000Z  2025-01-28T15:32:00.000Z   
1970-01-01  2025-01-27T00:00:00.000Z  2025-01-27T00:11:00.000Z   
1970-01-01  2025-01-26T00:00:00.000Z  2025-01-26T04:27:00.000Z   

                             timeLow  name           open           high  \
Data                                                                       
1970-01-01  2025-01-30T00:32:00.000Z  2781  103709.338799  106418.766695   
1970-01-01  2025-01-29T00:01:00.000Z  2781  101317.527253  104750.807242   
1970-01-01  2025-01-28T22:01:00.000Z  2781  102095.417583  103730.821061   
1970-01-01  2025-01-27T07:31:00.000Z  2781  102680.303591  103214.110460   
1970-01-01  2025-01-26T23:56:00.000Z  2781  104713.213230  105438.644535   

    

In [6]:
# Converter coluna 'timeOpen' para datetime
df['Data'] = pd.to_datetime(df['timeOpen'])

# Opcional: extrair apenas a parte da data, se o seu propósito não exige precisão de tempo
df['Data'] = df['Data'].dt.date

# Definir a nova coluna 'Data' como índice
df.set_index('Data', inplace=True)

# Exibir as 5 primeiras linhas para confirmar o índice correto
print(df.head())

                            timeOpen                  timeHigh  \
Data                                                             
2025-01-30  2025-01-30T00:00:00.000Z  2025-01-30T15:13:00.000Z   
2025-01-29  2025-01-29T00:00:00.000Z  2025-01-29T20:34:00.000Z   
2025-01-28  2025-01-28T00:00:00.000Z  2025-01-28T15:32:00.000Z   
2025-01-27  2025-01-27T00:00:00.000Z  2025-01-27T00:11:00.000Z   
2025-01-26  2025-01-26T00:00:00.000Z  2025-01-26T04:27:00.000Z   

                             timeLow  name           open           high  \
Data                                                                       
2025-01-30  2025-01-30T00:32:00.000Z  2781  103709.338799  106418.766695   
2025-01-29  2025-01-29T00:01:00.000Z  2781  101317.527253  104750.807242   
2025-01-28  2025-01-28T22:01:00.000Z  2781  102095.417583  103730.821061   
2025-01-27  2025-01-27T07:31:00.000Z  2781  102680.303591  103214.110460   
2025-01-26  2025-01-26T23:56:00.000Z  2781  104713.213230  105438.644535   

    

In [7]:
import pandas as pd

# Considerando que 'df' é o seu DataFrame atual.
# Converter colunas de tempo para datetime
df['timeHigh'] = pd.to_datetime(df['timeHigh'])
df['timeLow'] = pd.to_datetime(df['timeLow'])
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [8]:
# Converter e setar 'Data' como o índice do DataFrame
df.index = pd.to_datetime(df.index)

In [9]:
# Examinar informação do DataFrame para assegurar todos estarem num tpo apropriado
print(df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 60 entries, 2025-01-30 to 2024-12-02
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   timeOpen   60 non-null     object             
 1   timeHigh   60 non-null     datetime64[ns, UTC]
 2   timeLow    60 non-null     datetime64[ns, UTC]
 3   name       60 non-null     int64              
 4   open       60 non-null     float64            
 5   high       60 non-null     float64            
 6   low        60 non-null     float64            
 7   close      60 non-null     float64            
 8   volume     60 non-null     float64            
 9   marketCap  60 non-null     float64            
 10  timestamp  60 non-null     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](3), float64(6), int64(1), object(1)
memory usage: 5.6+ KB
None


In [10]:
X = df.drop('close', axis=1)  # todas as colunas menos a de preço de fechamento
y = df['close']  # a coluna alvo, naturalmente

# Mostrar para Certificação
print(X.head())
print(y.head())

                            timeOpen                  timeHigh  \
Data                                                             
2025-01-30  2025-01-30T00:00:00.000Z 2025-01-30 15:13:00+00:00   
2025-01-29  2025-01-29T00:00:00.000Z 2025-01-29 20:34:00+00:00   
2025-01-28  2025-01-28T00:00:00.000Z 2025-01-28 15:32:00+00:00   
2025-01-27  2025-01-27T00:00:00.000Z 2025-01-27 00:11:00+00:00   
2025-01-26  2025-01-26T00:00:00.000Z 2025-01-26 04:27:00+00:00   

                             timeLow  name           open           high  \
Data                                                                       
2025-01-30 2025-01-30 00:32:00+00:00  2781  103709.338799  106418.766695   
2025-01-29 2025-01-29 00:01:00+00:00  2781  101317.527253  104750.807242   
2025-01-28 2025-01-28 22:01:00+00:00  2781  102095.417583  103730.821061   
2025-01-27 2025-01-27 07:31:00+00:00  2781  102680.303591  103214.110460   
2025-01-26 2025-01-26 23:56:00+00:00  2781  104713.213230  105438.644535   

    

In [11]:
# Verificar o número de amostras em X e y
print(f"Número de amostras em X: {X.shape[0]}")
print(f"Número de amostras em y: {y.shape[0]}")

Número de amostras em X: 60
Número de amostras em y: 60


In [12]:
from sklearn.model_selection import train_test_split

# Dividindo o dataset em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Padrão: Pode fazer escalonamento aqui se quiser com StandardScaler
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [13]:
import pandas as pd

# Convertendo timeOpen para datetime
X['timeOpen'] = pd.to_datetime(X['timeOpen'])

In [14]:
# Transformar colunas datetime em timestamps
X['timeOpen'] = X['timeOpen'].astype(int) / 10**9  # Convertendo para Unix timestamp
X['timeHigh'] = X['timeHigh'].astype(int) / 10**9
X['timeLow'] = X['timeLow'].astype(int) / 10**9
X['timestamp'] = X['timestamp'].astype(int) / 10**9

In [15]:
if 'name' in X.columns:
    # Remover a coluna 'name'
    X.drop(columns=['name'], inplace=True)

In [16]:
from sklearn.model_selection import train_test_split

# Separe o conjunto de dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirmar o tamanho dos conjuntos resultantes, certificando coerência
print(f"Número de amostras em X_train: {X_train.shape[0]}")
print(f"Número de amostras em y_train: {y_train.shape[0]}")
print(f"Número de amostras em X_test: {X_test.shape[0]}")
print(f"Número de amostras em y_test: {y_test.shape[0]}")

Número de amostras em X_train: 48
Número de amostras em y_train: 48
Número de amostras em X_test: 12
Número de amostras em y_test: 12


In [17]:
# Escolher a coluna 'close' como variável alvo
y = df['close'].copy()

In [18]:
# Converte a coluna 'close' para numérico, coerce lidará com valores inválidos transformando-os temporariamente em NaN
df['close'] = pd.to_numeric(df['close'], errors='coerce')

# Certifique-se ao final da conversão que há mínimo de erros cada entropia editável pro conclusão
print(df['close'].dtype)
print(df['close'].isna().sum())  # Verifique se alguns valores tornaram-se NaN

float64
0


In [19]:
# Confirmar formato e conteúdo do alvo `y`
print(y.head())
print(f"Tipo de dados de `y`: {y.dtype}")

Data
2025-01-30    104735.302839
2025-01-29    103703.211192
2025-01-28    101332.476221
2025-01-27    102087.691335
2025-01-26    102682.497033
Name: close, dtype: float64
Tipo de dados de `y`: float64


In [20]:
# Selecionar apenas os últimos 20 dias para treinamento
df_recent = df.tail(20)

# Separar as features (X) e o alvo (y)
X_recent = df_recent.drop(columns=['close', 'Fechamento'])  # Exclua colunas irrelevantes, mantenha apenas úteis
y_recent = df_recent['close']

# Re-treinar o modelo com os últimos 20 dias
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
model.fit(X_recent, y_recent)

# Previsões para os últimos 20 dias
y_recent_pred = model.predict(X_recent)

KeyError: "['Fechamento'] not found in axis"