In [79]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score


# Pre-processing

In [80]:
# Bitcoin feature table: the first CSV column becomes the row index, and the
# literal string "null" as well as empty fields are read as missing values
# (in addition to pandas' default NA markers).
df_bitcoin = pd.read_csv(
    "../data/bitcoin_features.csv",
    index_col=0,
    na_values=["null", ""],
    keep_default_na=True,
)
# Federal funds rate series, read with pandas' default options.
df_rates = pd.read_csv("../data/FEDFUNDS.csv")

In [81]:
# Show the column names of the Bitcoin feature table.
df_bitcoin.columns.values

array(['date', 'active_addresses', 'tweets', 'top_100_percent',
       'difficulty', 'transactions', 'av_transaction_size', 'market_cap',
       'confirmation_time', 'median_transaction_value', 'send_usd',
       'google_trends', 'block_size', 'mining_profitability', 'hashrate',
       'sent_addresses', 'median_transaction_size', 'fee_reward',
       'av_transaction_value', 'full_name', 'coin'], dtype=object)

In [82]:
# Show the column names of the Fed funds rate table.
df_rates.columns.values

array(['observation_date', 'FEDFUNDS'], dtype=object)

In [83]:
# Show the dtype pandas inferred for each Bitcoin column.
df_bitcoin.dtypes

date                         object
active_addresses            float64
tweets                      float64
top_100_percent             float64
difficulty                  float64
transactions                float64
av_transaction_size         float64
market_cap                  float64
confirmation_time           float64
median_transaction_value    float64
send_usd                    float64
google_trends               float64
block_size                  float64
mining_profitability        float64
hashrate                    float64
sent_addresses              float64
median_transaction_size     float64
fee_reward                  float64
av_transaction_value        float64
full_name                    object
coin                         object
dtype: object

In [84]:
# Show the dtype pandas inferred for each Fed funds column.
df_rates.dtypes

observation_date     object
FEDFUNDS            float64
dtype: object

In [85]:
# (rows, columns) of the Bitcoin feature table.
df_bitcoin.shape

(4378, 21)

In [86]:
# (rows, columns) of the Fed funds rate table.
df_rates.shape

(121, 2)

In [87]:
# Count missing values per column in the Bitcoin feature table.
df_bitcoin.isna().sum()

date                           0
active_addresses              27
tweets                      1978
top_100_percent               12
difficulty                     6
transactions                 261
av_transaction_size         4378
market_cap                   561
confirmation_time              8
median_transaction_value     561
send_usd                     561
google_trends                570
block_size                     6
mining_profitability         561
hashrate                       7
sent_addresses                 8
median_transaction_size     1205
fee_reward                   641
av_transaction_value         561
full_name                      0
coin                           0
dtype: int64

In [88]:
# Column names of the Bitcoin feature table (the frame has only been inspected,
# not modified, since it was loaded).
df_bitcoin.columns.values

array(['date', 'active_addresses', 'tweets', 'top_100_percent',
       'difficulty', 'transactions', 'av_transaction_size', 'market_cap',
       'confirmation_time', 'median_transaction_value', 'send_usd',
       'google_trends', 'block_size', 'mining_profitability', 'hashrate',
       'sent_addresses', 'median_transaction_size', 'fee_reward',
       'av_transaction_value', 'full_name', 'coin'], dtype=object)

In [89]:
# Count non-missing values per column in the Bitcoin feature table.
df_bitcoin.count()

date                        4378
active_addresses            4351
tweets                      2400
top_100_percent             4366
difficulty                  4372
transactions                4117
av_transaction_size            0
market_cap                  3817
confirmation_time           4370
median_transaction_value    3817
send_usd                    3817
google_trends               3808
block_size                  4372
mining_profitability        3817
hashrate                    4371
sent_addresses              4370
median_transaction_size     3173
fee_reward                  3737
av_transaction_value        3817
full_name                   4378
coin                        4378
dtype: int64

In [90]:
# (rows, columns) of the Bitcoin feature table (the frame has only been
# inspected, not modified, since it was loaded).
df_bitcoin.shape

(4378, 21)

In [91]:
# Column used as the prediction target for the rest of the notebook.
target_col = 'send_usd'
# Force the target to a numeric dtype; any value that cannot be parsed
# becomes NaN instead of raising.
df_bitcoin[target_col] = df_bitcoin[target_col].pipe(pd.to_numeric, errors='coerce')

In [92]:
# --- Bitcoin features: parse dates and order rows chronologically ---
df_bitcoin['date'] = pd.to_datetime(df_bitcoin['date'])
df_bitcoin = df_bitcoin.sort_values('date').reset_index(drop=True)
# Missing tweet / Google Trends values are replaced with 0.
# NOTE(review): this treats "no measurement" as "zero activity" - confirm
# that is the intended meaning for the model.
df_bitcoin['tweets'] = df_bitcoin['tweets'].fillna(0)
df_bitcoin['google_trends'] = df_bitcoin['google_trends'].fillna(0)
# Continuous daily calendar spanning the first to the last Bitcoin date.
daily_range = pd.date_range(df_bitcoin['date'].min(), df_bitcoin['date'].max(), freq='D')

# --- Fed funds rate: same normalisation, keyed on a column named 'date' ---
# Rename first and work on 'date' from here on. rename() silently ignores keys
# that are absent, so re-running this cell after the column has already been
# renamed no longer raises KeyError on 'observation_date'.
df_rates = df_rates.rename(columns={'observation_date': 'date'})
df_rates['date'] = pd.to_datetime(df_rates['date'])
df_rates = df_rates.sort_values('date').reset_index(drop=True)
# Unparseable rate values become NaN rather than raising.
df_rates['FEDFUNDS'] = pd.to_numeric(df_rates['FEDFUNDS'], errors='coerce')

In [93]:
# Put the Bitcoin table on the continuous daily calendar, indexed by date.
# Calendar days without a source row show up as all-NaN rows.
df_btc_daily = df_bitcoin.set_index('date').sort_index().reindex(daily_range).rename_axis('date')

# Forward-fill every numeric column except the target: features carry their
# last known value across gaps, while the target is left untouched so that
# days without a target value remain NaN.
numeric_cols = list(df_btc_daily.select_dtypes(include=np.number).columns)
feature_cols_to_ffill = [name for name in numeric_cols if name != target_col]
df_btc_daily[feature_cols_to_ffill] = df_btc_daily[feature_cols_to_ffill].ffill()

In [94]:
# Expand the Fed funds series onto the daily calendar.
#
# reindex(..., method='ffill') looks up, for each calendar day, the most recent
# source observation at or before that day. A plain reindex followed by ffill()
# would first discard every observation that is not itself in daily_range, so
# days at the start of the range that precede the first in-range observation
# would stay NaN even though an earlier rate is available in df_rates.
# Requires a sorted, duplicate-free date index (hence sort_index()).
df_rates_daily = (
    df_rates.set_index('date')
    .sort_index()
    .reindex(daily_range, method='ffill')
    .rename_axis('date')
)
# reindex's fill method does not touch NaNs that were already present in the
# source rows (e.g. values coerced to NaN), so forward-fill those as well.
df_rates_daily['FEDFUNDS'] = df_rates_daily['FEDFUNDS'].ffill()

In [95]:
# Attach the daily Fed funds rate as an extra column, aligned on the shared
# date index; every Bitcoin row is kept (left join).
df_merged = df_btc_daily.join(df_rates_daily[['FEDFUNDS']], how='left')

In [96]:
# Make sure the target is numeric (unparseable values become NaN) ...
df_merged[target_col] = df_merged[target_col].pipe(pd.to_numeric, errors='coerce')
# ... then keep only rows whose target is present and strictly positive.
df_merged = df_merged.loc[
    lambda frame: frame[target_col].notna() & frame[target_col].gt(0)
]

In [97]:
# Columns considered as model inputs.
candidate_features = [
    'active_addresses', 'transactions', 'av_transaction_size', 'median_transaction_value',
    'av_transaction_value', 'block_size', 'hashrate', 'difficulty', 'sent_addresses',
    'fee_reward', 'tweets', 'google_trends', 'mining_profitability', 'confirmation_time', 'FEDFUNDS'
]
# Keep only candidates that exist in the merged frame AND hold at least one
# observed value. A column that is entirely NaN (the null summary above shows
# 'av_transaction_size' missing in all 4378 rows) carries no information and
# would otherwise feed NaNs into every downstream row.
present_feats = [
    c for c in candidate_features
    if c in df_merged.columns and df_merged[c].notna().any()
]