In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [None]:
# Change into the dataset directory. The path is relative, so it is resolved
# against whatever the working directory is when this cell runs.
os.chdir('../datasets')
# Keep the resulting absolute path so later cells can switch back to it
# after moving into a subfolder.
ROOT_DIR = os.getcwd()

In [None]:
# Date window handed to _mask() for every dataset loaded in this notebook
start_date = "2017-01-01"
end_date = "2021-12-31"

def _mask(dataset, start_date, end_date):
    """Return the rows of ``dataset`` whose index lies in [start_date, end_date].

    Both bounds are inclusive. The comparison is made directly against
    ``dataset.index``, so the bounds must be comparable with the index values.
    Rows keep their original order.
    """
    idx = dataset.index
    on_or_after_start = idx >= start_date
    on_or_before_end = idx <= end_date
    return dataset.loc[on_or_after_start & on_or_before_end]

In [None]:
# BITCOIN PRICE INDEX
# No trailing '\\' on the folder name: a backslash is a path separator only on
# Windows, so 'BPI\\' would name a different (non-existent) folder elsewhere.
os.chdir(os.path.join(ROOT_DIR, 'BPI'))

# Price history: shorten the price column names and drop the 'Currency' column.
BPI = (
    pd.read_csv("./bpi.csv", index_col="Date", parse_dates=True)
    .rename(columns={
        'Closing Price (USD)': 'close',
        '24h Open (USD)': 'open',
        '24h High (USD)': 'high',
        '24h Low (USD)': 'low',
    })
    .drop(columns=['Currency'])
)

# The remaining files in this folder are all indexed by their "Timestamp" column.
HASH = pd.read_csv("./hash-rate.csv", index_col="Timestamp", parse_dates=True)
MINE_DIFF = pd.read_csv("./difficulty.csv", index_col="Timestamp", parse_dates=True)
COST_PER_TRANS = pd.read_csv("./cost-per-transaction.csv", index_col="Timestamp", parse_dates=True)
OUPUT = pd.read_csv("./output-volume.csv", index_col="Timestamp", parse_dates=True)
EXTV = pd.read_csv("./estimated-transaction-volume-usd.csv", index_col="Timestamp", parse_dates=True)
TRADE_VOL = pd.read_csv("./trade-volume.csv", index_col="Timestamp", parse_dates=True)
NTRANS = pd.read_csv("./n-transactions.csv", index_col="Timestamp", parse_dates=True)

# Restrict every frame to the start_date..end_date window.
BPI = _mask(BPI, start_date, end_date)
HASH = _mask(HASH, start_date, end_date)
EXTV = _mask(EXTV, start_date, end_date)
NTRANS = _mask(NTRANS, start_date, end_date)
MINE_DIFF = _mask(MINE_DIFF, start_date, end_date)
COST_PER_TRANS = _mask(COST_PER_TRANS, start_date, end_date)
OUPUT = _mask(OUPUT, start_date, end_date)
TRADE_VOL = _mask(TRADE_VOL, start_date, end_date)


In [None]:
# MACRO
# Return to the dataset root, where the macro CSV files are read from.
os.chdir(ROOT_DIR)

# Gold price: drop the unused columns and rename 'Price' to 'Gold price'.
GP = (
    pd.read_csv("./gold-price.csv", index_col="Date", parse_dates=True)
    .dropna(axis=1, how="all")  # discard columns that hold no values at all
    .drop(columns=['Open', 'High', 'Low', 'Change %', 'Vol.'])
    .rename(columns={'Price': 'Gold price'})
)
# Convert the price strings to numbers: strip the "," separators in one
# vectorized pass and assign the whole column at once. Element-wise chained
# assignment (GP[col][i] = ...) is not guaranteed by pandas to write back to GP.
GP["Gold price"] = pd.to_numeric(GP["Gold price"].str.replace(",", "", regex=False))


# USD/CNY exchange rate: drop the unused columns and rename 'Price'.
USD_CNY = (
    pd.read_csv("./usd-cny.csv", index_col="Date", parse_dates=True)
    .drop(columns=['Open', 'High', 'Low', 'Change %'])
    .rename(columns={'Price': 'USD-CNY Price'})
)

# Restrict both frames to the start_date..end_date window.
GP = _mask(GP, start_date, end_date)
USD_CNY = _mask(USD_CNY, start_date, end_date)

In [None]:
# MEDIA AND INTERESTS
# Both files are read relative to the current working directory.
SVI = pd.read_csv("./svi.csv", parse_dates=True, index_col="Week")
# The SVI column rename is currently disabled, so SVI keeps the header from the CSV:
# SVI.rename(columns={'Category: All categories':'SVI'}, inplace=True)

# Page-view counts; the 'Bitcoin[en]' column is exposed as 'Wikiviews'.
WIKI = (
    pd.read_csv("./wikishark.csv", parse_dates=True, index_col="DateTime")
    .rename(columns={'Bitcoin[en]': 'Wikiviews'})
)

# Restrict both frames to the start_date..end_date window.
SVI = _mask(SVI, start_date, end_date)
WIKI = _mask(WIKI, start_date, end_date)

In [None]:
# Align all sources side by side on their date index (outer join on the index).
df = pd.concat(
    [BPI, EXTV, NTRANS, HASH, MINE_DIFF, COST_PER_TRANS,
     GP, OUPUT, TRADE_VOL, USD_CNY, SVI, WIKI],
    axis=1,
)

# Optional daily -> weekly resampling, currently disabled:
# df_resampled = df.resample('7D').mean().interpolate()

# Fill gaps with each column's mean into a NEW frame. Aliasing `df` and filling
# in place would also overwrite `df`, leaving no unfilled frame to compare with.
# numeric_only=True keeps the mean from raising on non-numeric columns; such
# columns are simply left unfilled.
df_resampled = df.fillna(df.mean(numeric_only=True))
df_resampled.head()

In [None]:
# Number of rows holding at least one NaN, for df and for df_resampled
print("Before sampling NaN values: ", int(df.isna().any(axis=1).sum()))
print("After sampling NaN values: ", int(df_resampled.isna().any(axis=1).sum()))

In [None]:
# Show the rows of df_resampled that still contain at least one NaN
df_resampled.loc[df_resampled.isna().any(axis=1)]

In [None]:
# info() prints a per-column summary of the merged frame
df_resampled.info()
# Last expression in the cell, so the descriptive-statistics table is what gets displayed
df_resampled.describe()

In [None]:
# List the column labels of the merged frame
df_resampled.columns

In [None]:
# SAVE DATASET
# Write next to the datasets folder (its parent directory). The path is built
# from ROOT_DIR rather than the current working directory, because other cells
# change the working directory with os.chdir and a relative "../" would follow it.
df_resampled.to_csv(os.path.join(os.path.dirname(ROOT_DIR), "complete-merged-df.csv"))