In [23]:
import pandas as pd
import numpy as np

In [24]:
# Load the previously merged dataset that the rest of the notebook reshapes
bitcoin = pd.read_csv(filepath_or_buffer="../data/merged_datasets.csv")

In [25]:
# Discard columns that are not needed downstream (the gold / S&P 500 data is
# re-derived from its own source files in later cells)
bitcoin = bitcoin.drop(
    columns=[
        "Unnamed: 0",
        "Adj Close",
        "gold_close",
        "gold_label",
        "sp500_close",
        "sp500_label",
        "isPartial",
        "neg",
        "neu",
        "pos",
    ]
)

In [26]:
# Daily change expressed as a fraction of the day's opening price
bitcoin['Daily_Change_Perc'] = bitcoin["Daily_Change"].div(bitcoin["Open"])

In [27]:
# Weekly change: the current row's Close measured against the Open from four
# rows earlier (the first four rows therefore come out as NaN)
open_four_rows_back = bitcoin['Open'].shift(4)
bitcoin['Weekly_Change'] = bitcoin['Close'] - open_four_rows_back
bitcoin['Weekly_Change_Perc'] = bitcoin['Weekly_Change'] / open_four_rows_back

In [28]:
# Re-add gold: load the gold prices and derive the daily-move features
gold = pd.read_csv("../data/gold_modified.csv")
gold['Daily_Change'] = gold["Close/Last"].sub(gold["Open"])
gold['Daily_Change_Perc'] = gold["Daily_Change"].div(gold["Open"])
# 1 when the close finished above the open, otherwise 0
gold["Increased"] = np.where(gold["Daily_Change"].gt(0), 1, 0)
# Re-format Date as 'YYYY-MM-DD' text so it can be used as the merge key
gold["Date"] = pd.to_datetime(gold["Date"]).dt.strftime('%Y-%m-%d')
gold_keep = ["Date", "Close/Last", "Open", "Daily_Change", "Daily_Change_Perc", "Increased"]
gold_needed_columns = gold[gold_keep]

In [29]:
# Merge Gold Columns Back in.
# Only column names that collide with existing bitcoin columns receive the
# "_Gold" suffix; non-colliding gold columns keep their original names.
bitcoin = pd.merge(
    bitcoin,
    gold_needed_columns,
    how="inner",
    on="Date",
    suffixes=("", "_Gold"),
)

In [30]:
# Re-add sp500: load the S&P 500 prices and derive the daily-move features
sp500 = pd.read_csv("../data/sp_500.csv")
sp500['Daily_Change'] = sp500["Close/Last"].sub(sp500["Open"])
sp500['Daily_Change_Perc'] = sp500["Daily_Change"].div(sp500["Open"])
# 1 when the close finished above the open, otherwise 0
sp500["Increased"] = np.where(sp500["Daily_Change"].gt(0), 1, 0)
# Re-format Date as 'YYYY-MM-DD' text so it can be used as the merge key
sp500["Date"] = pd.to_datetime(sp500["Date"]).dt.strftime('%Y-%m-%d')
sp_keep = ["Date", "Close/Last", "Open", "Daily_Change", "Daily_Change_Perc", "Increased"]
sp_needed_columns = sp500[sp_keep]

In [31]:
# Join the S&P 500 features on Date; colliding column names from the
# right-hand frame get the "_SP500" suffix
bitcoin = pd.merge(
    bitcoin,
    sp_needed_columns,
    how="inner",
    on="Date",
    suffixes=("", "_SP500"),
)

In [32]:
# Inspect the last 30 rows of the merged frame
bitcoin.iloc[-30:]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,PROC_3,...,Close/Last,Open_Gold,Daily_Change_Gold,Daily_Change_Perc_Gold,Increased,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500
1548,2020-12-29,27081.810547,27370.720703,25987.298828,27362.4375,45265946774,277.628906,1.0,2137.11442,0.035004,...,1882.9,1878.0,4.9,0.002609,1,3727.04,3750.01,-22.97,-0.006125,0
1549,2020-12-30,27360.089844,28937.740234,27360.089844,28840.953125,51287442704,1478.515625,1.0,2307.615495,0.097771,...,1893.4,1881.8,11.6,0.006164,1,3732.04,3736.19,-4.15,-0.001111,0
1550,2020-12-31,28841.574219,29244.876953,28201.992188,29001.720703,46754964848,160.767578,1.0,2427.72611,0.070774,...,1895.1,1899.0,-3.9,-0.002054,0,3756.07,3733.27,22.8,0.006107,1
1551,2021-01-04,32810.949219,33440.21875,28722.755859,31971.914063,81163475344,-810.109375,0.0,3099.426762,0.088437,...,1946.6,1908.2,38.4,0.020124,1,3700.65,3764.61,-63.96,-0.01699,0
1552,2021-01-05,31977.041016,34437.589844,30221.1875,33992.429688,67547324782,2020.515625,1.0,3288.260018,0.058055,...,1954.4,1946.0,8.4,0.004317,1,3726.86,3698.02,28.84,0.007799,1
1553,2021-01-06,34013.613281,36879.699219,33514.035156,36824.363281,75289433811,2831.933593,1.0,3624.642484,0.12331,...,1908.6,1952.8,-44.2,-0.022634,0,3748.14,3712.2,35.94,0.009682,1
1554,2021-01-07,36833.875,40180.367188,36491.191406,39371.042969,84762141031,2546.679688,1.0,4050.037469,0.231426,...,1913.6,1921.5,-7.9,-0.004111,0,3803.79,3764.71,39.08,0.010381,1
1555,2021-01-08,39381.765625,41946.738281,36838.636719,40797.609375,88107519480,1426.566406,1.0,4450.970033,0.200197,...,1835.4,1915.2,-79.8,-0.041667,0,3824.68,3815.05,9.63,0.002524,1
1556,2021-01-11,38346.53125,38346.53125,30549.599609,35566.65625,123320567399,-2789.785156,0.0,4338.167231,-0.128217,...,1850.8,1849.4,1.4,0.000757,1,3799.61,3803.14,-3.53,-0.000928,0
1557,2021-01-12,35516.359375,36568.527344,32697.976563,33922.960938,74773277909,-1643.695312,0.0,3922.051224,-0.157289,...,1844.2,1845.4,-1.2,-0.00065,0,3801.19,3801.62,-0.43,-0.000113,0


In [33]:
# The gold merge left these two columns unsuffixed; label them as gold columns
gold_renames = {'Close/Last': 'Close/Last_Gold', 'Increased': "Increased_Gold"}
bitcoin = bitcoin.rename(gold_renames, axis="columns")

In [34]:
# set y label: each row's target is the Daily_Change_Ind of the following row,
# so the final row has no label (NaN)
bitcoin["label"] = bitcoin["Daily_Change_Ind"].shift(periods=-1)
bitcoin.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Daily_Change,Daily_Change_Ind,MACD,PROC_3,...,Open_Gold,Daily_Change_Gold,Daily_Change_Perc_Gold,Increased_Gold,Close/Last_SP500,Open_SP500,Daily_Change_SP500,Daily_Change_Perc_SP500,Increased_SP500,label
0,2014-10-13,377.92099,397.226013,368.897003,390.414001,35221400,11.86499,1.0,-15.70184,0.079798,...,1224.8,5.2,0.004246,1,1874.74,1905.65,-30.91,-0.01622,0,1.0
1,2014-10-14,391.691986,411.697998,391.324005,400.869995,38491500,10.455994,1.0,-11.895282,0.106462,...,1237.3,-3.0,-0.002425,0,1877.7,1877.11,0.59,0.000314,1,0.0
2,2014-10-15,400.954987,402.22699,388.765991,394.77301,25267100,-6.096985,0.0,-9.263747,0.042858,...,1233.3,11.5,0.009325,1,1862.49,1874.18,-11.69,-0.006237,0,1.0
3,2014-10-17,382.756012,385.477997,375.389008,383.757996,13600700,1.201996,1.0,-6.948665,-0.042687,...,1239.5,-0.5,-0.000403,0,1886.76,1864.91,21.85,0.011716,1,0.0
4,2014-10-20,389.230988,390.084015,378.252014,382.845001,16419000,-6.700989,0.0,-3.828066,-0.002379,...,1239.1,5.6,0.004519,1,1904.01,1885.62,18.39,0.009753,1,1.0


In [35]:
# Report, column by column, whether any missing or infinite values are present.
# DataFrame.items() replaces iteritems(), which was deprecated in pandas 1.5
# and removed in pandas 2.0; items() yields the same (name, Series) pairs.
for (columnName, columnData) in bitcoin.items():
    print('Column Name : ', columnName)
    print('Any NAs : ', columnData.isnull().any())
    try:
        print('Any INFs:', np.isinf(columnData).any())
    except TypeError:
        # np.isinf is not defined for non-numeric columns (the Date column
        # hits this path); catch only that error instead of everything
        print("Couldn't check")

Column Name :  Date
Any NAs :  False
Couldn't check
Column Name :  Open
Any NAs :  False
Any INFs: False
Column Name :  High
Any NAs :  False
Any INFs: False
Column Name :  Low
Any NAs :  False
Any INFs: False
Column Name :  Close
Any NAs :  False
Any INFs: False
Column Name :  Volume
Any NAs :  False
Any INFs: False
Column Name :  Daily_Change
Any NAs :  False
Any INFs: False
Column Name :  Daily_Change_Ind
Any NAs :  False
Any INFs: False
Column Name :  MACD
Any NAs :  False
Any INFs: False
Column Name :  PROC_3
Any NAs :  False
Any INFs: False
Column Name :  PROC_5
Any NAs :  False
Any INFs: False
Column Name :  PROC_10
Any NAs :  False
Any INFs: False
Column Name :  wpr
Any NAs :  False
Any INFs: False
Column Name :  sto_os
Any NAs :  False
Any INFs: False
Column Name :  goog_trend_score
Any NAs :  False
Any INFs: False
Column Name :  count
Any NAs :  False
Any INFs: False
Column Name :  compound
Any NAs :  False
Any INFs: False
Column Name :  retweets_count
Any NAs :  False
Any IN

In [36]:
# Treat a missing weighted-replies score as 0 so these rows survive dropna()
replies_col = 'compound_weighted_replies'
bitcoin[replies_col] = bitcoin[replies_col].fillna(value=0)

In [37]:
# Number of missing values left in the column
bitcoin['compound_weighted_replies'].isna().sum()

0

In [38]:
# Row count before and after removing every row that contains a missing value
print(bitcoin.shape[0])
bitcoin = bitcoin.dropna(axis=0, how="any")
print(bitcoin.shape[0])

1578
1573


In [39]:
# How many rows carry an infinite S&P 500 daily percentage change
bitcoin['Daily_Change_Perc_SP500'].pipe(np.isinf).sum()

10

In [40]:
# Dump the inf: keep only rows whose S&P 500 percentage change is not infinite.
# NOTE(review): the column is Daily_Change / Open, so these infs presumably
# come from rows where the S&P 500 Open is 0 - confirm against the source data.
sp500_not_inf = ~np.isinf(bitcoin['Daily_Change_Perc_SP500'])
bitcoin = bitcoin[sp500_not_inf]

In [41]:
# Train - Test Split: hold out 20% of the rows, with a fixed seed so the
# partition is reproducible.
# NOTE(review): train_test_split shuffles rows unless shuffle=False is passed,
# and "label" is the next row's indicator - confirm that a random (rather than
# chronological) split is intended for this time-ordered data.
from sklearn.model_selection import train_test_split
random_state = 12345

split_frames = train_test_split(
    bitcoin,
    test_size=0.2,
    random_state=random_state,
)
train_set, test_set = split_frames

In [42]:
# Number of rows in the training partition
train_set.shape[0]

1250

In [43]:
# Number of rows in the held-out partition
test_set.shape[0]

313

In [44]:
# Persist both partitions for the modelling notebooks.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column - confirm downstream readers expect it.
for split_frame, split_path in (
    (train_set, "../models/bitcoin_train.csv"),
    (test_set, "../models/bitcoin_test.csv"),
):
    split_frame.to_csv(split_path)