In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import joblib

# Helper Functions

In [2]:
def standardize_X_train_test(X_train_in, X_test_in):
    # Avoid modifying view of dataframe
    X_train_std = X_train_in.copy()
    X_test_std = X_test_in.copy()
    # Standardize
    X_train_std[X_train_std.columns] = StandardScaler().fit_transform(X_train_std)
    X = pd.concat([X_train_in,X_test_in], axis = 0).copy()
    X[X.columns] = StandardScaler().fit_transform(X)
    return {"train":X_train_in, "test":X.iloc[-1]}

In [3]:
def find_optimal_k(error_dict):
    # Find optimal hyperparameter k
    keys = list(error_dict.keys())
    min_key = keys[0]
    min_avg = sum(error_dict[min_key])/len(error_dict[min_key])
    for k in keys[1:]:
        avg = sum(error_dict[k])/len(error_dict[k])
        if (avg < min_avg):
            min_key = k
            min_avg = avg
    return min_key

In [4]:
def standardize_X(X):
    # Avoid modifying view of dataframe
    X_std = X.copy()
    X_std[X_std.columns] = StandardScaler().fit_transform(X_std)
    return X_std

In [5]:
def processInput(index, is_save_error_dict=False):
    # DataFrame of the current window
    rolling_df = df.iloc[index-window_size:index]
    # error_dict to save all errors during cross-validation
    #   error_dict[key]:
    #     key    = hyper-parameter
    #     value  = list of all errors associated with the same hyperparameter
    error_dict = {}
    # initialize error_dict
    for k in hyper_param_lst:
        error_dict[k] = []
    
    # Generator dependent variables
    y = rolling_df[response_col]
    # Generate predictor(s)
    X = rolling_df[predictor_col_lst]
    
    # Time-series Expanding Cross-validation
    #    about 30% test, 70% train
    #    multiply window_size by test percent
    tscv = TimeSeriesSplit(test_size=1, n_splits=round(window_size*0.1))
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = X_train.dropna()
        y_train = y_train.loc[X_train.index]
        # Standardize X_train and X_test
        std_X = standardize_X_train_test(X_train, X_test)
        X_train = std_X["train"]
        X_test = std_X["test"]
        for k in hyper_param_lst:
            # Fit the model with a given k
            model = model_constructor(k).fit(X_train, y_train)
            # Model Testing
            prediction = model.predict(pd.DataFrame(X_test).T)[0]
            actual_value = y_test.values[0]
            abs_pct_error = abs( (prediction - actual_value)/actual_value )
            error_dict[k].append(abs_pct_error)
    # Find optimal hyperparameter k
    optimal_k = find_optimal_k(error_dict)
    # Generate Prediction
    #  1) Fit Model with the optimal k
    X = X.dropna()
    y = y.loc[X.index]
    X = standardize_X(X)
    model = model_constructor(optimal_k).fit(X, y)
    # 2) Generate prediction
    predictor = pd.DataFrame(X.iloc[-1]).T
    prediction = model.predict(predictor)[0]
    # Return the function
    print("Index:",index,"|","k:", optimal_k,"|","Prediction:",prediction)
    if (is_save_error_dict):
        rt = {"index":index,"Prediction":prediction,"k":optimal_k,"error_dict":error_dict}
    else:
        rt = {"index":index,"Prediction":prediction,"k":optimal_k}
    return rt

# KNN Regression

In [6]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import joblib
num_CPU = joblib.cpu_count()
print(num_CPU)

8


In [7]:
df = pd.read_csv("../../Data/5-min-dataset.csv")
df

Unnamed: 0,day,minute interval,5-min-price,5-min-pct_vol_buy,5-min-amount,minutes,10-min-price,10-min-pct,10-min-vol,30-min-price,...,60-min-vol,nqbtc_change,Reddit Post,Reddit Comments,Google Search,News,YouTube,Twitter,Wiki,response
0,1,0.0,,,,,,,,,...,,,,,,,,,,41314.920468
1,1,1.0,41314.920468,0.291280,0.150595,0.0,41381.881989,0.317189,0.136113,41473.234697,...,0.119213,-2499.3,210.0,3067.0,81.0,69.0,74.0,103.03,12926.0,41511.249825
2,1,2.0,41511.249825,0.395266,0.108303,5.0,41381.881989,0.317189,0.136113,41473.234697,...,0.119213,-2499.3,210.0,3067.0,81.0,69.0,74.0,103.03,12926.0,41486.174681
3,1,3.0,41486.174681,0.282159,0.113543,10.0,41506.276422,0.325864,0.112773,41473.234697,...,0.119213,-2499.3,210.0,3067.0,81.0,69.0,74.0,103.03,12926.0,41531.763676
4,1,4.0,41531.763676,0.393237,0.112326,15.0,41506.276422,0.325864,0.112773,41473.234697,...,0.119213,-2499.3,210.0,3067.0,81.0,69.0,74.0,103.03,12926.0,41628.766845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8923,31,283.0,47284.570836,0.656551,0.092480,1410.0,47286.428784,0.672923,0.071151,47248.769702,...,0.069706,-258.8,165.0,2184.0,70.0,10.0,84.0,85.17,11297.0,47288.792279
8924,31,284.0,47288.792279,0.721789,0.041436,1415.0,47286.428784,0.672923,0.071151,47248.769702,...,0.069706,-258.8,165.0,2184.0,70.0,10.0,84.0,85.17,11297.0,47252.781186
8925,31,285.0,47252.781186,0.539705,0.101038,1420.0,47259.143672,0.595719,0.084160,47248.769702,...,0.069706,-258.8,165.0,2184.0,70.0,10.0,84.0,85.17,11297.0,47266.287352
8926,31,286.0,47266.287352,0.693403,0.062425,1425.0,47259.143672,0.595719,0.084160,47248.769702,...,0.069706,-258.8,165.0,2184.0,70.0,10.0,84.0,85.17,11297.0,47244.202044


In [None]:
#predictor_col_lst = [ '5-min-price', '5-min-pct_vol_buy',
#       '5-min-amount','10-min-price', '10-min-pct', '10-min-vol',
#       '30-min-price', '30-min-pct', '30-min-vol', '60-min-price',
#       '60-min-pct', '60-min-vol', 'nqbtc_change', 'Reddit Post',
#       'Reddit Comments', 'Google Search', 'News', 'YouTube', 'Twitter',
#       'Wiki'

In [8]:
response_col = "response"
file_name = "5-min-avg-KNN-past-price-only.csv"
predictor_col_lst = [ '5-min-price']
window_size = int(2*24*60/5)
print("window size:",window_size)
hyper_param_lst = [2,3,4,5,6]
print("Hyperparameters:",hyper_param_lst)

window size: 576
Hyperparameters: [2, 3, 4, 5, 6]


In [9]:
def model_constructor(k):
    return KNeighborsRegressor(n_neighbors=k)

In [10]:
results = Parallel(n_jobs=num_CPU)(delayed(processInput)(index) for index in range(window_size, len(df)))
output = pd.DataFrame(results)
output.to_csv(file_name)

Index: 587 | k: 6 | Prediction: 39691.36076039912
Index: 600 | k: 6 | Prediction: 38929.43448903113
Index: 616 | k: 6 | Prediction: 38959.8730288473
Index: 631 | k: 6 | Prediction: 38766.58246134076
Index: 645 | k: 6 | Prediction: 38215.27297989871
Index: 660 | k: 2 | Prediction: 38354.39620310284
Index: 676 | k: 2 | Prediction: 38416.86174014755
Index: 690 | k: 2 | Prediction: 38506.27184839287
Index: 704 | k: 6 | Prediction: 38600.732480337276
Index: 720 | k: 6 | Prediction: 38494.53160052706
Index: 736 | k: 6 | Prediction: 38392.74860709307
Index: 752 | k: 6 | Prediction: 38624.90107202735
Index: 768 | k: 6 | Prediction: 38110.041419291876
Index: 784 | k: 6 | Prediction: 38374.576520724404
Index: 800 | k: 6 | Prediction: 37966.95628683584
Index: 816 | k: 6 | Prediction: 38130.80902105854
Index: 832 | k: 3 | Prediction: 38257.03076594546
Index: 848 | k: 3 | Prediction: 38446.550553664194
Index: 864 | k: 3 | Prediction: 38250.18562836689
Index: 880 | k: 3 | Prediction: 38436.859621636

<hr>

# SVR

In [14]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import joblib
num_CPU = joblib.cpu_count()
print(num_CPU)

16


In [27]:
response_col = "response"
file_name = "5-min-avg-SVR-past-price-only.csv"
predictor_col_lst = [ '5-min-price']
window_size = int(2*24*60/5)
print("window size:",window_size)
hyper_param_lst = [1,2,3]
print("Hyperparameters:",hyper_param_lst)

window size: 576
Hyperparameters: [1, 2, 3]


In [28]:
def model_constructor(k):
    return SVR(kernel = "poly", degree=k, max_iter=100000)

In [None]:
results = Parallel(n_jobs=num_CPU)(delayed(processInput)(index) for index in range(window_size, len(df)))
output = pd.DataFrame(results)
output.to_csv(file_name)



<hr>

# Decision Tree

In [18]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import TimeSeriesSplit
from joblib import Parallel, delayed
import joblib
num_CPU = joblib.cpu_count()
print(num_CPU)

16


In [19]:
response_col = "response"
file_name = "5-min-avg-DT-price-only.csv"
predictor_col_lst = [ '5-min-price']
window_size = int(2*24*60/5)
print("window size:",window_size)
hyper_param_lst = [1,2,3,4,5,6,7]
print("Hyperparameters:",hyper_param_lst)

window size: 576
Hyperparameters: [1, 2, 3, 4, 5, 6, 7]


In [20]:
def model_constructor(k):
    return DecisionTreeRegressor(random_state=0, max_depth=k)

In [21]:
results = Parallel(n_jobs=num_CPU)(delayed(processInput)(index) for index in range(window_size, len(df)))
output = pd.DataFrame(results)
output.to_csv(file_name)

Index: 578 | k: 2 | Prediction: 39219.31242156841
Index: 593 | k: 2 | Prediction: 39166.269576171304
Index: 608 | k: 2 | Prediction: 39133.86066454976
Index: 624 | k: 3 | Prediction: 38915.03193969028
Index: 640 | k: 4 | Prediction: 38170.54066829695
Index: 656 | k: 6 | Prediction: 38263.818186977835
Index: 672 | k: 7 | Prediction: 38581.54624171216
Index: 688 | k: 4 | Prediction: 38563.62000957141
Index: 704 | k: 3 | Prediction: 38475.43893287115
Index: 720 | k: 3 | Prediction: 38497.78769290372
Index: 736 | k: 3 | Prediction: 38445.82468087945
Index: 752 | k: 3 | Prediction: 38474.03726268381
Index: 768 | k: 3 | Prediction: 38266.282289861745
Index: 784 | k: 3 | Prediction: 38252.58318274367
Index: 800 | k: 6 | Prediction: 37947.215792464216
Index: 816 | k: 4 | Prediction: 38062.379268541314
Index: 832 | k: 3 | Prediction: 38103.6089774394
Index: 848 | k: 3 | Prediction: 38367.78102141633
Index: 864 | k: 2 | Prediction: 38263.84740418997
Index: 880 | k: 2 | Prediction: 38646.01044297

<hr>

# Linear Regression

In [22]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from joblib import Parallel, delayed
import joblib
num_CPU = joblib.cpu_count()
print(num_CPU)

16


In [23]:
response_col = "response"
file_name = "5-min-avg-LR-past-price-only.csv"
predictor_col_lst = [ '5-min-price']
window_size = int(2*24*60/5)
print("window size:",window_size)
hyper_param_lst = [0]
print("Hyperparameters:",hyper_param_lst)

window size: 576
Hyperparameters: [0]


In [24]:
def model_constructor(k):
    return LinearRegression()

In [25]:
results = Parallel(n_jobs=num_CPU)(delayed(processInput)(index) for index in range(window_size, len(df)))
output = pd.DataFrame(results)
output.to_csv(file_name)

 8248 | k: 1 | Prediction: 48424.40681464635
Index: 8264 | k: 1 | Prediction: 48439.11160495089
Index: 8279 | k: 1 | Prediction: 48975.825639060175
Index: 8295 | k: 1 | Prediction: 48967.26274836167
Index: 8311 | k: 1 | Prediction: 48962.04165246067
Index: 8327 | k: 1 | Prediction: 48475.15156307444
Index: 8343 | k: 1 | Prediction: 48966.429436432
Index: 8358 | k: 1 | Prediction: 48960.17939914662
Index: 8374 | k: 1 | Prediction: 48477.61126378279
Index: 8390 | k: 1 | Prediction: 48481.48106399603
Index: 8406 | k: 1 | Prediction: 48455.742885356034
Index: 8422 | k: 3 | Prediction: 47983.52851493551
Index: 8438 | k: 3 | Prediction: 47973.66081904025
Index: 8454 | k: 3 | Prediction: 47920.03160304542
Index: 8470 | k: 3 | Prediction: 48045.82550898011
Index: 8486 | k: 3 | Prediction: 48032.62397076534
Index: 8502 | k: 3 | Prediction: 47888.80864109779
Index: 8518 | k: 4 | Prediction: 47745.63312061041
Index: 8534 | k: 3 | Prediction: 47688.08302136102
Index: 8550 | k: 3 | Prediction: 4796