In [59]:
import json
import numpy as np
from pprint import pprint
from tabulate import tabulate
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

with open('nbrb.json') as file:    
    byns = json.load(file)

with open('cbrf.json') as file:    
    rubs = json.load(file)

byns_by_date = {}
for byn in byns:
    byns_by_date[byn['date']] = byn
    
rubs_by_date = {}
for rub in rubs:
    rubs_by_date[rub['date']] = rub

matrix = []
    
for date in byns_by_date:
    if not date in rubs_by_date:
        continue
    byn = byns_by_date[date]
    rub = rubs_by_date[date]
    row = [date, byn['eur'], byn['rub'], rub['usd'], byn['usd']]
    matrix.append(row)
            
matrix.sort(key=lambda x: x[0])
extended_matrix = []

for i, row in enumerate(matrix[1:]):
    prev_row = matrix[i]
    extended_matrix.append([
        row[0], 
        row[1], 
        row[2], 
        row[3], 
        row[1] - prev_row[1], 
        row[2] - prev_row[2], 
        row[3] - prev_row[3], 
        row[4], 
        row[4] > prev_row[4]])

matrix = extended_matrix
headers =['date', 'byn/eur', 'byn/rub', 'rub/usd', 'Δbyn/eur', 'Δbyn/rub', 'Δrub/usd', 'byn/usd', 'byn/usd up?']

data = []
y = []

for i, future_row in enumerate(matrix[1:]):
    row = matrix[i]
    data.append(row[1:-2])
    y.append(future_row[-1])
    
X = np.matrix(data)
Y = np.ravel(np.matrix(y).T)

print(tabulate(matrix, headers=headers))

date           byn/eur    byn/rub    rub/usd     Δbyn/eur    Δbyn/rub    Δrub/usd     byn/usd    byn/usd up?
----------  ----------  ---------  ---------  -----------  ----------  ----------  ----------  -------------
2000.01.06    332         11.85      26.9          9.3        -0.02       -0.1       320                   0
2000.01.07    332         11.85      27.23         0           0           0.33      320                   0
2000.01.11    334.67      11.72      27.73         2.67       -0.13        0.5       325                   1
2000.01.12    338.1       11.53      28.44         3.43       -0.19        0.71      328                   1
2000.01.13    340.87      11.44      28.85         2.77       -0.09        0.41      330                   1
2000.01.14    341.81      11.59      28.65         0.94        0.15       -0.2       332                   1
2000.01.15    341.81      11.59      28.57         0           0          -0.08      332                   0
2000.01.18    341.3

In [67]:

print(Y.shape, X.shape)

train_count = 3200

train_X = X[:-train_count]
train_Y = Y[:-train_count]

test_X = X[train_count:]
test_Y = Y[train_count:]

print('% of always true guesses: ', len([x for x in test_Y if x]) / len(test_Y))

def calculate_guesses_percentage(expected, predicted):
    count = 0
    for i, v in enumerate(expected):
        count += int(expected[i] == predicted[i])
    return count / len(expected)

def test(clf, name, test_X=test_X):
    print(name + '\n% of guesses: ', calculate_guesses_percentage(test_Y, clf.predict(test_X)))

clf = DecisionTreeClassifier()
clf = clf.fit(train_X, train_Y)

test(clf, 'Decision tree')

for n in [10, 20, 30, 40, 50, 60, 70, 80, 100, 200]:
    clf = RandomForestClassifier(n_estimators=n, max_depth=None, min_samples_split=2, random_state=0)
    clf = clf.fit(train_X, train_Y)
    
    test(clf, 'Random forest ({} trees)'.format(n))

    
clf = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
clf = clf.fit(train_X, train_Y)

test(clf, 'Extra random forest (50 trees)')

(4191,) (4191, 6)
% of always true guesses:  0.5055499495459133
Decision tree
% of guesses:  0.5045408678102926
Random forest (10 trees)
% of guesses:  0.5005045408678103
Random forest (20 trees)
% of guesses:  0.48940464177598386
Random forest (30 trees)
% of guesses:  0.49848637739656915
Random forest (40 trees)
% of guesses:  0.4883955600403633
Random forest (50 trees)
% of guesses:  0.5105953582240161
Random forest (60 trees)
% of guesses:  0.5015136226034309
Random forest (70 trees)
% of guesses:  0.5045408678102926
Random forest (80 trees)
% of guesses:  0.5055499495459133
Random forest (100 trees)
% of guesses:  0.5065590312815338
Random forest (200 trees)
% of guesses:  0.5025227043390514
Extra random forest (50 trees)
% of guesses:  0.5035317860746721


In [62]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
clf = GaussianNB()
clf.fit(train_X, train_Y)
test(clf, 'GaussianNB', test_X)

clf = BernoulliNB()
clf.fit(train_X, train_Y)
test(clf, 'BernoulliNB', test_X)

clf = LogisticRegression()
clf.fit(train_X, train_Y)
test(clf, 'LogisticRegression', test_X)

GaussianNB
% of guesses:  0.520686175580222
BernoulliNB
% of guesses:  0.5055499495459133
LogisticRegression
% of guesses:  0.5055499495459133


In [63]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(train_X, train_Y)
test(clf, 'KNeighborsClassifier', test_X)

KNeighborsClassifier
% of guesses:  0.5065590312815338


In [64]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(train_X, train_Y)
test(clf, 'SVC', test_X)

SVC
% of guesses:  0.5055499495459133


In [114]:
from sklearn.cluster import KMeans, MeanShift
est = KMeans(n_clusters=2)
est.fit(X)
print("KMeans: ", calculate_guesses_percentage(Y, est.labels_))

est = MeanShift(bandwidth=1000, bin_seeding=True)
est.fit(X)

clusters = dict()
for i, label in enumerate(est.labels_):
    if not label in clusters:
        clusters[label] = [0, 0]
    clusters[label][1] += 1
    clusters[label][0] += int(True == Y[i])
    
average_perc = 0
for label in clusters:
    perc = clusters[label][0] / clusters[label][1]
    print(max(perc, 1 - perc))
    average_perc += max(perc, 1 - perc) / len(clusters)
print("MeanShift: ", average_perc)

KMeans:  0.5712240515390121
0.7152608483666505
0.6343713956170703
0.6083333333333334
0.5796178343949044
0.5266666666666666
0.5904761904761905
0.5656565656565656
0.5398230088495575
1.0
1.0
1.0
1.0
1.0
1.0
MeanShift:  0.7685861316686383
