<a href="https://colab.research.google.com/github/liggettla/intro_ML/blob/master/historical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
[This](https://towardsdatascience.com/getting-rich-quick-with-machine-learning-and-stock-market-predictions-696802da94fe) might be a fun example to rework.

# Imports

In [0]:
# import sys
# sys.path.append('/content/drive/My Drive/Finance/turbospoon/util')

%reload_ext autoreload
%autoreload 2

# from util import *
import numpy as np                  
import pandas as pd                 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import tree
import pickle

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def pretty(size='talk'):
    sns.set_palette('pastel')
    sns.set_style('ticks')
    sns.set_context(size)
pretty()

# Dependencies

In [0]:
#!/usr/bin/env python

def historical():
    import pandas as pd
    import pandas_datareader as pdr
    from pandas_datareader.robinhood import RobinhoodHistoricalReader as rhr

    # holdings = pd.read_csv('../personal_data/holdings.csv')

    # get historical closing data
    # start and end do nothing for robinhood atm
    # historical = rhr('QQQ', start='2018-12-15', end='2018-12-18').read()
    historical = rhr('QQQ').read()

    return historical

# this holds daily information
class DailyInfo:

    def __init__(self, historical):
        import pandas as pd

        self.historical = historical
        self.holdings = pd.read_csv('../personal_data/holdings.csv')
        self.shares = float(self.holdings[self.holdings.Stock == 'QQQ'].Shares)

    def pct_change(self):
        return (100 * self.historical.loc[('QQQ')].close_price.astype(float).pct_change()[-1])

    def change(self):
        return (self.historical.loc[('QQQ')].close_price.astype(float)[-1] - self.historical.loc[('QQQ')].close_price.astype(float)[-2]) * self.shares

    def yearly_pct(self):
        return ((self.historical.loc[('QQQ')].close_price.astype(float)[-1] - self.historical.loc[('QQQ')].close_price.astype(float)[0]) / self.historical.loc[('QQQ')].close_price.astype(float)[0]) * 100

    def yearly(self):
        return (self.historical.loc[('QQQ')].close_price.astype(float)[-1] - self.historical.loc[('QQQ')].close_price.astype(float)[0]) * self.shares

    def max_dollar(self):
        return self.historical.loc[('QQQ')].close_price.astype(float).max() * self.shares

    def min_dollar(self):
        return self.historical.loc[('QQQ')].close_price.astype(float).min() * self.shares

    def current_value(self):
        return self.historical.loc[('QQQ')].close_price.astype(float)[-1] * self.shares

    # print out some useful portfolio information
    def summary(self):
        print('Percent Change: %s' % (self.pct_change()))
        print('Dollar Change: %s' % (self.change()))
        print('Yearly Percent Change: %s' % (self.yearly_pct()))
        print('Yearly Dollar Change: %s' % (self.yearly()))
        print('Maximum: %s' % (self.max_dollar()))
        print('Minimum: %s' % (self.min_dollar()))
        print('Current Value: %s' % (self.current_value()))

    # plot 1 year portfolio values
    def one_year(self):
        from matplotlib import pyplot as plt
        import seaborn as sns
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        import seaborn as sns

        sns.set_palette('pastel')
        sns.set_style('ticks')
        sns.set_context('talk')

        temp = self.historical.loc[('QQQ')].copy().reset_index().rename(columns={'begins_at':'Date'})
        temp['Value'] = temp.close_price.astype(float) * self.shares

        plt.figure() # start a figure
        sns.lineplot(x='Date', y='Value', data=temp, color='#bea3ff')
        sns.despine(offset=10, trim=True)
        plt.xticks(rotation=45)
        plt.title('1 Year Portfolio Value\n')

# uses live data to build a quote
class Quote:
    def __init__(self, symbol):
        import pandas as pd
        import urllib, re, json

        self.symbol = symbol
        self.web_data = json.loads(urllib.request.urlopen('https://api.robinhood.com/quotes/?symbols=%s' % (symbol)).read())['results'][0]

        self.holdings = pd.read_csv('../personal_data/holdings.csv')
        self.shares = float(self.holdings[self.holdings.Stock == symbol].Shares)

    def ask_price(self):
        return self.web_data['ask_price']

    def ask_size(self):
        return self.web_data['ask_size']

    def bid_price(self):
        return self.web_data['bid_price']

    def bid_size(self):
        return self.web_data['bid_size']

    def last_trade_price(self):
        return self.web_data['last_trade_price']

    def last_extended_hours_trade_price(self):
        import numpy as np

        if self.web_data['last_extended_hours_trade_price']:
            return self.web_data['last_extended_hours_trade_price']
        else:
            return np.nan

    def previous_close(self):
        return self.web_data['previous_close']

    def adjusted_previous_close(self):
        return self.web_data['adjusted_previous_close']

    def previous_close_date(self):
        return self.web_data['previous_close_date']

    def trading_halted(self):
        return self.web_data['trading_halted']

    def has_traded(self):
        return self.web_data['has_traded']

    def last_trade_price_source(self):
        return self.web_data['last_trade_price_source']

    def percent_change(self):
        return (((float(self.web_data['last_trade_price']) - float(self.web_data['previous_close'])) / float(self.web_data['previous_close']) * 100))

    def percent_change_extended(self):
        import numpy as np

        if self.web_data['last_extended_hours_trade_price']:
            return (((float(self.web_data['last_extended_hours_trade_price']) - float(self.web_data['last_trade_price'])) / float(self.web_data['last_trade_price']) * 100))
        else:
            return np.nan

    def dollar_change(self):
        return self.shares * self.percent_change()

    def dollar_change_extended(self):
        import numpy as np

        if self.web_data['last_extended_hours_trade_price']:
            return self.shares * self.percent_change_extended()
        else:
            return np.nan

    # displays a general quote summary
    def summary(self):
        print('Previous Close: %s' % (self.previous_close()))
        print('Last Trade Price: %s' % (self.last_trade_price()))
        print('Last Trade Extended: %s' % (self.last_extended_hours_trade_price()))
        print('Percent Change: %s' % (self.percent_change()))
        print('Percent Change Extended: %s' % (self.percent_change_extended()))
        print('Dollar Change: %s' % (self.dollar_change()))
        print('Dollar Change Extended: %s' % (self.dollar_change_extended()))

# gets a list of all symbols listed on the nasdaq
def get_nasdaq_symbols(use_old=True):

    # use a previous symbol list
    if use_old:
        import pickle
        p = open('../data/nasdaq_symbols.pkl', 'rb')
        symbols = pickle.load(p)
        p.close()

    else:
        import pandas_datareader.data as web
        from pandas_datareader.nasdaq_trader import get_nasdaq_symbols
        from datetime import date

        # get current list of all nasdaq symbols
        symbols = get_nasdaq_symbols()

        # create file that will be accessed by current code
        import pickle
        p = open('../data/nasdaq_symbols.pkl', 'wb')
        pickle.dump(symbols, p)
        p.close()

        # also create a backup as the symbols are updated daily
        # and historical data is not available from nasdaq
        p = open('../data/nasdaq_symbols_%s%s%s.pkl' % (date.today().day,\
                                                  date.today().month,\
                                                  date.today().year)
                 , 'wb')
        pickle.dump(symbols, p)
        p.close()

    return symbols

# aggreataes 30yr of EOD data for all stocks listed in symbols
def build_personal_database(symbols, use_old=True):

    if use_old:
        import pickle
        p = open('../data/all_data.pkl', 'rb')
        all_data = pickle.load(p)
        p.close()

    else:
        import pandas_datareader as web
        import datetime
        import pandas as pd

        # get 30 years of data if available
        end = datetime.datetime.today()
        start = datetime.date(end.year-30,1,1)

        all_data = pd.DataFrame()

        for symbol in symbols.index:
            try:
                df = web.DataReader(symbol, 'yahoo', start, end)
                df['Symbol'] = symbol
                df = df.reset_index()
                all_data = all_data.append(df, ignore_index=True)
            except:
                pass

        import pickle
        p = open('all_data.pkl', 'wb')
        pickle.dump(all_data, p)
        p.close()


    return all_data

# adds a number of calculations that can be used for machine learning
def classify(all_data, use_old=True):

    if use_old:
        import pickle
        p = open('../data/classified_data.pkl', 'rb')
        all_data = pickle.load(p)
        p.close()

    else:
        # there is a bug that sometime prevents this first method from functioning as expected
        # all_data['Pct_Change'] = all_data.groupby('Symbol', sort=False)['Adj Close'].pct_change()
        all_data['Pct_Change'] = all_data.groupby('Symbol', sort=False)['Adj Close'].apply(lambda x: x.pct_change())

        # shift is used to move the data up or down by row
        # this classifies all recent EOD changes as either pos or neg
        all_data['Up_1d'] = all_data.groupby('Symbol').Pct_Change.shift(1) > 0
        all_data['Up_2d'] = all_data.groupby('Symbol').Pct_Change.shift(2) > 0
        all_data['Up_3d'] = all_data.groupby('Symbol').Pct_Change.shift(3) > 0
        all_data['Up_4d'] = all_data.groupby('Symbol').Pct_Change.shift(4) > 0
        all_data['Up_5d'] = all_data.groupby('Symbol').Pct_Change.shift(5) > 0

        # classifies the desired purchase stocks and dates
        # between 3-4% change is about 2.9% of all EOD data
        all_data['Purchase'] = ((all_data.groupby('Symbol').Pct_Change.shift(-1) > 0.03) & (all_data.groupby('Symbol').Pct_Change.shift(-1) < 0.04))

        # classifies if total price movement is up or down over period of time
        # the formula returns the total change in price over a course of time with 1 being no change
        # explanation here: https://math.stackexchange.com/questions/3062312/calculate-total-percent-change-from-incremental-changes
        pct_change = 1
        for i in range(3):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_3d'] = pct_change
        all_data.Tot_Up_3d = all_data.Tot_Up_3d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(5):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_5d'] = pct_change
        all_data.Tot_Up_5d = all_data.Tot_Up_5d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(10):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_10d'] = pct_change
        all_data.Tot_Up_10d = all_data.Tot_Up_10d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(15):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_15d'] = pct_change
        all_data.Tot_Up_15d = all_data.Tot_Up_15d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(30):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_30d'] = pct_change
        all_data.Tot_Up_30d = all_data.Tot_Up_30d > 1 # if tot percent change > 1 set increase as True

        # save classified data
        import pickle
        p = open('classified_data.pkl', 'wb')
        pickle.dump(all_data, p)
        p.close()

    return all_data

# converts to 1/0 format that is usable by machine learning algorithm
def convert(all_data, use_old=True):
    # useful trick that converting to int will convert bool to 1/0
    all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Purchase','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']] = \
    all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Purchase','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']].astype(int)

    return all_data

# this provides more exhaustive classification
def new_classify(all_data, use_old=True):

    if use_old:
        import pickle
        p = open('../data/classified_data.pkl', 'rb')
        all_data = pickle.load(p)
        p.close()

    else:
        # there is a bug that sometime prevents this first method from functioning as expected
        # all_data['Pct_Change'] = all_data.groupby('Symbol', sort=False)['Adj Close'].pct_change()
        all_data['Pct_Change'] = all_data.groupby('Symbol', sort=False)['Adj Close'].apply(lambda x: x.pct_change())

        # shift is used to move the data up or down by row
        # this classifies all recent EOD changes as either pos or neg for 30 days prior to current day
        for i in range(1,31):
            all_data['Up_%id' % (i)] = all_data.groupby('Symbol').Pct_Change.shift(i) > 0

        # classifies data by how the following day performs
        # between 3-4% change is about 2.9% of all EOD data
        all_data['Purchase'] = ((all_data.groupby('Symbol').Pct_Change.shift(-1) > 0.03) & (all_data.groupby('Symbol').Pct_Change.shift(-1) < 0.04))

        # classifies if total price movement is up or down over period of time
        # the formula returns the total change in price over a course of time with 1 being no change
        # explanation here: https://math.stackexchange.com/questions/3062312/calculate-total-percent-change-from-incremental-changes
        for i in range(1,31):
            pct_change = 1
            for i in range(i):
                pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
            all_data['Tot_Up_%id' % (i)] = pct_change
            all_data['Tot_Up_%id' % (i)] = all_data['Tot_Up_%id' % (i)] > 1 # if tot percent change > 1 set increase as True

        # save classified data
        import pickle
        p = open('classified_data.pkl', 'wb')
        pickle.dump(all_data, p)
        p.close()

    return all_data

# converts to 1/0 format that is usable by machine learning algorithm
def new_convert(all_data, use_old=True):
    # useful trick that converting to int will convert bool to 1/0
    all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Purchase','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']] = \
    all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Purchase','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']].astype(int)

    return all_data

sym_loc = '/content/drive/My Drive/Finance/turbospoon/data/nasdaq_symbols.pkl'
# gets a list of all symbols listed on the nasdaq
def get_nasdaq_symbols(loc=sym_loc, use_old=True):

    # use a previous symbol list
    if use_old:
        import pickle
        p = open(loc, 'rb')
        symbols = pickle.load(p)
        p.close()

    else:
        import pandas_datareader.data as web
        from pandas_datareader.nasdaq_trader import get_nasdaq_symbols
        from datetime import date

        # get current list of all nasdaq symbols
        symbols = get_nasdaq_symbols()

        # create file that will be accessed by current code
        import pickle
        p = open('../data/nasdaq_symbols.pkl', 'wb')
        pickle.dump(symbols, p)
        p.close()

        # also create a backup as the symbols are updated daily
        # and historical data is not available from nasdaq
        p = open('../data/nasdaq_symbols_%s%s%s.pkl' % (date.today().day,\
                                                  date.today().month,\
                                                  date.today().year)
                 , 'wb')
        pickle.dump(symbols, p)
        p.close()

    return symbols

# aggreataes 30yr of EOD data for all stocks listed in symbols
def build_personal_database(symbols, data, use_old=True):

    if use_old:
        import pickle
        p = open(data, 'rb')
        all_data = pickle.load(p)
        p.close()

    else:
        import pandas_datareader as web
        import datetime
        import pandas as pd

        # get 30 years of data if available
        end = datetime.datetime.today()
        start = datetime.date(end.year-30,1,1)

        all_data = pd.DataFrame()

        for symbol in symbols.index:
            try:
                df = web.DataReader(symbol, 'yahoo', start, end)
                df['Symbol'] = symbol
                df = df.reset_index()
                all_data = all_data.append(df, ignore_index=True)
            except:
                pass

        import pickle
        p = open('all_data.pkl', 'wb')
        pickle.dump(all_data, p)
        p.close()


    return all_data

# adds a number of calculations that can be used for machine learning
def classify(all_data, data, use_old=True):

    if use_old:
        import pickle
        p = open(data, 'rb')
        all_data = pickle.load(p)
        p.close()

    else:
        # there is a bug that sometime prevents this first method from functioning as expected
        # all_data['Pct_Change'] = all_data.groupby('Symbol', sort=False)['Adj Close'].pct_change()
        all_data['Pct_Change'] = all_data.groupby('Symbol', sort=False)['Adj Close'].apply(lambda x: x.pct_change())

        # shift is used to move the data up or down by row
        # this classifies all recent EOD changes as either pos or neg
        all_data['Up_1d'] = all_data.groupby('Symbol').Pct_Change.shift(1) > 0
        all_data['Up_2d'] = all_data.groupby('Symbol').Pct_Change.shift(2) > 0
        all_data['Up_3d'] = all_data.groupby('Symbol').Pct_Change.shift(3) > 0
        all_data['Up_4d'] = all_data.groupby('Symbol').Pct_Change.shift(4) > 0
        all_data['Up_5d'] = all_data.groupby('Symbol').Pct_Change.shift(5) > 0

        # classifies the desired purchase stocks and dates
        # between 3-4% change is about 2.9% of all EOD data
        all_data['Purchase'] = ((all_data.groupby('Symbol').Pct_Change.shift(-1) > 0.03) & (all_data.groupby('Symbol').Pct_Change.shift(-1) < 0.04))

        # classifies if total price movement is up or down over period of time
        # the formula returns the total change in price over a course of time with 1 being no change
        # explanation here: https://math.stackexchange.com/questions/3062312/calculate-total-percent-change-from-incremental-changes
        pct_change = 1
        for i in range(3):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_3d'] = pct_change
        all_data.Tot_Up_3d = all_data.Tot_Up_3d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(5):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_5d'] = pct_change
        all_data.Tot_Up_5d = all_data.Tot_Up_5d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(10):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_10d'] = pct_change
        all_data.Tot_Up_10d = all_data.Tot_Up_10d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(15):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_15d'] = pct_change
        all_data.Tot_Up_15d = all_data.Tot_Up_15d > 1 # if tot percent change > 1 set increase as True

        pct_change = 1
        for i in range(30):
            pct_change = pct_change * (1 + 1 * all_data.groupby('Symbol').Pct_Change.shift(i))
        all_data['Tot_Up_30d'] = pct_change
        all_data.Tot_Up_30d = all_data.Tot_Up_30d > 1 # if tot percent change > 1 set increase as True

        # save classified data
        import pickle
        p = open('classified_data.pkl', 'wb')
        pickle.dump(all_data, p)
        p.close()

    return all_data

In [0]:
# get upgraded to a high RAM session
def more_ram():
    a = []
    while(1):
        a.append('1')

In [0]:
# from google.colab import drive
# drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Get Historical Data

In [0]:
# get nasdaq symbols
symbols = get_nasdaq_symbols(use_old=True)

# build personal EOD database
data = '/content/drive/My Drive/Finance/turbospoon/data/all_data.pkl'
all_data = build_personal_database(symbols, data, use_old=True)

# classify data
data = '/content/drive/My Drive/Finance/turbospoon/data/classified_data.pkl'
all_data = classify(all_data, data, use_old=True)

# convert to dummy variables
all_data = convert(all_data, use_old=True)
#dummies = pd.get_dummies(all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']])

### Train Model

In [0]:
clf = tree.DecisionTreeClassifier()
clf_train = clf.fit(all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']],
                    all_data['Purchase'])

### Query Model

In [0]:
prediction = clf_train.predict([[0,0,0,1,1,0,0,1,1,1]])
prediction[0]

array([0])

### Test Model Usefulness

In [0]:
# prediction function
#f = lambda x: clf_train.predict([[x.Up_1d, x.Up_2d, x.Up_3d, x.Up_4d, x.Up_5d, x.Tot_Up_3d, x.Tot_Up_5d, x.Tot_Up_10d, x.Tot_Up_15d, x.Tot_Up_30d]])[0]
#temp['Prediction'] = temp.apply(f)
'''
temp['Prediction'] = clf_train.predict([[temp.Up_1d, temp.Up_2d, temp.Up_3d,
                                           temp.Up_4d, temp.Up_5d, temp.Tot_Up_3d,
                                           temp.Tot_Up_5d, temp.Tot_Up_10d, temp.Tot_Up_15d,
                                           temp.Tot_Up_30d]])[0]
                                           '''
temp['Prediction'] = str(temp['Up_1d'])

### Display

In [0]:
dot_data = tree.export_graphviz(clf, filled=True, rounded=True)
graph = graphviz.Source(dot_data)

NameError: name 'graphviz' is not defined

### Todo
Have a look at this:  
`all_data[all_data.Symbol == 'AAPL'].tail(10)`  
It does not seem that the classification of the Tot_Up_3d is correct on the 26th of December.

In [0]:
all_data[['Up_1d','Up_2d','Up_3d','Up_4d','Up_5d','Tot_Up_3d','Tot_Up_5d','Tot_Up_10d','Tot_Up_15d','Tot_Up_30d']].head()

In [0]:
x = 10000

for i in range(3 * 365):
    x = x * 1.01
print(x)

539391740.5144053


# KNN Clustering
The first thing being done below is to see if there is any noticeable clustering of different classifications and `purchase` status.

In [99]:
names = ['GOOG','AAPL','GE','SBUX','GS']
dat = pd.DataFrame()
purchase = pd.Series()

for name in names:

    dat = dat.append(all_data[all_data.Symbol == name][[
                                                        'Up_1d','Up_2d','Up_3d','Up_4d','Up_5d',
                                                        'Tot_Up_3d','Tot_Up_5d','Tot_Up_10d',
                                                        'Tot_Up_15d','Tot_Up_30d']].head(5000).tail(4500),
                     ignore_index=True)
    purchase = purchase.append(all_data[all_data.Symbol == name].Purchase.head(5000).tail(4500),
                               ignore_index=True)

X_train, X_test, y_train, y_test = train_test_split(
    dat, purchase, random_state=0)
print(X_train[:5], y_train[:5])
print(X_test[:5], y_test[:5])

  This is separate from the ipykernel package so we can avoid doing imports until


       Up_1d  Up_2d  Up_3d  ...  Tot_Up_10d  Tot_Up_15d  Tot_Up_30d
4688       0      0      0  ...           0           0           0
15715      0      1      1  ...           1           1           1
12352      0      1      1  ...           1           1           1
11582      0      0      1  ...           1           1           1
16761      1      1      0  ...           1           1           1

[5 rows x 10 columns] 4688     0
15715    0
12352    0
11582    0
16761    0
dtype: int64
       Up_1d  Up_2d  Up_3d  ...  Tot_Up_10d  Tot_Up_15d  Tot_Up_30d
4808       0      0      0  ...           1           1           1
19508      0      1      0  ...           0           0           1
4309       0      1      0  ...           0           0           1
2604       1      1      1  ...           0           0           0
1783       1      1      0  ...           1           1           0

[5 rows x 10 columns] 4808     0
19508    0
4309     0
2604     0
1783     0
dtype: int64


In [81]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [89]:
y_pred = knn.predict(X_test)
#print(y_pred)
#print(y_test)
print(np.mean(y_pred == y_test))
# print(knn.score(X_test, y_test))

[0 0 0 ... 0 0 0]
0.9385640266469282


In [0]:
y_pred = pd.Series(y_pred)
y_test = pd.Series(y_test)