In [1]:
from stockDatabase import StockDatabase
from stockplot.models import Stock, StockData, Depot
import Analyze_ValueDB
import stockDepot
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import numpy as np

%matplotlib

Using matplotlib backend: TkAgg


### Features

- Which features can be used to predict stock price change over the next n days? I.e. does the price increase or decrease?

- mostly features independent of the stock price (almost scaled)
- PPO (Percentage Price Oscillator)
- net change
- percent change
- 10, 20, 30 day volatility
- EMA (Exponential Moving Average): positive cross, negative cross, higher -> find times when a shorter EMA crosses a longer one
- Find when price crosses EMA or is higher than EMA
- Find when EMA of MACD crosses MACD or is higher (MACD = Moving Average Convergence Divergence)
- same for PPO
- find when price is higher than upper bollinger band, average one or lower than the lower band
- Money Flow Index (MFI)
- considered overbought / very overbought if MFI > 80 / 90
- considered oversold / very oversold if MFI < 20 / 10++

- fundamentals:
- market capitalization
- PCF (price / cash flow ratio)
- PSR (price / sales ratio)
- P/B ratio
- EPSD (Earnings Per Share Net Income Diluted)
- Current ratio (liquidity ratio measuring a company's ability to pay short term and long term obligations)
- Quick ratio (indicator of a company's short term liquidity)


- not yet implemented: (http://cs229.stanford.edu/proj2013/DaiZhang-MachineLearningInStockPriceTrendForecasting.pdf)
    - PE ratio
    - PX volume
    - PX ebitda
    - current enterprise value
    - quick ratio
    - alpha overridable
    - alpha for beta pm
    - beta raw overridable
    - risk premium
    - IS EPS
    - corresponding S&P 500 index

- not yet implemented: (https://arxiv.org/pdf/1603.00751.pdf)
    - Book value - the net asset value of a company, calculated by total assets minus intangible assets (patents, goodwill) and liabilities.
    - Market capitalization - the market value of a company's issued share capital; it is equal to the share price times the number of shares outstanding.
    - Change of stock Net price over the one month period
    - Percentage change of Net price over the one month period
    - Dividend yield - indicates how much a company pays out in dividends each year relative to its share price.
    - Earnings per share - a portion of a company's profit divided by the number of issued shares. Earnings per share serves as an indicator of a company's profitability.
    - Earnings per share growth – the growth of earnings per share over the trailing one-year period.
    - Sales revenue turnover -
    - Net revenue - the proceeds from the sale of an asset, minus commissions, taxes, or other expenses related to the sale.
    - Net revenue growth – the growth of Net revenue over the trailing one-year period.
    - Sales growth – sales growth over the trailing one-year period.
    - Price to earnings ratio – measures company’s current share price relative to its per-share earnings.
    - Price to earnings ratio, five years average – averaged price to earnings ratio over the period of five years.
    - Price to book ratio - compares a company's current market price to its book value.
    - Price to sales ratio – ratio calculated by dividing the company's market cap by the revenue in the most recent year.
    - Dividend per share - is the total dividends paid out over an entire year divided by the number of
    - ordinary shares issued.
    - Current ratio - compares a firm's current assets to its current liabilities.
    - Quick ratio - compares the total amount of cash, marketable securities and accounts receivable to the amount of current liabilities.
    - Total debt to equity - ratio used to measure a company's financial leverage, calculated by dividing a company's total liabilities by its stockholders' equity.
    - Analyst ratio – ratio given by human analyst.
    - Revenue growth adjusted by 5 year compound annual growth ratio
    - Profit margin – a profitability ratio calculated as net income divided by revenue, or net profits divided by sales
    - Operating margin - ratio used to measure a company's pricing strategy and operating efficiency. It is a measurement of what proportion of a company's revenue is left over after paying for variable costs of production such as wages, raw materials, etc.
    - Asset turnover - the ratio of the value of a company’s sales or revenues generated relative to the value of its assets

Source: https://www.cs.princeton.edu/sites/default/files/uploads/saahil_madge.pdf
- when deciding to buy or not, use past of stock to determine accuracy of machine learning algorithm for that particular stock

Source, master thesis: http://www.diva-portal.org/smash/get/diva2:354463/FULLTEXT01.pdf

Source, MIT based on earnings reports: http://ocw.mit.edu/courses/sloan-school-of-management/15-097-prediction-machine-learning-and-statistics-spring-2012/projects/MIT15_097S12_proj2.pdf

Source, Lehman: https://www.cis.upenn.edu/~mkearns/papers/rlexec.pdf

Source, Github App: https://github.com/DMTSource/daily-stock-forecast

In [2]:
def crossing(feature1, feature2):
    # create feature for feature ema crossing feature
    poscross = [0] # positive cross
    negcross = [0] # negative cross
    higher = [0] # ema higher than feature

    for i in range (1, len(feature1)):
        # check if ema is crossing feature
        if feature1[i-1] < feature2[i-1] and feature1[i] > feature2[i]:
            poscross.append(1)
            negcross.append(0)
        elif feature1[i-1] > feature2[i-1] and feature1[i] < feature2[i]:
            poscross.append(0)
            negcross.append(1)
        else:
            poscross.append(0)
            negcross.append(0)
            
        # check if ema is higher or lower than feature
        if feature1[i] > feature2[i]:
            higher.append(1)
        else:
            higher.append(0)
    return poscross, negcross, higher

In [3]:
from math import sqrt

def createFeatures(df):
    
    data = df['Price'].values
    
    # price change features
    netchange = [0]
    netchange += [data[i]-data[i-1] for i in range(1, len(data))]
    percentchange = [netchange[i]/data[i] * 100 for i in range(0, len(data))]

    # volatility based on price change
    sqrt254 = sqrt(254) # square root of yearly trading days
    volatility10 = [0] * 10
    volatility20 = [0] * 20
    volatility30 = [0] * 30
    volatility10 += [sqrt254 * np.std(percentchange[i-10:i]) for i in range(10, len(data))]
    volatility20 += [sqrt254 * np.std(percentchange[i-20:i]) for i in range(20, len(data))]
    volatility30 += [sqrt254 * np.std(percentchange[i-30:i]) for i in range(30, len(data))]

    # add features to dataframe
    df['featPPO'] = df['PPO']
    df['featMFI'] = df['MFI']
    df['Netchange'] = netchange
    df['Percentage Change'] = percentchange
    df['10 day volatility'] = volatility10
    df['20 day volatility'] = volatility20
    df['30 day volatility'] = volatility30

    # check if shorter EMA crosses a longer one
    df['EMA 15 / 30 posx'], df['EMA 15 / 30 negx'], df['EMA 15 / 30 higher'] = crossing(df['EMA 15'].values, df['EMA 30'].values)

    df['EMA 15 / 50 posx'], df['EMA 15 / 50 negx'], df['EMA 15 / 50 higher'] = crossing(df['EMA 15'].values, df['EMA 50'].values)

    df['EMA 15 / 100 posx'], df['EMA 15 / 100 negx'], df['EMA 15 / 100 higher'] = crossing(df['EMA 15'].values, df['EMA 100'].values)

    df['EMA 15 / 200 posx'], df['EMA 15 / 200 negx'], df['EMA 15 / 200 higher'] = crossing(df['EMA 15'].values, df['EMA 200'].values)

    df['EMA 30 / 50 posx'], df['EMA 30 / 50 negx'], df['EMA 30 / 50 higher'] = crossing(df['EMA 30'].values, df['EMA 50'].values)

    df['EMA 30 / 100 posx'], df['EMA 30 / 100 negxx'], df['EMA 30 / 100 higher'] = crossing(df['EMA 30'].values, df['EMA 100'].values)

    df['EMA 30 / 200 posx'], df['EMA 30 / 200 negx'], df['EMA 30 / 200 higher'] = crossing(df['EMA 30'].values, df['EMA 200'].values)

    df['EMA 50 / 100 posx'], df['EMA 50 / 100 negx'], df['EMA 50 / 100 higher'] = crossing(df['EMA 50'].values, df['EMA 100'].values)

    df['EMA 50 / 200 posx'], df['EMA 50 / 200 negx'], df['EMA 50 / 200 higher'] = crossing(df['EMA 50'].values, df['EMA 200'].values)

    df['EMA 100 / 200 pox'], df['EMA 100 / 200 negx'],df['EMA 100 / 200 higher'] = crossing(df['EMA 100'].values, df['EMA 200'].values)


    # check if price crosses an EMA

    df['Price / EMA 15 posx'], df['Price / EMA 15 negx'], df['Price / EMA 15 higher'] = crossing(df['Price'].values, df['EMA 15'].values)

    df['Price / EMA 30 posx'], df['Price / EMA 30 negx'], df['Price / EMA 30 higher'] = crossing(df['Price'].values, df['EMA 30'].values)

    df['Price / EMA 50 posx'], df['Price / EMA 50 negx'], df['Price / EMA 50 higher'] = crossing(df['Price'].values, df['EMA 50'].values)

    df['Price / EMA 100 posx'], df['Price / EMA 100 negx'], df['Price / EMA 100 higher'] = crossing(df['Price'].values, df['EMA 100'].values)

    df['Price / EMA 200 posx'], df['Price / EMA 200 negx'], df['Price / EMA 200 higher'] = crossing(df['Price'].values, df['EMA 200'].values)

    # check if EMA of MACD crosses MACD
    df['MACD posx'], df['MACD negx'], df['MACD higher'] = crossing(df['MACD EMA'].values, df['MACD'].values)

    # check if EMA of PPO crosses PPO
    df['PPO posx'] , df['PPO negx'],  df['PPO higher'] = crossing(df['PPO EMA'].values, df['PPO'].values)
    
    
    
    # create features for bollinger
    higheraverage = [] # price higher than average bollinger band
    higherhigh = [] # price higher than upper bollinger band
    lowerlow = [] # price lower than lower bollinger band

    bollinger = df['Bollinger'].values
    bollingerhigh = df['Bollinger high'].values
    bollingerlow = df['Bollinger low'].values
    for i in range (0, len(data)):
            # check if price is higher than Bollinger
            if data[i] > bollinger[i]:
                higheraverage.append(1)
            else:
                higheraverage.append(0)

            if data[i] > bollingerhigh[i]:
                higherhigh.append(1)
            else:
                higherhigh.append(0)

            if data[i] < bollingerlow[i]:
                lowerlow.append(1)
            else:
                lowerlow.append(0)

    df['Bollinger higher'] = higheraverage
    df['Bollinger higher high'] = higherhigh
    df['Bollinger lower low'] = lowerlow
    
    # create features for Money Flow Index:
    strongoverbought = overbought = oversold = strongoversold = [0] * len(df)
    MFI = df['MFI'].values
    for i in range(0, len(df)):
        if MFI[i] > 90:
            strongoverbought[i] = 1
            overbought[i] = 1
        if MFI[i] > 80:
            overbought[i] = 1
        elif MFI[i] < 10:
            oversold[i] = 1
            strongoversold[i] = 1
        elif MFI[i] < 20:
            oversold[i] = 1

    df['MFI strong overbought'] = strongoverbought
    df['MFI overbought'] = overbought
    df['MFI oversold'] = oversold
    df['MFI strong oversold'] = strongoversold
    
        
    return df

### Prediction Features

- What should be predicted?
- Here: is the price higher or lower the next n days? (1, 5, 10, 20, 50, 100)

In [52]:
# predict if price is higher the next n days.

def predictY(df):
    
    price = df['Price'].values
    
    lower50 = [0]*len(price)    
    higher50_0 = [0]*len(price)
    #higher50_3 = [0]*len(price)
    #higher50_5 = [0]*len(price)
    higher50_10 = [0]*len(price)
    #higher50_20 = [0]*len(price)
    higher50_30 = [0]*len(price)
    #higher50_40 = [0]*len(price)
    higher50_50 = [0]*len(price)
    #higher50_75 = [0]*len(price)
    
    for i in range (0, len(price)-50):
        if price[i+50] <= price[i]:
            lower50[i] = 1
        elif price[i+50] > price[i] and price[i+50] <= price[i]*1.1:
            higher50_0[i] = 1
      #  elif price[i+50] > price[i] * 1.03 and price[i+50] <= price[i]*1.05:
      #      higher50_3[i] = 1
      #  elif price[i+50] > price[i] * 1.05 and price[i+50] <= price[i]*1.1:
      #      higher50_5[i] = 1
        elif price[i+50] > price[i] * 1.1 and price[i+50] <= price[i]*1.3:
            higher50_10[i] = 1
      #  elif price[i+50] > price[i] * 1.2 and price[i+50] <= price[i]*1.3:
      #      higher50_20[i] = 1
        elif price[i+50] > price[i] * 1.3 and price[i+50] <= price[i]*1.5:
            higher50_30[i] = 1
      #  elif price[i+50] > price[i] * 1.4 and price[i+50] <= price[i]*1.5:
      #      higher50_40[i] = 1
        elif price[i+50] > price[i] * 1.5:# and price[i+50] <= price[i]*1.75:
            higher50_50[i] = 1
      #  elif price[i+50] > price[i] * 1.75:
      #      higher50_75[i] = 1
            
    for i in range(len(price)-50, len(price)):
        lower50[i] = float('NaN')
        higher50_0[i] = float('NaN')
        #higher50_3[i] = float('NaN')
        #higher50_5[i] = float('NaN')
        higher50_10[i] = float('NaN')
        #higher50_20[i] = float('NaN')
        higher50_30[i] = float('NaN')
        #higher50_40[i] = float('NaN')
        higher50_50[i] = float('NaN')
        #higher50_75[i] = float('NaN')

    df['Prediction Feature -'] = lower50
    df['Prediction Feature 0'] = higher50_0
    #df['Prediction Feature 3'] = higher50_3
    #df['Prediction Feature 5'] = higher50_5
    df['Prediction Feature 10'] = higher50_10
    #df['Prediction Feature 20'] = higher50_20
    df['Prediction Feature 30'] = higher50_30
    #df['Prediction Feature 40'] = higher50_40
    df['Prediction Feature 50'] = higher50_50
    #df['Prediction Feature 75'] = higher50_75
    
    return df

### Create the Dataframe

In [53]:
def getStockDataframe(stocklist, fromDate, toDate, prediction):

    dflist = []

    for i in range(0, len(stocklist)):
        # get a stock
        teststock = StockDatabase(stocklist[i].sourceSymbol)
        
        # get history for stock fromDate to Date
        step = 1
        dates, data = teststock.getStockHistoryDate('close', fromDate, toDate, step)

        # get data for Moving Average Convergence Divergence
        dataMACD = teststock.MACD(dates, data)
        
        # get 9 day Exponential Moving Average for MACD
        emaMACD = [float('NaN')] * len(dates)
        emaMACD[26:len(dates)] = teststock.ExpAverage(dates, dataMACD[26:len(dates)], 9)

        # get data for Percentage Price Oscillator
        dataPPO = teststock.PPO(dates, data)
        
        # get 9 day EMA for PPO
        emaPPO = [float('NaN')] * len(dates)
        emaPPO[26:len(dates)] = teststock.ExpAverage(dates, dataPPO[26:len(dates)], 9)

        # get bollinger band
        days = 20 # bollinger band based on 20-day Simple Moving Average
        factor = 2 # factor for standard deviation, lowBol = averageBol - 2 * std
        lowBol, averageBol, highBol = teststock.Bollinger(dates, data, days, factor)

        # get exponential moving averages
        ema15 = teststock.ExpAverage(dates, data, 15)
        ema30 = teststock.ExpAverage(dates, data, 30)
        ema50 = teststock.ExpAverage(dates, data, 50)
        ema100 = teststock.ExpAverage(dates, data, 100)
        ema200 = teststock.ExpAverage(dates, data, 200)
        
        # get Money Flow Index (MFI)
        typical, volume, pos_MF, neg_MF, pos_MF14, neg_MF14, MFR14, MFI = teststock.MFI(fromDate, toDate)

        # get fundamentals (Markus)
        compNumber = stocklist[i].fundamentalsCompNumber
        PCFlist, PSRlist, PBlist, EPSDlist, quicklist, currentlist = ([] for i in range(6))
        for date in dates:
            price = teststock.getStockPriceDate('close', date)[1]
            datestr = datetime.datetime.fromtimestamp(date).strftime('%Y-%m-%d')
            shares, capitalization, PCF, PSR, PB, EPSD, quick, current = Analyze_ValueDB.fundamentals(compNumber, datestr, price)
            PCFlist.append(PCF) # price cash flow ratio
            PSRlist.append(PSR) # price sales ratio
            PBlist.append(PB)   # P/B ratio
            EPSDlist.append(EPSD) # EPS (Earnings Per Share) Net Income - Diluted
            quicklist.append(quick) # quick ratio
            currentlist.append(current) # current ratio
            
        
        # create pandas dataframe for all date
        dfdata = {'Stock': stocklist[i], 'Date': dates, 'Price': data, 'EMA 15': ema15, 'EMA 30': ema30, 'EMA 50': ema50, 'EMA 100': ema100, 'EMA 200': ema200, 'MACD': data, 'MACD EMA': emaMACD, 'PPO': dataPPO, 'PPO EMA': emaPPO, 'Bollinger low': lowBol, 'Bollinger': averageBol, 'Bollinger high': highBol, 'MFI': MFI, 'PCF': PCFlist, 'PSR': PSRlist, 'PB': PBlist, 'EPSD': EPSDlist, 'quick': quicklist, 'current': currentlist }
        df = pd.DataFrame(dfdata, columns=['Stock', 'Date', 'Price', 'EMA 15', 'EMA 30', 'EMA 50', 'EMA 100', 'EMA 200', 'MACD', 'MACD EMA', 'PPO', 'PPO EMA', 'Bollinger low', 'Bollinger', 'Bollinger high', 'MFI', 'PCF', 'PSR', 'PB', 'EPSD', 'quick', 'current'])

        df = df.dropna()
        
        if len(df) > 0:
            # create features
            df = createFeatures(df)

            # create prediction features
            if prediction:
                df = predictY(df)

            df = df.dropna()
            dflist.append(df)

    predictionvalues = [-1, 0, 3, 5, 10, 20, 30, 40, 50, 75]
    
    #try:
    return pd.concat(dflist), predictionvalues
    #except:
    #    return pd.concat(dflist)

In [54]:
# create dataframe with stocks
stocklist = []
stocks = Stock.objects.filter(source = 'Quandl')
for i in range(0, 10):#int(3*len(stocks)/4)):
    stocklist.append(stocks[i])

start = datetime.datetime.strptime('2000-01-01', "%Y-%m-%d").timestamp()
end = datetime.datetime.strptime('2015-05-01', "%Y-%m-%d").timestamp()#'2012-04-01', "%Y-%m-%d").timestamp()
stockdf, predictionvalues = getStockDataframe(stocklist, start, end, True) # true for prediction features

stockdf.head(3)

Error. Too many days!
Error. Too many days!


Unnamed: 0,Stock,Date,Price,EMA 15,EMA 30,EMA 50,EMA 100,EMA 200,MACD,MACD EMA,...,Bollinger lower low,MFI strong overbought,MFI overbought,MFI oversold,MFI strong oversold,Prediction Feature -,Prediction Feature 0,Prediction Feature 10,Prediction Feature 30,Prediction Feature 50
1154,"Kloeckner, KCO.F, Frankfurt Stock Exchange",1294009000.0,21.64,21.017408,20.359853,19.562832,18.474622,17.793981,21.64,0.680094,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0
1155,"Kloeckner, KCO.F, Frankfurt Stock Exchange",1294096000.0,21.925,21.130857,20.46083,19.655466,18.542946,17.835086,21.925,0.663021,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
1156,"Kloeckner, KCO.F, Frankfurt Stock Exchange",1294182000.0,21.4,21.1645,20.521422,19.723879,18.599521,17.870558,21.4,0.643108,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0


In [55]:
stockdf.tail(3)

Unnamed: 0,Stock,Date,Price,EMA 15,EMA 30,EMA 50,EMA 100,EMA 200,MACD,MACD EMA,...,Bollinger lower low,MFI strong overbought,MFI overbought,MFI oversold,MFI strong oversold,Prediction Feature -,Prediction Feature 0,Prediction Feature 10,Prediction Feature 30,Prediction Feature 50
3776,"Airbus Group, AIR.F, Frankfurt Stock Exchange",1423782000.0,49.725,48.479533,47.505539,46.94462,46.7738,47.157049,49.725,0.93856,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0
3777,"Airbus Group, AIR.F, Frankfurt Stock Exchange",1424041000.0,50.48,48.729592,47.697439,47.083263,46.84719,47.190114,50.48,0.968273,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0
3778,"Airbus Group, AIR.F, Frankfurt Stock Exchange",1424128000.0,50.69,48.974643,47.890508,47.224703,46.923285,47.224938,50.69,1.002902,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0


In [56]:
len(stockdf)

13214

In [57]:
stockdf['Stock'].unique()

array([<Stock: Kloeckner, KCO.F, Frankfurt Stock Exchange>,
       <Stock: Infineon, IFX.F, Frankfurt Stock Exchange>,
       <Stock: Gea Group, G1A.F, Frankfurt Stock Exchange>,
       <Stock: H&r, 2HR.F, Frankfurt Stock Exchange>,
       <Stock: Amadeus Fire Ag, AAD.F, Frankfurt Stock Exchange>,
       <Stock: Carl Zeiss Meditec, AFK.F, Frankfurt Stock Exchange>,
       <Stock: Adva Optical Networking, ADV.F, Frankfurt Stock Exchange>,
       <Stock: Airbus Group, AIR.F, Frankfurt Stock Exchange>], dtype=object)

In [58]:
start2 = datetime.datetime.strptime('2014-05-01', "%Y-%m-%d").timestamp()
end2 = datetime.datetime.strptime('2015-12-31', "%Y-%m-%d").timestamp()
stockdftest, predictionvalues = getStockDataframe(stocklist, start2, end2, True) 

Error. Too many days!


In [59]:
stockdftest = stockdftest.sort(['Date'])

  if __name__ == '__main__':


In [60]:
stockdftest.head(3)

Unnamed: 0,Stock,Date,Price,EMA 15,EMA 30,EMA 50,EMA 100,EMA 200,MACD,MACD EMA,...,Bollinger lower low,MFI strong overbought,MFI overbought,MFI oversold,MFI strong oversold,Prediction Feature -,Prediction Feature 0,Prediction Feature 10,Prediction Feature 30,Prediction Feature 50
200,"Kloeckner, KCO.F, Frankfurt Stock Exchange",1424041000.0,10.075,9.540316,9.358233,9.298589,9.473178,10.1215,10.075,0.156307,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
200,"Adva Optical Networking, ADV.F, Frankfurt Stoc...",1424041000.0,3.272,3.054405,3.005725,2.975475,2.955255,2.968845,3.272,0.02481,...,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0
200,"Amadeus Fire Ag, AAD.F, Frankfurt Stock Exchange",1424041000.0,67.54,67.162296,65.781407,64.212696,61.777983,58.48595,67.54,1.396833,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0


In [61]:
stockdftest.tail(3)

Unnamed: 0,Stock,Date,Price,EMA 15,EMA 30,EMA 50,EMA 100,EMA 200,MACD,MACD EMA,...,Bollinger lower low,MFI strong overbought,MFI overbought,MFI oversold,MFI strong oversold,Prediction Feature -,Prediction Feature 0,Prediction Feature 10,Prediction Feature 30,Prediction Feature 50
359,"Amadeus Fire Ag, AAD.F, Frankfurt Stock Exchange",1444687000.0,82.97,82.189603,82.595248,82.690251,81.045936,76.095008,82.97,-0.52853,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
360,"Amadeus Fire Ag, AAD.F, Frankfurt Stock Exchange",1445292000.0,81.88,82.150902,82.549103,82.658477,81.062452,76.15257,81.88,-0.492863,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
361,"Amadeus Fire Ag, AAD.F, Frankfurt Stock Exchange",1445378000.0,82.44,82.18704,82.542064,82.649909,81.08973,76.215131,82.44,-0.453739,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0


### Prediction

- Machine Learning begins here

In [62]:
print(stockdf.columns.get_loc('PCF'))
print(stockdf.columns.get_loc('Prediction Feature -'))

16
87


# Continue here with multiclass classification

In [66]:
# use random forest for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

featurestart = stockdf.columns.get_loc('PCF')
featureend = stockdf.columns.get_loc('Prediction Feature -')
score = []
rows, cols = stockdf.shape

# train and test set
#train, test = train_test_split(stockdf, test_size = 0.20)
train = stockdf
test = stockdftest

stockforest = RandomForestClassifier(n_estimators = 100)
stockforest = stockforest.fit(train.values[0::, featurestart:featureend], train.values[0::, featureend:cols].astype(int))
score.append(stockforest.score(test.values[0:50, featurestart:featureend], test.values[0:50, featureend:cols].astype(int)))

score

[0.17999999999999999]

In [37]:
i = 6
prediction = stockforest.predict(test.values[i, featurestart:featureend].reshape(1,-1))
prediction_prob = stockforest.predict_proba(test.values[i, featurestart:featureend].reshape(1,-1))

rows, cols = test.shape
realvalues = test.values[i, featureend:cols]

print(prediction_prob)
print('- 0 3 5 10 20 30 40 50 75')
print(prediction)
print(realvalues)

[array([[ 0.66,  0.34]]), array([[ 0.85,  0.15]]), array([[ 0.93,  0.07]]), array([[ 0.78,  0.22]]), array([[ 0.86,  0.14]]), array([[ 0.98,  0.02]]), array([[ 1.,  0.]]), array([[ 0.98,  0.02]]), array([[ 0.96,  0.04]]), array([[ 1.,  0.]])]
- 0 3 5 10 20 30 40 50 75
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
[1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]


In [None]:
prediction_prob[0][0][1]

In [None]:
# training of models:
def trainmodels(stockdf):
    
    featurecol = stockdf.columns.get_loc('Prediction Feature -')
    stockforest = RandomForestClassifier(n_estimators = 100)
    stockforest = stockforest.fit(train.values[0::, featurestart:featureend], train.values[0::, featureend:cols].astype(int))
    score.append(stockforest.score(test.values[0::, featurestart:featureend], test.values[0::, featureend:cols].astype(int)))
    
    return stockforest

In [None]:
# final training of models:
stockforest = trainmodels(stockdf) 

### Trading System

- implement simple trading system to test prediction

In [None]:
# create depot as test.
from django.contrib.auth.models import User

depotname = 'PredictionTest1'
user = User.objects.get(username='oliver')

try:
    stockDepot.deleteDepot(user, depotname)
except:
    pass

value = 10000
depot = stockDepot.createDepot(user, depotname, value)
#print (depot)
depotcontent_total, balance, depotvalue, available, change = stockDepot.depotAnalysis(depot)
print(depotcontent_total, balance, depotvalue, available, change)

In [None]:
# Import stocks for prediction:
# data until 2012 was used to train algorithm
# data from 2012 to 2013 will be used to calculate accuracy
#start1 = datetime.datetime.strptime('2011-05-01', "%Y-%m-%d")
#end1 = datetime.datetime.strptime('2013-04-01', "%Y-%m-%d")
# data from 2013 until today will be used for test depot / trading system


start2 = datetime.datetime.strptime('2011-05-01', "%Y-%m-%d").timestamp()
today = datetime.datetime.now().timestamp()
#stockdfAccuracy = [] # used to check accuracy for each stock
stockdfDepotList1 = [] # used to run trading system

for i in range(0, len(stocklist)):
    print (i/(len(stocklist)-1)*100)
    # create dataframe for trading system
    # false so that no prediction features are created
    # possibly try except needed here to catch empty dataframes
    try:
        stockdfDepot, predictionvalues = getStockDataframe([stocklist[i]], start2, today, False)
        stockdfDepotList1.append(stockdfDepot)
    except:
        print("Stock missing")

#### Buy or Sell

In [None]:
def buyorsell(depot, probabilities, date):
    depotcontent_total, balance, depotvalue, available, change = stockDepot.depotAnalysisDate(depot, date)

    maxperstock = depotvalue / 4 # maximum amount to be spent on one stock
    mintrans = 1000 # minimum amount to be bought for
    fee = 4.9
    
    for stockprob in sorted(probabilities, key=lambda x: x['Predictionrange'], reverse = True): # go through sorted list
        # sorted to have stocks with biggest probability in front (for 50 days higher / lower)
        
        stockid= stockprob['Stock'].id
        # buy at day's high
        datatype = 'close'
        current_price = stockDepot.getStockPriceDate(stockid, datatype, date)
        # no trade if no price available or accuracy of prediction too low
        if current_price == 0:
            continue
        
        if available >= mintrans + fee and stockprob['Maximum Probability'] > 0.4 and stockprob['Predictionrange'] >= 3 and expectation >= 0.7: 
            amounttobuy = int(maxperstock/current_price) # buy maximum allowed if possible
            for content in depotcontent_total: # find if stock is already bought
                if content.stock == stockprob['Stock']:
                    amounttobuy = maxperstock - int(content.current_total/current_price) # reduce maximum amount by already available
                    break
            if amounttobuy > int(mintrans/current_price) and available >= amounttobuy * current_price + fee:
                #buy at day's high
                print(stockprob['Stock'])
                print(amounttobuy)
                print('bought at: ' + str(current_price))
                print('on: ' + datetime.datetime.fromtimestamp(date).strftime('%Y-%m-%d'))
                stockDepot.buyStockDate(depot, stockid, amounttobuy, datatype, fee, date)
                available = available - amounttobuy*current_price - fee
    
    # sell
    # stop loss at x %
    # go through all stocks and check
    for content in depotcontent_total:
        datatype = 'close'
        stockid = content.stock.id
        current_price = stockDepot.getStockPriceDate(stockid, datatype, date)
        if content.amount > 0:
            if current_price > content.maxSinceBought:
                stockDepot.changeStockMax(depot, content.stock.id, current_price)
            if current_price < content.maxSinceBought * 0.97:
                amounttosell = content.amount #int(content.current_total/current_price)
                print(content.stock)
                print(amounttosell)
                print('sold at: ' + str(current_price))
                print('on: ' + datetime.datetime.fromtimestamp(date).strftime('%Y-%m-%d'))
                print('max since bought: ' + str(content.maxSinceBought))
                stockDepot.sellStockDate(depot, stockid, amounttosell, datatype, fee, date)       
        
        # sell at day's low
        #datatype = 'close'
        #current_price = stockDepot.getStockPriceDate(stockid, datatype, date)
        #if (stockprob['Proba50'] < 0.6 and stockprob['Proba20'] < 0.6 and stockprob['Proba5'] < 0.5) or stockprob['Proba5'] < 0.3: # condition for selling
        #    amounttosell = 0
        #    for content in depotcontent_total: # find if stock was bought
        #        if content.stock == stockprob['Stock']:
        #            amounttosell = int(content.current_total/current_price)
        #            break
        #    if amounttosell > 0:
        #        # sell at day's low
        #        print(stockprob['Stock'])
        #        print('sold at: ' + str(current_price))
        #        stockDepot.sellStockDate(depot, stockid, amounttosell, datatype, fee, date)

#### Reduce probabilities and scoring system

In [None]:
def scoring(probability):
    probas = []
    for i in range(0, len(probability)):
        probas.append(probability[i][0][1])
    probmax = max(probas)
    indexmax = probas.index(max(probas))
    predictionrange = predictionvalues[indexmax]
    
    probabig0 = sum(probas[2::])
        
    return probmax, predictionrange, probabig0

#### Start test

In [None]:
import gc
gc.collect()

# throw out stocks with missing values:
stockdfDepotList = []
for df in stockdfDepotList1:
    if df.shape == df.dropna().shape:
        stockdfDepotList.append(df)

featurestart = stockdfDepotList[0].columns.get_loc('PCF')

# find longest dataframe:
days = 0
index = 0
for i in range(0, len(stockdfDepotList)):
    if len(stockdfDepotList[i]) > days:
        days = len(stockdfDepotList[i])
        index = i

# go through dataframe
for i in range(0, days): # for each day
    probabilities = []
    date = stockdfDepotList[index].values[i, 1] # date from longest dataframe
    stocknb = 0
    for j in range(0, len(stockdfDepotList)): # for each stock
        stock = stockdfDepotList[j]
        indexdf = stock[stock['Date'] == date].index.tolist()
        if(len(stock) <= i or indexdf == []): #or stock.values[indexdf:indexdf+1, featurestart::] == []):
            probabilities.append({'Stock': stock['Stock'].iloc[0], 'Maximum Probability': 0, 'Predictionrange': -1, 'Expectation': 0, 'Date': 0})
        else:
            stocknb = stocknb + 1
            indexdf = indexdf[0]
            probabs = stockforest.predict_proba(stock.loc[indexdf, :].values[featurestart::].reshape(1,-1))
            #print(probabs)
            probmax, predictionrange, expectation = scoring(probabs)
            #print(probmax)
            #print(predictionrange)
            #print(expectation)
            probabilities.append({'Stock': stock['Stock'].iloc[0], 'Maximum Probability': probmax, 'Predictionrange': predictionrange, 'Expectation': expectation, 'Date': stockdfDepotList[j].values[i,1]})
    buyorsell(depot, probabilities, date)

    depotcontent_total, balance, depotvalue, available, change = stockDepot.depotAnalysisDate(depot, date)
    if (i%10 == 0):
        print(stocknb)
        print(datetime.datetime.fromtimestamp(date).strftime('%Y-%m-%d'))
        print(depotcontent_total)
        print(i)

    #if (i%50 == 0 and i != 0):
    #    # recalculate prediction models
    #    start = datetime.datetime.strptime('2000-01-01', "%Y-%m-%d").timestamp()
    #    end = date
    #    modeldf = getStockDataframe(stocklist, start, end, True)
    #    gc.collect()
    #    stockforest5, stockforest20, stockforest50 = trainmodels(modeldf)
    print(balance, depotvalue, available, change)


In [None]:
%reset