In [None]:
import pandas as pd
import xgboost as xgb
import datetime as dt
import os
import numpy as np
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
from matplotlib.pyplot import figure
from matplotlib.pyplot import suptitle
from sklearn.model_selection import train_test_split, GridSearchCV
import yfinance
from statsmodels.tsa.seasonal import STL
from dateutil.relativedelta import relativedelta
import pandas_datareader as web
from plotly.offline import plot

%matplotlib inline


In [None]:
def createcsv(stocks=("AAPL", "GOOG", "BRK-A", "MSFT", "V"), years=10,
              data_dir="/Users/krtinjain/Desktop/EE-590/"):
    """Download daily price history for each ticker and write ``<ticker>.csv``.

    Parameters
    ----------
    stocks : iterable of str
        Ticker symbols to fetch.
    years : int
        Length of the look-back window, counted back from the current time.
    data_dir : str
        Folder the CSV files are written to. The default is the original
        hard-coded location; pass another path when running elsewhere.
    """
    # One timestamp for both ends keeps the window exactly `years` long.
    end = dt.datetime.now()
    start = end - relativedelta(years=years)
    os.makedirs(data_dir, exist_ok=True)
    for ticker in stocks:
        data = web.DataReader(ticker, 'yahoo', start, end)
        data.to_csv(os.path.join(data_dir, ticker + ".csv"))

In [None]:
def upar(mu):
    """Return the sum of the strictly positive entries of ``mu``.

    Non-positive entries are multiplied by ``False`` (0), so they contribute
    zero to the total.
    """
    is_positive = mu > 0
    return (mu * is_positive).sum()

def nee(mu):
    """Return the sum of the strictly negative entries of ``mu``.

    Non-negative entries are multiplied by ``False`` (0), so they contribute
    zero; the result is therefore zero or negative.
    """
    is_negative = mu < 0
    return (mu * is_negative).sum()

def readcsv(Name, data_dir="/Users/krtinjain/Desktop/EE-590/", min_year=2010):
    """Load ``<Name>.csv`` and keep the rows dated ``min_year`` or later.

    Parameters
    ----------
    Name : str
        File stem of the CSV (the ticker symbol in this notebook).
    data_dir : str
        Folder containing the CSV. The default is the original hard-coded
        location; pass another path when running elsewhere.
    min_year : int
        Rows whose ``Date`` falls in an earlier calendar year are dropped.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with ``Date`` parsed to datetimes and a fresh
        0..n-1 integer index.
    """
    data = pd.read_csv(os.path.join(data_dir, Name + ".csv"))
    data['Date'] = pd.to_datetime(data['Date'])
    recent = data[data['Date'].dt.year >= min_year].copy()
    return recent.reset_index(drop=True)

In [None]:

def OHLC(data, i):
    """Plot Close/Open/High/Low against ``Date`` and save the chart as a PNG.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``Date``, ``Open``, ``High``, ``Low`` and ``Close``.
    i : str
        Ticker symbol; prefix of the saved ``<i>_OHLC.png`` file.
    """
    # Create the figure once via subplots(); a separate plt.figure() call
    # would leave an empty, never-closed figure behind.
    fig, ax = plt.subplots()
    dates = data.Date
    for column in ('Close', 'Open', 'High', 'Low'):
        ax.plot(dates, data[column], label=column)
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.set_title("Open High Low Close Graph", fontsize=20)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels)
    fig.set_size_inches(10, 10)
    # Keep the ticker in `i` intact; build the output path in its own name.
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_OHLC.png'
    fig.savefig(out_path, dpi=150)


In [None]:
def decomp(data, i):
    """Run an STL decomposition of ``Close`` (period 365) and save its plot.

    ``data`` must provide ``Date`` and ``Close``; ``i`` is the ticker symbol
    used as the prefix of the saved ``<i>_decomp.png`` file.
    """
    close_by_date = data[['Date', 'Close']].copy().set_index('Date')
    result = STL(close_by_date, period=365).fit()
    fig = result.plot()
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_decomp.png'
    fig.savefig(out_path, dpi=150)
    fig.show()

In [None]:
def MA(data, i):
    """Add lagged moving-average columns to ``data`` and save a chart of them.

    Adds ``SMA_30``, ``SMA_5``, ``SMA_15``, ``SMA_10`` (simple moving averages
    of ``Close``) and ``EMA_9`` (exponential moving average of ``Close``).
    Each series is shifted down one row, so the value stored on a row is
    computed from the closes of earlier rows only.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``Date`` and ``Close`` columns; modified in place.
    i : str
        Ticker symbol; prefix of the saved ``<i>_MA.png`` file.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying the five new columns.
    """
    close = data['Close']
    for window in (30, 5, 15, 10):
        data['SMA_' + str(window)] = close.rolling(window).mean().shift()
    # span=9 yields the 9-period EMA the column name promises. A bare
    # positional ewm(9) is interpreted as com=9, a much slower average.
    data['EMA_9'] = close.ewm(span=9).mean().shift()

    # Create the figure once via subplots(); a separate plt.figure() call
    # would leave an empty, never-closed figure behind.
    fig, ax = plt.subplots()
    dates = data.Date
    series_to_plot = (
        ('EMA_9', 'EMA 9'),
        ('SMA_5', 'SMA 5'),
        ('SMA_10', 'SMA 10'),
        ('SMA_15', 'SMA 15'),
        ('SMA_30', 'SMA 30'),
    )
    for column, label in series_to_plot:
        ax.plot(dates, data[column], label=label)
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.set_title("Moving Average Graph", fontsize=20)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels)
    fig.set_size_inches(10, 10)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_MA.png'
    fig.savefig(out_path, dpi=150)

    return data

In [None]:
def rsindex(data, i, n=14):
    """Add an ``RSI`` column (simple-moving-average variant) and save its chart.

    The relative strength is the ``n``-row rolling mean of the gains divided
    by the ``n``-row rolling mean of the absolute losses of ``Close``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``Date`` and ``Close`` columns; modified in place.
    i : str
        Ticker symbol; prefix of the saved ``<i>_RSI.png`` file.
    n : int
        Rolling window length.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the ``RSI`` column added. Rows where the
        RSI is undefined (the warm-up rows, or windows with no price change)
        hold 0.
    """
    delta = data['Close'].diff()
    gains = delta.clip(lower=0)
    losses = delta.clip(upper=0).abs()
    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.rolling(n).mean()
    rs = avg_gain / avg_loss
    rsi = 100.0 - (100.0 / (1.0 + rs))
    # The diff is kept at full length so every row, including the first, gets
    # the same fill value instead of a stray NaN from index alignment.
    data["RSI"] = rsi.fillna(0)

    # Create the figure once via subplots(); a separate plt.figure() call
    # would leave an empty, never-closed figure behind.
    fig, ax = plt.subplots()
    ax.plot(data.Date, data["RSI"], label='Relative Strength Index')
    ax.set_xlabel('Date')
    ax.set_ylabel('RSI')
    ax.set_title("Relative Strength Index", fontsize=20)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels)
    fig.set_size_inches(10, 10)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_RSI.png'
    fig.savefig(out_path, dpi=150)
    return data

In [None]:
def MACD(data, i):
    """Add ``MACD`` and ``MACD_signal`` columns and save a two-panel chart.

    ``MACD`` is the 12-span EMA of ``Close`` minus the 26-span EMA;
    ``MACD_signal`` is the 9-span EMA of ``MACD``. Each EMA stays NaN until
    it has seen ``span`` observations (``min_periods``).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``Date`` and ``Close`` columns; modified in place.
    i : str
        Ticker symbol; prefix of the saved ``<i>_MACD.png`` file.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the two new columns.
    """
    close = data['Close']
    EMA_12 = close.ewm(span=12, min_periods=12).mean()
    EMA_26 = close.ewm(span=26, min_periods=26).mean()
    data['MACD'] = EMA_12 - EMA_26
    data['MACD_signal'] = data['MACD'].ewm(span=9, min_periods=9).mean()

    # Build the figure once and title *this* figure; a title set on a
    # separate plt.figure() would never reach the saved chart.
    fig, (price_ax, macd_ax) = plt.subplots(2, figsize=(10, 10))
    fig.suptitle("MACD Graph", fontsize=20)
    dates = data.Date
    price_ax.plot(dates, data.Close, label='Close')
    price_ax.plot(dates, EMA_12, label='EMA 12')
    price_ax.plot(dates, EMA_26, label='EMA 26')
    macd_ax.plot(dates, data['MACD'], label='MACD')
    macd_ax.plot(dates, data['MACD_signal'], label='Signal')
    price_ax.set_xlabel('Date')
    price_ax.set_ylabel('Price')
    price_ax.set_title("Close, EMA 12, EMA 26", fontsize=10)
    macd_ax.set_title("MACD and MACD Signal", fontsize=10)
    # Keep both panels inside the canvas (leaving room for the suptitle) so
    # savefig() captures them; subplots_adjust(top=2) placed them outside it.
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    handles, labels = [], []
    for axis in fig.axes:
        axis_handles, axis_labels = axis.get_legend_handles_labels()
        handles.extend(axis_handles)
        labels.extend(axis_labels)
    fig.legend(handles, labels)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_MACD.png'
    fig.savefig(out_path, dpi=150)

    return data

In [None]:

def stoch_osc(data, i):
    """Add stochastic-oscillator columns and save a three-panel chart.

    Adds ``14-high`` / ``14-low`` (14-row rolling max of ``High`` / min of
    ``Low``), ``%K`` (position of ``Close`` inside that range, scaled by 100)
    and ``%D`` (3-row rolling mean of ``%K``).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``Date``, ``High``, ``Low`` and ``Close``; modified in place.
    i : str
        Ticker symbol; prefix of the saved ``<i>_STOCH.png`` file.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the four new columns.
    """
    data['14-high'] = data['High'].rolling(14).max()
    data['14-low'] = data['Low'].rolling(14).min()
    price_range = data['14-high'] - data['14-low']
    data['%K'] = (data['Close'] - data['14-low']) * 100 / price_range
    data['%D'] = data['%K'].rolling(3).mean()

    # Build the figure once and title *this* figure; a title set on a
    # separate plt.figure() would never reach the saved chart.
    fig, axes = plt.subplots(3, figsize=(10, 10))
    fig.suptitle("Stochastic Oscillator", fontsize=20)
    dates = data.Date
    panels = (
        (data.Close, 'Close', 'red'),
        (data['%K'], '%K', 'blue'),
        (data['%D'], '%D', 'green'),
    )
    for axis, (series, label, color) in zip(axes, panels):
        axis.plot(dates, series, label=label, color=color)
        axis.set_xlabel('Date')
        axis.set_title(label, fontsize=10)
    axes[0].set_ylabel('Price')
    # Keep all panels inside the canvas (leaving room for the suptitle) so
    # savefig() captures them; subplots_adjust(top=3) placed them outside it.
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    handles, labels = [], []
    for axis in fig.axes:
        axis_handles, axis_labels = axis.get_legend_handles_labels()
        handles.extend(axis_handles)
        labels.extend(axis_labels)
    fig.legend(handles, labels)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_STOCH.png'
    fig.savefig(out_path, dpi=150)
    return data

In [None]:

def mfi(df, st, n=14):
    """Add a Money Flow Index column (``MFI``) and save its chart.

    The typical price ``(High + Low + Close) / 3`` times ``Volume`` gives the
    raw money flow, signed +1 when the typical price rose versus the previous
    row and -1 otherwise. Over each ``n``-row window the positive and negative
    flows are summed and combined as ``100 - 100 / (1 + pos / |neg|)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Date``, ``High``, ``Low``, ``Close`` and ``Volume``; modified
        in place.
    st : str
        Ticker symbol; prefix of the saved ``<st>_MFI.png`` file.
    n : int
        Rolling window length.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``MFI`` column added.
    """
    typical_price = (df['High'] + df['Low'] + df['Close']) / 3
    money_flow = typical_price * df['Volume']
    direction = np.where(typical_price > typical_price.shift(1), 1, -1)
    signed_flow = money_flow * direction
    negative_flow = signed_flow.rolling(n).apply(nee, raw=True)
    positive_flow = signed_flow.rolling(n).apply(upar, raw=True)
    ratio = positive_flow / abs(negative_flow)
    # Parentheses balanced: the whole index expression is converted to an
    # array before assignment (the original line did not parse).
    df['MFI'] = (100 - (100 / (1 + ratio))).to_numpy()

    # Create the figure once via subplots(); a separate plt.figure() call
    # would leave an empty, never-closed figure behind.
    fig, ax = plt.subplots()
    ax.plot(df.Date, df['MFI'], label='Money Flow Index')
    ax.set_xlabel('Date')
    ax.set_ylabel('MFI')
    ax.set_title("Money Flow Index", fontsize=20)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels)
    fig.set_size_inches(10, 10)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + st + '_MFI.png'
    fig.savefig(out_path, dpi=150)
    return df


In [None]:
def feature_engineering(data, st):
    """Build the next-row-close target and split into train/validation/test.

    ``Close`` is shifted up one row so each row's target is the following
    row's close. The first 35 rows and the last row (which has no following
    close) are dropped, the index is reset to 0..n-1, and the frame is split
    chronologically: the last 2.5% of rows form the test set, the 2.5% before
    them the validation set, and the rest the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Price frame with the indicator columns already added. It is not
        modified; the function works on a copy so re-running is safe.
    st : str
        Ticker symbol; prefix of the saved ``<st>_split.png`` file.

    Returns
    -------
    tuple
        ``(data, y_train, X_train, y_valid, X_valid, y_test, X_test, test_idx)``
        where ``data`` is the trimmed, re-indexed frame and the test rows are
        ``data.loc[test_idx + 1:]``.
    """
    # assign() returns a new frame, so the caller's Close column is left
    # alone. NOTE(review): 35 presumably skips the indicator warm-up rows.
    data = (
        data.assign(Close=data['Close'].shift(-1))
        .iloc[35:-1]
        .reset_index(drop=True)
    )

    test_size = 0.025
    valid_size = 0.025
    test_idx = int(data.shape[0] * (1 - test_size))
    valid_idx = int(data.shape[0] * (1 - (valid_size + test_size)))
    # .loc slices are inclusive on both ends, hence the +1 offsets.
    train_data = data.loc[:valid_idx].copy()
    valid_data = data.loc[valid_idx + 1:test_idx].copy()
    test_data = data.loc[test_idx + 1:].copy()

    # Create the figure once via subplots(); a separate plt.figure() call
    # would leave an empty, never-closed figure behind.
    fig, ax = plt.subplots()
    for part, label in ((train_data, 'Train'),
                        (valid_data, 'Validation'),
                        (test_data, 'Test')):
        ax.plot(part.Date, part.Close, label=label)
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.set_title("Train Validation Test Split", fontsize=20)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels)
    fig.set_size_inches(10, 10)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + st + '_split.png'
    fig.savefig(out_path, dpi=150)

    # drop(columns=...) instead of a positional axis, which pandas 2.0
    # no longer accepts.
    drp = ['Date', 'Volume', 'Open', 'Low', 'High', 'Adj Close']
    train_data = train_data.drop(columns=drp)
    valid_data = valid_data.drop(columns=drp)
    test_data = test_data.drop(columns=drp)

    y_train = train_data['Close'].copy()
    X_train = train_data.drop(columns=['Close'])
    y_valid = valid_data['Close'].copy()
    X_valid = valid_data.drop(columns=['Close'])
    y_test = test_data['Close'].copy()
    X_test = test_data.drop(columns=['Close'])

    return data, y_train, X_train, y_valid, X_valid, y_test, X_test, test_idx

In [None]:
def auto_tune_xgb(df, X_train, y_train, X_valid, y_valid):
    """Grid-search XGBoost regressor hyper-parameters on the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; kept so existing calls keep working.
    X_train, y_train
        Training features and target handed to ``GridSearchCV.fit``.
    X_valid, y_valid
        Validation features and target; only packaged into ``eval_set``.

    Returns
    -------
    tuple
        ``(clf, eval_set)``: the fitted ``GridSearchCV`` object and the
        ``[(X_train, y_train), (X_valid, y_valid)]`` list for a later
        ``fit(..., eval_set=eval_set)`` call.
    """
    parameters = {
        'n_estimators': [100, 250, 400, 550, 700],
        'learning_rate': [0.025, 0.05, 0.1, 0.2],
        'max_depth': [8, 9, 10, 11],
        'gamma': [0.005, 0.01, 0.05, 0.1],
        'random_state': [42],
        'min_child_weight': [2, 3, 4],
        'subsample': [0.8, 1],
        'colsample_bytree': [1],
        'colsample_bylevel': [1]
    }
    eval_set = [(X_train, y_train), (X_valid, y_valid)]
    # eval_set is an argument of fit(), not of the estimator constructor;
    # passing it there would forward it to the booster as a bogus parameter.
    model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist')
    # NOTE(review): GridSearchCV's default splitter is not time-aware, and
    # these rows are chronological. Consider cv=TimeSeriesSplit().
    clf = GridSearchCV(model, parameters)
    clf.fit(X_train, y_train)
    print(f'Best params: {clf.best_params_}')
    print(f'Best validation score = {clf.best_score_}')
    return clf, eval_set


In [None]:

def hand_tune_xgb(df, X_train, y_train, X_valid, y_valid):
    """Fit a ``GridSearchCV`` over one fixed, hand-picked parameter set.

    Each grid entry holds a single value, so the search evaluates exactly one
    configuration. The return shape matches ``auto_tune_xgb``.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; kept so existing calls keep working.
    X_train, y_train
        Training features and target handed to ``GridSearchCV.fit``.
    X_valid, y_valid
        Validation features and target; only packaged into ``eval_set``.

    Returns
    -------
    tuple
        ``(clf, eval_set)``: the fitted ``GridSearchCV`` object and the
        ``[(X_train, y_train), (X_valid, y_valid)]`` list.
    """
    parameters = {
        'n_estimators': [100],
        'learning_rate': [0.1],
        'max_depth': [9],
        'gamma': [0.1],
        'random_state': [42],
        'min_child_weight': [2],
        'subsample': [1]
    }
    eval_set = [(X_train, y_train), (X_valid, y_valid)]
    # eval_set is an argument of fit(), not of the estimator constructor;
    # passing it there would forward it to the booster as a bogus parameter.
    model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist')
    clf = GridSearchCV(model, parameters)
    clf.fit(X_train, y_train)
    return clf, eval_set

In [None]:
def pred(clf, eval_set, X_train, y_train, X_test, y_test):
    """Refit XGBoost with the best searched parameters and predict the test set.

    Parameters
    ----------
    clf
        Fitted search object exposing ``best_params_``.
    eval_set : list
        ``(X, y)`` pairs forwarded to ``fit`` for evaluation tracking.
    X_train, y_train
        Data the final model is trained on.
    X_test, y_test
        Held-out data; ``y_test`` is used only for the printed summary.

    Returns
    -------
    tuple
        ``(y_pred, model)``: predictions for ``X_test`` and the fitted model.
    """
    # Use the same tree_method as the estimator the parameters were tuned
    # with, so the refit model matches the configuration that was scored.
    model = xgb.XGBRegressor(**clf.best_params_,
                             objective='reg:squarederror',
                             tree_method='hist')
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    y_pred = model.predict(X_test)
    print(f'y_true = {np.array(y_test)[:5]}')
    print(f'y_pred = {y_pred[:5]}')
    print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')
    return y_pred, model


In [None]:

def plot_pred(df, y_pred, y_test, test_split_idx, i):
    """Plot true versus predicted prices over the test period and save the chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column; rows from ``test_split_idx + 1`` onward
        supply the x-axis dates.
    y_pred, y_test
        Predicted and true values, one per test row.
    test_split_idx : int
        Index label of the last row before the test period.
    i : str
        Ticker symbol; prefix of the saved ``<i>_Prediction.png`` file.
    """
    test_dates = df.loc[test_split_idx + 1:, 'Date']
    # Create the figure once via subplots(); a separate plt.figure() call
    # would leave an empty, never-closed figure behind.
    fig, ax = plt.subplots()
    ax.plot(test_dates, y_test, label='True')
    ax.plot(test_dates, y_pred, label='Prediction')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.set_title("Predicted Values and True Values", fontsize=20)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels)
    fig.set_size_inches(10, 10)
    out_path = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i + '_Prediction.png'
    fig.savefig(out_path, dpi=150)
    

In [None]:
if __name__ == '__main__':
    # Download and (over)write the per-ticker CSV files before they are read.
    createcsv()

    # AAPL: load the prices, save the exploratory charts, add the indicator
    # columns one by one, then build the train/validation/test split.
    # `i` holds the ticker and is reused by the following cell.
    i = "AAPL"
    df = readcsv(i)
    OHLC(df, i)
    decomp(df, i)
    df = MA(df, i)
    df = rsindex(df, i)
    df = MACD(df, i)
    df = stoch_osc(df, i)
    df = mfi(df, i)
    (df, y_train, X_train, y_valid, X_valid, y_test, X_test, test_split_idx) = feature_engineering(df, i)
    


In [None]:
    # Uses i, df and the split variables set by the preceding cell.
    # Grid-search the hyper-parameters, refit with the best set, then save the
    # model (JSON), the feature-importance chart and the prediction chart.
    (clf, eval_set) = auto_tune_xgb(df, X_train, y_train, X_valid, y_valid)
    (y_pred,model) = pred(clf, eval_set, X_train, y_train, X_test, y_test)
    h = '/Users/krtinjain/Desktop/EE-590/Models/' + i +'_bestmodel.json'
    model.save_model(h)
    ax = plot_importance(model)
    j = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i +'_Importance.png'
    ax.figure.savefig(j, dpi=150)
    #plot_importance(model);
    plot_pred(df, y_pred, y_test,test_split_idx, i)

In [None]:
    # GOOG: same preparation pipeline as above. This overwrites i, df and the
    # split variables left by the previous ticker.
    i = "GOOG"
    df = readcsv(i)
    OHLC(df, i)
    decomp(df, i)
    df = MA(df, i)
    df = rsindex(df, i)
    df = MACD(df, i)
    df = stoch_osc(df, i)
    df = mfi(df, i)
    (df, y_train, X_train, y_valid, X_valid, y_test, X_test, test_split_idx) = feature_engineering(df, i)
    

In [None]:
    # Uses i, df and the split variables set by the preceding cell.
    # Grid-search, refit, then save the model, importance chart and
    # prediction chart for this ticker.
    (clf, eval_set) = auto_tune_xgb(df, X_train, y_train, X_valid, y_valid)
    
    (y_pred,model) = pred(clf, eval_set, X_train, y_train, X_test, y_test)
    h = '/Users/krtinjain/Desktop/EE-590/Models/' + i +'_bestmodel.json'
    model.save_model(h)
    ax = plot_importance(model)
    j = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i +'_Importance.png'
    ax.figure.savefig(j, dpi=150)
    #plot_importance(model);
    plot_pred(df, y_pred, y_test,test_split_idx, i)

In [None]:
    # BRK-A: same preparation pipeline as above. This overwrites i, df and
    # the split variables left by the previous ticker.
    i = "BRK-A"
    df = readcsv(i)
    OHLC(df, i)
    decomp(df, i)
    df = MA(df, i)
    df = rsindex(df, i)
    df = MACD(df, i)
    df = stoch_osc(df, i)
    df = mfi(df, i)
    (df, y_train, X_train, y_valid, X_valid, y_test, X_test, test_split_idx) = feature_engineering(df, i)

In [None]:
    # Uses i, df and the split variables set by the preceding cell.
    # Grid-search, refit, then save the model, importance chart and
    # prediction chart for this ticker.
    (clf, eval_set) = auto_tune_xgb(df, X_train, y_train, X_valid, y_valid)
    (y_pred,model) = pred(clf, eval_set, X_train, y_train, X_test, y_test)
    h = '/Users/krtinjain/Desktop/EE-590/Models/' + i +'_bestmodel.json'
    model.save_model(h)
    ax = plot_importance(model)
    j = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i +'_Importance.png'
    ax.figure.savefig(j, dpi=150)
    #plot_importance(model);
    plot_pred(df, y_pred, y_test,test_split_idx, i)

In [None]:
    # MSFT: same preparation pipeline as above. This overwrites i, df and the
    # split variables left by the previous ticker.
    i = "MSFT"
    df = readcsv(i)
    OHLC(df, i)
    decomp(df, i)
    df = MA(df, i)
    df = rsindex(df, i)
    df = MACD(df, i)
    df = stoch_osc(df, i)
    df = mfi(df, i)
    (df, y_train, X_train, y_valid, X_valid, y_test, X_test, test_split_idx) = feature_engineering(df, i)

In [None]:
    # Uses i, df and the split variables set by the preceding cell.
    # Grid-search, refit, then save the model, importance chart and
    # prediction chart for this ticker.
    (clf, eval_set) = auto_tune_xgb(df, X_train, y_train, X_valid, y_valid)
    
    (y_pred,model) = pred(clf, eval_set, X_train, y_train, X_test, y_test)
    h = '/Users/krtinjain/Desktop/EE-590/Models/' + i +'_bestmodel.json'
    model.save_model(h)
    ax = plot_importance(model)
    j = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i +'_Importance.png'
    ax.figure.savefig(j, dpi=150)
    #plot_importance(model);
    plot_pred(df, y_pred, y_test,test_split_idx, i)

In [None]:
    # V: same preparation pipeline as above. This overwrites i, df and the
    # split variables left by the previous ticker.
    i = "V"
    df = readcsv(i)
    OHLC(df, i)
    decomp(df, i)
    df = MA(df, i)
    df = rsindex(df, i)
    df = MACD(df, i)
    df = stoch_osc(df, i)
    df = mfi(df, i)
    (df, y_train, X_train, y_valid, X_valid, y_test, X_test, test_split_idx) = feature_engineering(df, i)

In [None]:
    # Uses i, df and the split variables set by the preceding cell.
    # Grid-search, refit, then save the model, importance chart and
    # prediction chart for this ticker.
    (clf, eval_set) = auto_tune_xgb(df, X_train, y_train, X_valid, y_valid)
    
    (y_pred,model) = pred(clf, eval_set, X_train, y_train, X_test, y_test)
    h = '/Users/krtinjain/Desktop/EE-590/Models/' + i +'_bestmodel.json'
    model.save_model(h)
    ax = plot_importance(model)
    j = '/Users/krtinjain/Desktop/EE-590/Graphs/' + i +'_Importance.png'
    ax.figure.savefig(j, dpi=150)
    #plot_importance(model);
    plot_pred(df, y_pred, y_test,test_split_idx, i)