In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

from ta import add_all_ta_features
from ta.utils import dropna

import yfinance as yf

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, SelectPercentile
from sklearn.decomposition import PCA

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.metrics import accuracy_score, r2_score

import warnings
warnings.filterwarnings('ignore')

# Importing data and processing it using TA lib

In [2]:
#Returns X and y data in dataframe form processed and with TA lib features
#time is 10 years
def process_data(stock_name):
    start = datetime.datetime(2020,4,6)
    end = datetime.datetime(2022,4,6)
    stock = yf.download(stock_name,start,end)
    
    df = add_all_ta_features(
        stock, open="Open", high="High", low="Low", close="Close", volume="Volume")
    
    df.drop(columns = ["trend_psar_up","trend_psar_down"], inplace = True)
    df.dropna(inplace = True)
    
    X = df.drop("Close",axis = 1)
    y = df["Close"]

    
    return X, y

# K- Feature Selector

In [3]:
def K_best_selector(X, y):
    select = SelectKBest(f_regression, k = 25)
    select.fit_transform(X, y)
    cols = select.get_support(indices=True)
    X = X.iloc[:,cols]
    return X,y

# PCA features

In [4]:
def pca_selector(X, y):    

    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, shuffle = None)
    
    def get_variance(comp_count, train, test):
      pca = PCA(n_components=comp_count)
      train = pca.fit_transform(train)
      test = pca.transform(test)

      return pca.explained_variance_ratio_, train, test, pca

    components = 1
    explained_variance = 0
    while np.sum(explained_variance) < .95:

        explained_variance, X_train_fit, X_test_fit, pca = get_variance(components, X_train, X_test)

        components += 1

    # Reformats the principal components into a usable dictionary
    vals = np.array(pca.components_).T
    #print(vals)
    pc_lst = [f'PC{num+1}' for num in range(len(vals.T))]
    data = pd.DataFrame(vals, columns=pc_lst, index=X.columns)
    # Creates an empty dictionary to store feature importance
    feature_importance = {feature:0 for feature in X.columns}

    # Populates dictionary with their principal component correspondants
    for key in feature_importance.keys():
      feature_importance[key] = sum(data.loc[key])

    # Creates an ordered list of the importance values from the dictionary
    features_ordered = list(feature_importance.items())
    features_ordered = sorted(features_ordered, key = lambda x: x[1])[::-1]

    # Creates a list of bar heights and labels for each val above a threshold
    thresh  = 0.3
    height = [val for name, val in features_ordered if val >= thresh]
    x = [name for name, val in features_ordered][:len(height)]
    X = X.loc[:,x]
    print(X)
    print(y)
    return X, y

# Lasso Regression feature selection

In [30]:
# define the lasso regression class
class LassoRegression() :

  # define the constructor and pass
  # learning rate, iterations i.e epochs and l1 penality for Lasso
  def __init__( self, lr, epochs, l1_penality ) :

      self.lr = lr
      self.epochs = epochs
      self.l1_penality = l1_penality
      
  # fit the model on training data
  def fit( self, X, Y ) :
      
      # no_of_training_examples, no_of_features
      self.m, self.n = X.shape
      
      # weight initialization
      self.W = np.zeros( self.n )
      
      self.b = 0
      self.X = X
      self.Y = Y
      
      # gradient descent learning
      for i in range( self.epochs ) :
          self.update_weights()
      return self

  # function to update weights in gradient descent
  def update_weights( self ) :

      Y_pred = self.predict( self.X )
      
      # calculate gradients
      dW = np.zeros( self.n )

      # add or subtract penalty depending upon positivity or negativity of weights
      for j in range( self.n ):
          if self.W[j] > 0 :
              dW[j] = ( - ( 2 * ( self.X[:, j] ).dot( self.Y - Y_pred ) )
                      
                      + self.l1_penality ) / self.m
          else :
              dW[j] = ( - ( 2 * ( self.X[:, j] ).dot( self.Y - Y_pred ) )
                      
                      - self.l1_penality ) / self.m

      db = - 2 * np.sum( self.Y - Y_pred ) / self.m
      
      # update weights
      self.W = self.W - self.lr * dW
      self.b = self.b - self.lr * db
      
      return self

  # predict the output
  def predict( self, X ) :
      return X.dot( self.W ) + self.b

In [31]:
def perform_lasso(X_train, X_test, y_train, y_test):
  # Model training
  model = LassoRegression( epochs = 1000, lr = 0.01, l1_penality = 500 )
  model.fit( X_train, y_train )

  # Prediction on test set
  y_pred = model.predict( X_test )

  return model.W, X_train

In [32]:
from seaborn.utils import axes_ticklabels_overlap
def bar_importance(coeff, X_train, feature_names, thresh):
  ''' ''' 

  # steps for feature importance
  feature_importance = np.std(X_train, 0)*np.array(coeff)
  
  above_threshold ={}
  # print feature name and their importance
  for name,importance in zip(feature_names,feature_importance):
      if abs(importance) >= thresh:
        above_threshold[name] = importance

  #create a dataframe containing feature name and their importance
  dff = pd.DataFrame([])
  dff['features'] = above_threshold.keys()
  dff['importance'] = above_threshold.values()

  d = dff.groupby(['features']).sum().sort_values('importance',ascending=False)
  
  # plot graph for feature importance
  d['importance'].plot.bar()
  return above_threshold

# Making a model with all the layers and activation functions

In [33]:
def make_model():
    model = Sequential()
    model.add(LSTM(units = 180, activation = 'relu',return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(units = 60, activation = 'relu',return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(units = 60, activation = 'relu',return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(units = 30))
    model.add(Dense(units = 1))
    model.compile(loss = "mean_squared_error", optimizer = "adam", metrics = "mae")
    
    return model

# Plotting function

In [34]:
def plot_values(pred_vals, actual_vals):
    plt.rcParams["figure.figsize"] = (20,3)
    plt.plot(pred_vals)
    plt.plot(actual_vals)
    plt.show()
    plt.savefig(i+'.jpg')
    plt.clf()

# Predictor function

Normalises data, feeds it into the model and gets prediction. Next, it plots the data using matplotlib

In [35]:
def create_model(X, y):
    
    scaler = MinMaxScaler()
    
    y = np.array(y)
        
    X_transformed = scaler.fit_transform(X)
    y_transformed = scaler.fit_transform(y.reshape(-1,1))
    
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, test_size = 0.2, shuffle = None)
    
    train_len = len(X_train)
    test_len = len(X_test)
    
    X_train = np.array(X_train)
    X_train = X_train.reshape(train_len,-1,1)
    
    predictor = make_model()
    
    predictor.fit(X_train, y_train, epochs = 10)
    
    X_test = np.array(X_test)
    X_test = X_test.reshape(test_len,-1,1)
    y_test = np.reshape(y_test,(-1,1))
    
    
    y_pred = predictor.predict(X_test)
    y_pred_inv_transformed = scaler.inverse_transform(y_pred)
    y_actual_inv_transformed = scaler.inverse_transform(y_test)
    
    plot_values(y_pred_inv_transformed, y_actual_inv_transformed)   
    
    print("The r2 score of this prediction is : ", r2_score(y_actual_inv_transformed, y_pred_inv_transformed))

In [36]:
def lass_create_model(X, y):
    
    scaler = MinMaxScaler()
    
    y = np.array(y)
    
    X_transformed = scaler.fit_transform(X)
    y_transformed = scaler.fit_transform(y.reshape(-1,1))
    
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, test_size = 0.2, shuffle = None)
    
    weight, X_train_refit = perform_lasso(X_train, X_test, y_train, y_test)
    feature_names = X.columns

    weight_thresh = 0.1
    importance_dict = bar_importance(weight, X_train_refit, feature_names, weight_thresh)
    
    X_train = X_train[:,list(importance_dict.keys())]
    X_test = X_test[:,list(importance_dict.keys())]
    
    train_len = len(X_train)
    test_len = len(X_test)
    
    X_train = np.array(X_train)
    X_train = X_train.reshape(train_len,-1,1)
    
    predictor = make_model()
    
    predictor.fit(X_train, y_train, epochs = 10)
    
    X_test = np.array(X_test)
    X_test = X_test.reshape(test_len,-1,1)
    y_test = np.reshape(y_test,(-1,1))
    
    
    y_pred = predictor.predict(X_test)
    y_pred_inv_transformed = scaler.inverse_transform(y_pred)
    y_actual_inv_transformed = scaler.inverse_transform(y_test)
    
    plot_values(y_pred_inv_transformed, y_actual_inv_transformed)   
    
    print("The r2 score of this prediction is : ", r2_score(y_actual_inv_transformed, y_pred_inv_transformed))

In [37]:
def final_kbest_func(name):
    X, y = process_data(name)
    new_X, new_y = K_best_selector(X, y)
    create_model(new_X, new_y)

In [38]:
def final_pca_func(name):
    X, y = process_data(name)
    new_X, new_y = pca_selector(X, y)
    create_model(new_X, new_y)    

In [39]:
def final_lasso_func(name):
    X, y = process_data(name)
    lass_create_model(X,y)

In [40]:
stocks =  ['AAPL']

In [41]:
for i in stocks:
    final_lasso_func(i)

[*********************100%***********************]  1 of 1 completed


ValueError: setting an array element with a sequence.