In [1]:
import pandas as pd
import numpy as np
from pmdarima.arima import AutoARIMA
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm as tqdm
from sklearn.metrics import mean_squared_error
from datetime import date, timedelta
import yfinance as yf

In [70]:
# Getting the date five years ago to download the current timeframe
years = (date.today() - timedelta(weeks=260)).strftime("%Y-%m-%d")
# Stocks to analyze
stocks = ['WIPRO.NS', 'INFY.NS', 'BSOFT.NS', 'LT.NS'] #
# stocks =['GE', 'GPRO', 'FIT', 'F']
# Getting the data for multiple stocks
df = yf.download(stocks, start=years).dropna()
# Storing the dataframes in a dictionary
stock_df = {}
for col in set(df.columns.get_level_values(0)):
    
    # Assigning the data for each stock in the dictionary
    stock_df[col] = df[col]


[*********************100%***********************]  4 of 4 completed


In [71]:
stock_df

{'Open':               BSOFT.NS      INFY.NS        LT.NS    WIPRO.NS
 Date                                                        
 2016-01-22   86.593300   574.049988   743.932983  208.388000
 2016-01-25   88.551498   570.125000   767.333008  205.462997
 2016-01-27   91.131203   573.224976   748.000000  206.250000
 2016-01-28   89.017700   568.000000   734.000000  207.449997
 2016-01-29   89.514999   569.000000   716.000000  205.725006
 ...                ...          ...          ...         ...
 2021-01-11  267.899994  1342.099976  1382.000000  436.000000
 2021-01-12  264.549988  1378.000000  1354.000000  447.950012
 2021-01-13  267.299988  1373.849976  1354.000000  461.000000
 2021-01-14  267.299988  1360.000000  1358.000000  452.000000
 2021-01-15  266.950012  1360.000000  1376.699951  454.750000
 
 [1228 rows x 4 columns],
 'High':               BSOFT.NS      INFY.NS        LT.NS    WIPRO.NS
 Date                                                        
 2016-01-22   89.017700   

In [72]:
# Finding the log returns
stock_df['LogReturns'] = stock_df['Adj Close'].apply(np.log).diff().dropna()
# Using Moving averages
stock_df['MovAvg'] = stock_df['Adj Close'].rolling(10).mean().dropna()
# Logarithmic scaling of the data and rounding the result
stock_df['Log'] = stock_df['MovAvg'].apply(np.log).apply(lambda x: round(x, 2))

In [73]:
stock_df

{'Open':               BSOFT.NS      INFY.NS        LT.NS    WIPRO.NS
 Date                                                        
 2016-01-22   86.593300   574.049988   743.932983  208.388000
 2016-01-25   88.551498   570.125000   767.333008  205.462997
 2016-01-27   91.131203   573.224976   748.000000  206.250000
 2016-01-28   89.017700   568.000000   734.000000  207.449997
 2016-01-29   89.514999   569.000000   716.000000  205.725006
 ...                ...          ...          ...         ...
 2021-01-11  267.899994  1342.099976  1382.000000  436.000000
 2021-01-12  264.549988  1378.000000  1354.000000  447.950012
 2021-01-13  267.299988  1373.849976  1354.000000  461.000000
 2021-01-14  267.299988  1360.000000  1358.000000  452.000000
 2021-01-15  266.950012  1360.000000  1376.699951  454.750000
 
 [1228 rows x 4 columns],
 'High':               BSOFT.NS      INFY.NS        LT.NS    WIPRO.NS
 Date                                                        
 2016-01-22   89.017700   

In [79]:
# Days in the past to train on
days_to_train = 180 

# Days in the future to predict
days_to_predict = 5

# Establishing a new DF for predictions
stock_df['Predictions'] = pd.DataFrame(index=stock_df['Log'].index,
                                       columns=stock_df['Log'].columns)

# Iterate through each stock
for stock in tqdm(stocks):
    
    # Current predicted value
    pred_val = 0
    
    # Training the model in a predetermined date range
    for day in tqdm(range(1000, 
                          stock_df['Log'].shape[0]-days_to_predict)):        

        # Data to use, containing a specific amount of days
        training = stock_df['Log'][stock].iloc[day-days_to_train:day+1].dropna()
        
        # Determining if the actual value crossed the predicted value
        cross = ((training[-1] >= pred_val >= training[-2]) or 
                 (training[-1] <= pred_val <= training[-2]))
#         print(cross)
        # Running the model when the latest training value crosses the predicted value or every other day 
        if cross or day % 2 == 0:

            # Finding the best parameters
            model    = AutoARIMA(start_p=0, start_q=0,
                                 start_P=0, start_Q=0,
                                 max_p=8, max_q=8,
                                 max_P=5, max_Q=5,
                                 error_action='ignore',
                                 information_criterion='bic',
                                 suppress_warnings=True)

            # Getting predictions for the optimum parameters by fitting to the training set            
            forecast = model.fit_predict(training,
                                         n_periods=days_to_predict)

            # Getting the last predicted value from the next N days
            stock_df['Predictions'][stock].iloc[day:day+days_to_predict] = np.exp(forecast[-1])


            # Updating the current predicted value
            pred_val = forecast[-1]


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

In [80]:
# Shift ahead by 1 to compare the actual values to the predictions
pred_df = stock_df['Predictions'].shift(1).astype(float).dropna()


In [81]:
pred_df

Unnamed: 0_level_0,BSOFT.NS,INFY.NS,LT.NS,WIPRO.NS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-03,93.019907,751.505384,1159.453022,231.079446
2020-03-04,93.019907,751.505384,1159.453022,231.079446
2020-03-05,96.070455,744.817281,1126.671755,233.730015
2020-03-06,96.070455,744.817281,1126.671755,231.340092
2020-03-09,94.632408,723.569103,1125.856715,228.911012
...,...,...,...,...
2021-01-08,265.041176,1287.287380,1336.369662,399.027289
2021-01-11,265.041176,1287.287380,1336.369662,399.027289
2021-01-12,265.041176,1287.287380,1336.369662,399.027289
2021-01-13,265.041176,1287.287380,1336.369662,399.027289


In [82]:
for stock in stocks:
    
    fig = go.Figure()
    
    # Plotting the actual values
    fig.add_trace(go.Scatter(x=pred_df.index,
                             y=stock_df['MovAvg'][stock].loc[pred_df.index],
                             name='Actual Moving Average',
                             mode='lines'))
    
    # Plotting the predicted values
    fig.add_trace(go.Scatter(x=pred_df.index,
                             y=pred_df[stock],
                             name='Predicted Moving Average',
                             mode='lines'))
    
    # Setting the labels
    fig.update_layout(title=f'Predicting the Moving Average for the Next {days_to_predict} days for {stock}',
                      xaxis_title='Date',
                      yaxis_title='Prices')
    
    fig.show()


In [83]:
for stock in stocks:
    
    # Finding the root mean squared error
    rmse = mean_squared_error(stock_df['MovAvg'][stock].loc[pred_df.index], pred_df[stock], squared=False)
print(f"On average, the model is off by {rmse} for {stock}\n")


On average, the model is off by 25.954227727451432 for LT.NS



In [84]:
def get_positions(difference, thres=3, short=True):
    """
    Compares the percentage difference between actual 
    values and the respective predictions.
    
    Returns the decision or positions to long or short 
    based on the difference.
    
    Optional: shorting in addition to buying
    """
    
    if difference > thres/100:
        
        return 1
    
    
    elif short and difference < -thres/100:
        
        return -1
    
    
    else:
        
        return 0


In [85]:
# Creating a DF dictionary for trading the model
trade_df = {}

# Getting the percentage difference between the predictions and the actual values
trade_df['PercentDiff'] = (stock_df['Predictions'].dropna() / 
                           stock_df['MovAvg'].loc[stock_df['Predictions'].dropna().index]) - 1

# Getting positions
trade_df['Positions'] = trade_df['PercentDiff'].applymap(lambda x: get_positions(x, 
                                                                                 thres=1, 
                                                                                 short=True) / len(stocks))

# Preventing lookahead bias by shifting the positions
trade_df['Positions'] = trade_df['Positions'].shift(2).dropna()

# Getting Log Returns
trade_df['LogReturns'] = stock_df['LogReturns'].loc[trade_df['Positions'].index]                                    


In [86]:
trade_df['LogReturns']

Unnamed: 0_level_0,BSOFT.NS,INFY.NS,LT.NS,WIPRO.NS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-04,0.045499,0.015607,-0.003944,0.022761
2020-03-05,-0.002620,-0.005485,0.004917,0.003707
2020-03-06,-0.016398,-0.020958,-0.020117,-0.027139
2020-03-09,-0.066140,-0.047813,-0.050390,-0.041792
2020-03-11,0.009640,-0.026540,0.013520,-0.006082
...,...,...,...,...
2021-01-07,0.009402,-0.015683,0.018810,0.000861
2021-01-08,0.010260,0.038812,0.025404,0.056051
2021-01-11,0.005655,0.047697,-0.017185,0.037861
2021-01-12,0.000376,-0.003239,-0.000148,0.024103


In [87]:
trade_df['Positions']

Unnamed: 0_level_0,BSOFT.NS,INFY.NS,LT.NS,WIPRO.NS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-04,-0.25,-0.25,-0.25,-0.25
2020-03-05,-0.25,0.00,-0.25,-0.25
2020-03-06,0.25,-0.25,-0.25,0.00
2020-03-09,0.25,0.00,-0.25,0.00
2020-03-11,0.00,-0.25,-0.25,0.00
...,...,...,...,...
2021-01-07,0.25,0.25,0.25,0.25
2021-01-08,0.25,0.25,0.25,0.25
2021-01-11,0.25,0.25,0.25,0.25
2021-01-12,0.25,0.25,0.25,0.00


In [88]:
# Calculating Returns by multiplying the 
# positions by the log returns
returns = trade_df['Positions'] * trade_df['LogReturns']
# Calculating the performance as we take the cumulative 
# sum of the returns and transform the values back to normal
performance = returns.cumsum().apply(np.exp)
# Plotting the performance per stock
px.line(performance,
        x=performance.index,
        y=performance.columns,
        title='Returns Per Stock Using ARIMA Forecast',
        labels={'variable':'Stocks',
                'value':'Returns'})

In [116]:
# Returns for the portfolio
returns = (trade_df['Positions'] * trade_df['LogReturns']).sum(axis=1)

# Returns for SPY
spy = yf.download('^NSEI', start=returns.index[0])
spy = spy[spy.index.isin(returns.index)]
spy = spy['Adj Close'].apply(np.log).diff().dropna().cumsum().apply(np.exp)

# Calculating the performance as we take the cumulative sum of the returns and transform the values back to normal
performance = returns.cumsum().apply(np.exp)

# Plotting the comparison between SPY returns and ARIMA returns
fig = go.Figure()

fig.add_trace(go.Scatter(x=spy.index,
                         y=spy,
                         name='^NSEI Returns',
                         mode='lines'))

fig.add_trace(go.Scatter(x=performance.index,
                         y=performance.values,
                         name='ARIMA Returns on Portfolio',
                         mode='lines'))

fig.update_layout(title='^NSEI vs ARIMA Overall Portfolio Returns',
                  xaxis_title='Date',
                  yaxis_title='Returns')

fig.show()


[*********************100%***********************]  1 of 1 completed
