# Setup

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_step(model, X_train, y_train, loss_function, optimizer):
  """
  Run a single optimisation step on the given batch.
  @param model: callable torch module producing predictions from X_train
  @param X_train: input batch passed to the model
  @param y_train: targets compared against the model output
  @param loss_function: callable taking (predictions, targets) and returning a scalar loss tensor
  @param optimizer: optimizer whose step() applies the computed gradients
  @return: float, the loss value measured before the parameter update
  """
  # Clear gradients left over from any previous step so they do not accumulate.
  model.zero_grad()

  predictions = model(X_train)
  batch_loss = loss_function(predictions, y_train)

  batch_loss.backward()
  optimizer.step()

  return batch_loss.item()


def add_original_feature(df, df_new):
  """
  Copy the raw price/volume inputs into df_new (modified in place).
  @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume"
  @param df_new: dataframe receiving the 6 new columns
  @return: None
  """
  # The current row's open is used as-is.
  df_new['open'] = df['Open']

  # Every other raw value is taken from the previous row (shift by one).
  lagged_columns = (
    ('open_1', 'Open'),
    ('close_1', 'Close'),
    ('high_1', 'High'),
    ('low_1', 'Low'),
    ('volume_1', 'Volume'),
  )
  for target, source in lagged_columns:
    df_new[target] = df[source].shift(1)


def add_avg_price(df,df_new):
  """
  Add rolling means of "Close" and their pairwise ratios to df_new (in place).
  The columns labelled 5 / 30 / 365 use windows of 5 / 21 / 252 rows, and
  each average is shifted by one row so it only covers earlier rows.
  @param df: dataframe with a "Close" column
  @param df_new: dataframe receiving the 6 new columns
  @return: None
  """
  close = df['Close']

  windows = {'avg_price_5': 5, 'avg_price_30': 21, 'avg_price_365': 252}
  for column, span in windows.items():
    df_new[column] = close.rolling(window=span).mean().shift(1)

  # (new column, numerator, denominator)
  ratios = (
    ('ratio_avg_price_5_30', 'avg_price_5', 'avg_price_30'),
    ('ratio_avg_price_5_365', 'avg_price_5', 'avg_price_365'),
    ('ratio_avg_price_30_365', 'avg_price_30', 'avg_price_365'),
  )
  for column, numerator, denominator in ratios:
    df_new[column] = df_new[numerator] / df_new[denominator]


def add_avg_volume(df,df_new):
  """
  Add rolling means of "Volume" and their pairwise ratios to df_new (in place).
  The columns labelled 5 / 30 / 365 use windows of 5 / 21 / 252 rows, and
  each average is shifted by one row so it only covers earlier rows.
  @param df: dataframe with a "Volume" column
  @param df_new: dataframe receiving the 6 new columns
  @return: None
  """
  volume = df['Volume']

  windows = {'avg_volume_5': 5, 'avg_volume_30': 21, 'avg_volume_365': 252}
  for column, span in windows.items():
    df_new[column] = volume.rolling(window=span).mean().shift(1)

  # (new column, numerator, denominator)
  ratios = (
    ('ratio_avg_volume_5_30', 'avg_volume_5', 'avg_volume_30'),
    ('ratio_avg_volume_5_365', 'avg_volume_5', 'avg_volume_365'),
    ('ratio_avg_volume_30_365', 'avg_volume_30', 'avg_volume_365'),
  )
  for column, numerator, denominator in ratios:
    df_new[column] = df_new[numerator] / df_new[denominator]


def add_std_price(df,df_new):
  """
  Add rolling standard deviations of "Close" and their pairwise ratios to
  df_new (in place).
  The columns labelled 5 / 30 / 365 use windows of 5 / 21 / 252 rows, and
  each value is shifted by one row so it only covers earlier rows.
  @param df: dataframe with a "Close" column
  @param df_new: dataframe receiving the 6 new columns
  @return: None
  """
  close = df['Close']
  # Use .std() here: with .mean() these columns would simply duplicate the
  # avg_price_* features instead of measuring price dispersion.
  df_new['std_price_5'] = close.rolling(window=5).std().shift(1)
  df_new['std_price_30'] = close.rolling(window=21).std().shift(1)
  df_new['std_price_365'] = close.rolling(window=252).std().shift(1)
  df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30']
  df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365']
  df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365']

def add_std_volume(df,df_new):
  """
  Add rolling standard deviations of "Volume" and their pairwise ratios to
  df_new (in place).
  The columns labelled 5 / 30 / 365 use windows of 5 / 21 / 252 rows, and
  each value is shifted by one row so it only covers earlier rows.
  @param df: dataframe with a "Volume" column
  @param df_new: dataframe receiving the 6 new columns
  @return: None
  """
  volume = df['Volume']
  # These features describe volume dispersion, so they must read "Volume"
  # and use .std(); reading "Close" with .mean() would only reproduce the
  # avg_price_* columns under a different name.
  df_new['std_volume_5'] = volume.rolling(window=5).std().shift(1)
  df_new['std_volume_30'] = volume.rolling(window=21).std().shift(1)
  df_new['std_volume_365'] = volume.rolling(window=252).std().shift(1)
  df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30']
  df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365']
  df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365']


def add_return_feature(df,df_new):
  """
  Add relative "Close" changes and rolling means of the one-row change to
  df_new (in place).
  The columns labelled 1 / 5 / 30 / 365 compare against the close 1 / 5 /
  21 / 252 rows earlier; every value is shifted by one row.
  The rolling-mean columns keep their existing 'movie_avg_*' spelling so
  that column names stay unchanged for any code reading them.
  @param df: dataframe with a "Close" column
  @param df_new: dataframe receiving the 7 new columns
  @return: None
  """
  close = df['Close']

  lags = {'return_1': 1, 'return_5': 5, 'return_30': 21, 'return_365': 252}
  for column, lag in lags.items():
    earlier = close.shift(lag)
    df_new[column] = ((close - earlier) / earlier).shift(1)

  # Averages are taken over the already-shifted one-row return and then
  # shifted once more.
  one_row_return = df_new['return_1']
  windows = {'movie_avg_5': 5, 'movie_avg_30': 21, 'movie_avg_365': 252}
  for column, span in windows.items():
    df_new[column] = one_row_return.rolling(window=span).mean().shift(1)



def generate_features(df):
  """
  Generate features for a stock/index based on historical price and
  performance
  @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume"
  @return: dataframe, data set with new features plus the target column
           "close"; rows containing any missing value are dropped
  """
  features = pd.DataFrame()

  # Builders run in this order, which fixes the column order of the result.
  builders = (
    add_original_feature,  # 6 original features
    add_avg_price,         # 31 generated features from here on
    add_avg_volume,
    add_std_price,
    add_std_volume,
    add_return_feature,
  )
  for build in builders:
    build(df, features)

  # the target
  features['close'] = df['Close']

  # Rolling windows and shifts leave missing values in the leading rows.
  return features.dropna()


print("Downloading NASDAQ Composite data (1990-2023)...")
data = yf.download(
    "^IXIC",
    start="1990-01-01",
    end="2023-06-30",
    progress=False,
)

# Persist the download, then reload it with a flat, explicitly named set of
# columns.
data.to_csv('19900101_20230630.csv')

# header=None with skiprows=2 discards the first two lines of the file instead
# of parsing them as column labels; column 0 becomes the index.
# NOTE(review): presumably this matches the multi-row header yfinance writes
# for its multi-level columns - confirm against the installed yfinance version.
data_raw = pd.read_csv('19900101_20230630.csv', header=None, index_col=0, skiprows=2)

# Assign column names to the data columns after setting the index.
# NOTE(review): assumes the file holds exactly these five columns in this
# order - verify against the CSV written above.
data_raw.columns = ['Close', 'High', 'Low', 'Open', 'Volume']

# Set the index name for clarity
data_raw.index.name = 'Date'

# Generate features using the cleaned data
data = generate_features(data_raw)

start_train = '1990-01-01'
end_train = '2022-12-31'
start_test = '2023-01-01'
end_test = '2023-06-30'

# Training split: features are every column except the target 'close'.
data_train = data.loc[start_train:end_train]
X_train = data_train.drop('close', axis=1).values
y_train = data_train['close'].values
print(X_train.shape)
print(y_train.shape)

# Test split, built the same way from the later date range.
data_test = data.loc[start_test:end_test]
X_test = data_test.drop('close', axis=1).values
y_test = data_test['close'].values
print(X_test.shape)
print(y_test.shape)

# Exercises

1. As mentioned, can you use more hidden layers in the neural network stock predictor and rerun
the model fine-tuning? Can you get a better result?


2. Following the first exercise, can you apply dropout and/or early stopping and see if you can
beat the current best $R^2$ of 0.977?