# Importing Dependencies

In [1]:
# Task B2
import numpy as np
import pandas as pd
import yfinance as yf
import datetime as dt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import os

# Task B3
import plotly.graph_objects as go
import plotly.express as px

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Hyperparameters

In [2]:
# Task B2
TICKER = "AAPL"
START_DATE = "2014-01-01"
END_DATE = "2020-12-31"
LOOK_UP_DAYS = 30      
TRAINING_RATIO = 0.8    # 0.7 == 70%
SPLIT_BY_DATE = False
SPLIT_RANDOMLY = False
SCALE_DATA = True
SCALING_METHOD = "Standard"       # MinMax, Standard

# Task B3
TRADING_PERIOD = 30
CONSECUTIVE_DAYS = 300

# Task B2: DATA PROCESSING 1

## Scaling dataset

In [3]:
def scale_data(stock_data, scaling_method=SCALING_METHOD):

    if scaling_method == "MinMax":
        scaler = preprocessing.MinMaxScaler()
    
    elif scaling_method == "Standard":
        scaler = preprocessing.StandardScaler()
    
        
    col_names = stock_data.columns
    features = stock_data[col_names]
    scaler.fit(features.values)
    features = scaler.transform(features.values)
    scaled_features = pd.DataFrame(features, columns = col_names)
    scaled_features.index = stock_data.index
    
    return scaled_features, scaler


## Loading Data Function

In [4]:
def Load_Dataset(ticker=TICKER, start_date=START_DATE, end_date=END_DATE, scale=SCALE_DATA, scaling_method=SCALING_METHOD):
    '''
    ticker: is the code of the target ticker
    start_date: a start date string with format YYYY/MM/DD
    end_date: an end date string with format YYYY/MM/DD
    scale: a boolean value, True by default
    scaling_method: MinMax(by default), Standard.
    '''

    # result
    result = {
        "dataset": None,
        "scaler": None
    }

    # processing the input parameters
    start_date = dt.datetime.strptime(start_date, "%Y-%m-%d")
    end_date = dt.datetime.strptime(end_date, "%Y-%m-%d")


    # creating necessary folder
    if not os.path.isdir("results"):
        os.mkdir("results")
    
    if not os.path.isdir("data"):
        os.mkdir("data")


    
    # checking if the data is already downloaded 
    ## Get a list of files in the directory
    files = os.listdir("data")
    ## Check each file in the directory
    data = None
    for file_name in files:
        ## if we already downloaded the ticket data
        if file_name.startswith(ticker) and file_name.endswith(".csv"):
            ### Read the file 
            file_path = os.path.join("data", f"{ticker}.csv")
            data = pd.read_csv(file_path, parse_dates=['Date'])
            break

    ## else, we gonna download the stock data
    if data is None:
        stock_data = yf.download(ticker, start_date, end_date)
        file_path = os.path.join("data", f"{ticker}.csv")
        stock_data.to_csv(file_path)
        data = pd.read_csv(file_path, parse_dates=['Date'])

    # if the given time is included in the file, we just take the nessecary dataframe
    if data.head(1)["Date"].values[0] <= np.datetime64(start_date) and data.tail(1)["Date"].values[0] >= np.datetime64(end_date):
        data = data[(data['Date'] >= pd.to_datetime(start_date)) & (data['Date'] <= pd.to_datetime(end_date))]
        print("Local Stock Data is enough for requirements, do not need to download")
    else: 
        stock_data = yf.download(ticker, start_date, end_date)
        file_path = os.path.join("data", f"{ticker}.csv")
        stock_data.to_csv(file_path)
        data = pd.read_csv(file_path, parse_dates=['Date'])
        print("Local Stock Data is not enough for requirements, continuing downloading...")

    # Setting Date as Index
    data.set_index('Date', inplace=True)

    # Scale Data
    if scale:
        data, scaler = scale_data(data, scaling_method)
        result["dataset"] = data
        result["scaler"] = scaler
        return result

    result["dataset"] = data
    
    return result

# Splitting Dataset

In [5]:
def split(dataset: pd.DataFrame, look_up_days=LOOK_UP_DAYS, 
        training_ratio=TRAINING_RATIO, split_by_date=SPLIT_BY_DATE, 
        random=SPLIT_RANDOMLY, feature_columns=['Open','High','Low','Close','Adj Close','Volume']):
    '''
    dataset: a Pandas Dataframe
    training_ratio: is equal to TRAINING_RATION constant
    split_by_date: is equal to SPLIT_BY_DATE constant
    random: is equal to SPLIT_RANDOMLY
    '''
    # result
    result = {
        "X_training_set": {
                        'Open': None,
                        'High': None,
                        'Low': None,
                        'Close': None,
                        'Adj Close': None,
                        'Volume': None
                        },
        "Y_training_set": {
                        'Open': None,
                        'High': None,
                        'Low': None,
                        'Close': None,
                        'Adj Close': None,
                        'Volume': None
                        },
        "X_testing_set": {
                        'Open': None,
                        'High': None,
                        'Low': None,
                        'Close': None,
                        'Adj Close': None,
                        'Volume': None
                        },
        "Y_testing_set": {
                        'Open': None,
                        'High': None,
                        'Low': None,
                        'Close': None,
                        'Adj Close': None,
                        'Volume': None
                        }
    }
  
    for column in feature_columns:
        dataset_in_column = dataset[column].values.reshape(-1, 1)      # <class 'numpy.ndarray'>
        x_data = []
        y_data = []
    
        for x in range(look_up_days, len(dataset_in_column)):
            x_data.append(dataset_in_column[x - look_up_days:x, 0])
            y_data.append(dataset_in_column[x, 0])

        result["X_training_set"][column], result["X_testing_set"][column], result["Y_training_set"][column], result['Y_testing_set'][column] = train_test_split(x_data, y_data, test_size=1-training_ratio, shuffle=False)

        ## Converting to numpy.array

        for column in feature_columns:
            result["X_training_set"][column] = np.array(result["X_training_set"][column])
            result["Y_training_set"][column] = np.array(result["Y_training_set"][column])
            result["X_testing_set"][column] = np.array(result["X_testing_set"][column])
            result["Y_testing_set"][column] = np.array(result["Y_testing_set"][column])

    return result

## Data Processing 1 (FULL)

A function to load and process a dataset with multiple features with the following requirements: 

+ Specify the start date and the end date for the whole 
dataset as inputs. 
+ Allowing you to deal with the NaN issue in the data
+ Splitting dataset according to some specified ratio of train/test
+ Storing the downloaded data on your local machine for future uses
+ Allowing you to have an option to scale your feature columns and store the scalers in a data structure to allow future access to these scalers.

In [6]:
def Data_Processing_1():
    StockData = Load_Dataset()

    scaledStockData = scale_data(stock_data=StockData["dataset"])

    FinalResult = split(dataset=scaledStockData[0])

    print("Loaded Done!\nThe result is a dictionary as below:\n")
    print('''{
        "X_training_set": {
                        'Open': <class 'numpy.ndarray'>,
                        'High': <class 'numpy.ndarray'>,
                        'Low': <class 'numpy.ndarray'>,
                        'Close': <class 'numpy.ndarray'>,
                        'Adj Close': <class 'numpy.ndarray'>,
                        'Volume': <class 'numpy.ndarray'>
                        },
        "Y_training_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        },
        "X_testing_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        },
        "Y_testing_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        }
    }''')

    return FinalResult
    

In [7]:
StockData = Data_Processing_1()

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
[*********************100%%**********************]  1 of 1 completed

Local Stock Data is not enough for requirements, continuing downloading...
Loaded Done!
The result is a dictionary as below:

{
        "X_training_set": {
                        'Open': <class 'numpy.ndarray'>,
                        'High': <class 'numpy.ndarray'>,
                        'Low': <class 'numpy.ndarray'>,
                        'Close': <class 'numpy.ndarray'>,
                        'Adj Close': <class 'numpy.ndarray'>,
                        'Volume': <class 'numpy.ndarray'>
                        },
        "Y_training_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        },
        "X_testing_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
           




# Task B3: DATA PROCESSING 2

## Inverse Data

In [8]:
def Inverse_Dataset():
    
    # Loading the Data
    dataLoader = Load_Dataset()
    StockData = dataLoader["dataset"]
    scaler = dataLoader["scaler"]
    
    # Getting Column name
    col_names = StockData.columns
    # Inversing the dataframe
    re_scaled_features = scaler.inverse_transform(StockData)
    re_scaled_stock_data = pd.DataFrame(re_scaled_features, columns = col_names)
    # Assigning index to the rescaled_data
    re_scaled_stock_data.index = StockData.index


    return re_scaled_stock_data
       

## Candlestick Chart

In [9]:
def CandleStick(TradingPeriod=TRADING_PERIOD):
    
    # Loading the dataset
    StockData = Load_Dataset()["dataset"]
    scaled_stock_data = None
    # rescale the dataset if required
    if SCALE_DATA:
        scaled_stock_data = Inverse_Dataset()
    else:
        scaled_stock_data = StockData


    # Processed Data to fit the Trading Period
    total_records = len(scaled_stock_data)
    Price_Data = {  'Date': [],
                    'Open': [],
                    'High': [],
                    'Low': [],
                    'Close': [],
                    'Volume': []
                }
    
    
    ## Loop through the DataFrame in batches of TradingPeriod
    for i in range(0, total_records, TradingPeriod):
        batch = scaled_stock_data.iloc[i:i+TradingPeriod]

        Price_Data['Date'].append(batch.index[0])
        Price_Data['Open'].append(batch['Open'].values[0])   
        Price_Data['High'].append(max(batch['High'].values))
        Price_Data['Low'].append(min(batch['Low'].values))
        Price_Data['Close'].append(batch['Close'].values[len(batch) - 1])
        Price_Data['Volume'].append(sum(batch['Volume'].values))

    
    # Converting to Pandas Dataframe
    NewDataFrame = pd.DataFrame(Price_Data)
    NewDataFrame.set_index('Date', inplace=True)


    fig = go.Figure(data=[go.Candlestick(x=NewDataFrame.index,                         
                open=NewDataFrame['Open'],
                high=NewDataFrame['High'],
                low=NewDataFrame['Low'],
                close=NewDataFrame['Close'])])

    fig.update_layout(
        title=f"Candle Stick Chart for {TICKER} shared price, Trading Period = {TRADING_PERIOD} day(s)",
        xaxis_title="Time",
        yaxis_title="Shared Price (USD)"
    )
    fig.show()


## Boxplot Chart

In [11]:
def BoxPlot(ConsecutiveDays=CONSECUTIVE_DAYS, features=['Open', 'High', 'Low', 'Close']):
    # Loading the dataset
    StockData = Load_Dataset()["dataset"]
    scaled_stock_data = None
    # rescale the dataset if required
    if SCALE_DATA:
        scaled_stock_data = Inverse_Dataset()
    else:
        scaled_stock_data = StockData

    total_records = len(scaled_stock_data)

    consecutive_days_array = []
    ## Loop through the DataFrame in batches of ConsecutiveDays
    for i in range(0, total_records, ConsecutiveDays):
        batch = scaled_stock_data.iloc[i:i+ConsecutiveDays]
        StartDate = batch.index[0].strftime('%Y-%m-%d')
        EndDate = batch.index[len(batch) - 1].strftime('%Y-%m-%d')
        
        consecutive_period = f"{StartDate} to {EndDate}"
        for _ in range(len(batch)):
            consecutive_days_array.append(consecutive_period)
        
    scaled_stock_data['Consecutive'] = consecutive_days_array
    scaled_stock_data['Consecutive'] = scaled_stock_data['Consecutive'].astype("string")
    

    for feature in features:
        fig = px.box(scaled_stock_data, x="Consecutive", y="Open")

        fig.update_layout(
        title=f"Boxplot Chart for {TICKER} shared price, Consecutive Days = {CONSECUTIVE_DAYS} day(s)",
        xaxis_title="Time",
        yaxis_title=f"{feature} Price (USD)"
        )
        fig.show()

## Task B3: Data Processing 2 (Full)

In [12]:
def Data_Processing_2():
    CandleStick()

    BoxPlot()

In [13]:
Data_Processing_2()


The 'unit' keyword in TimedeltaIndex construction is deprecated and will be removed in a future version. Use pd.to_timedelta instead.

[*********************100%%**********************]  1 of 1 completed

The 'unit' keyword in TimedeltaIndex construction is deprecated and will be removed in a future version. Use pd.to_timedelta instead.

[*********************100%%**********************]  1 of 1 completed

Local Stock Data is not enough for requirements, continuing downloading...
Local Stock Data is not enough for requirements, continuing downloading...






The 'unit' keyword in TimedeltaIndex construction is deprecated and will be removed in a future version. Use pd.to_timedelta instead.

[*********************100%%**********************]  1 of 1 completed


Local Stock Data is not enough for requirements, continuing downloading...


The 'unit' keyword in TimedeltaIndex construction is deprecated and will be removed in a future version. Use pd.to_timedelta instead.

[*********************100%%**********************]  1 of 1 completed



Local Stock Data is not enough for requirements, continuing downloading...
