# Importing Dependencies

In [2]:
# Task B2
import numpy as np
import pandas as pd
import yfinance as yf
import datetime as dt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Hyperparameters

In [1]:
# Task B2
# Default ticker symbol used by Load_Dataset
TICKER = "AAPL"
# Default date window for Load_Dataset; both strings are parsed with "%Y-%m-%d"
START_DATE = "2014-01-01"
END_DATE = "2020-12-31"
# Number of consecutive past rows that make up one input window in split()
LOOK_UP_DAYS = 30      
TRAINING_RATIO = 0.8    # fraction of the windows used for training, e.g. 0.8 == 80%
# Defaults for the split_by_date / random arguments of split()
SPLIT_BY_DATE = False
SPLIT_RANDOMLY = False
# Whether Load_Dataset scales the columns before returning them
SCALE_DATA = True
SCALING_METHOD = "MinMax"       # supported by scale_data: "MinMax", "Standard"

# Task B2: DATA PROCESSING 1

## Scaling dataset

In [3]:
def scale_data(stock_data, scaling_method=SCALING_METHOD):
    '''
    Scale every column of a DataFrame with a scikit-learn scaler.

    stock_data: a Pandas DataFrame; its columns are expected to be numeric
    scaling_method: "MinMax" (MinMaxScaler) or "Standard" (StandardScaler)

    Returns a tuple (scaled_features, scaler):
        scaled_features: a DataFrame with the same columns and index as
                         stock_data, holding the transformed values
        scaler: the scaler that was fitted on stock_data

    Raises ValueError if scaling_method is not one of the supported names.
    '''
    if scaling_method == "MinMax":
        scaler = preprocessing.MinMaxScaler()
    elif scaling_method == "Standard":
        scaler = preprocessing.StandardScaler()
    else:
        # Without this branch an unknown name left `scaler` unassigned and the
        # function failed further down with an unrelated UnboundLocalError.
        raise ValueError(
            f'Unsupported scaling_method {scaling_method!r}; expected "MinMax" or "Standard"'
        )

    # Fit on the raw values and transform them in a single pass
    scaled_values = scaler.fit_transform(stock_data.values)

    # Rebuild a DataFrame so the column names and the index survive the scaling
    scaled_features = pd.DataFrame(
        scaled_values,
        columns=stock_data.columns,
        index=stock_data.index,
    )

    return scaled_features, scaler


## Loading Data Function

In [5]:
def Load_Dataset(ticker=TICKER, start_date=START_DATE, end_date=END_DATE, scale=SCALE_DATA, scaling_method=SCALING_METHOD):
    '''
    Load the price history of a ticker, reusing a local CSV cache when it
    already covers the requested window.

    ticker: the code of the target ticker
    start_date: a start date string with format YYYY-MM-DD
    end_date: an end date string with format YYYY-MM-DD
    scale: a boolean value; when True the columns are scaled with scale_data
    scaling_method: MinMax or Standard, forwarded to scale_data

    Returns a dictionary:
        "dataset": a DataFrame indexed by Date
        "scaler": the fitted scaler when scale is True, otherwise None

    Side effects: creates the "results" and "data" folders when they are
    missing and (re)writes data/<ticker>.csv whenever data is downloaded.
    '''

    # result
    result = {
        "dataset": None,
        "scaler": None
    }

    # processing the input parameters
    start_date = dt.datetime.strptime(start_date, "%Y-%m-%d")
    end_date = dt.datetime.strptime(end_date, "%Y-%m-%d")

    # creating necessary folders
    os.makedirs("results", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    # The cache always lives at data/<ticker>.csv, so test for that exact file
    # instead of any file that merely starts with the ticker name.
    file_path = os.path.join("data", f"{ticker}.csv")
    data = None
    if os.path.isfile(file_path):
        data = pd.read_csv(file_path, parse_dates=['Date'])

    # The cache is sufficient when its first and last rows enclose the window.
    # NOTE(review): a window whose bounds fall on days that are absent from
    # the file is treated as not covered and triggers a fresh download.
    cache_covers_window = (
        data is not None
        and not data.empty
        and data["Date"].iloc[0] <= pd.Timestamp(start_date)
        and data["Date"].iloc[-1] >= pd.Timestamp(end_date)
    )

    if cache_covers_window:
        # if the given time is included in the file, we just take the necessary rows
        in_window = (data['Date'] >= pd.to_datetime(start_date)) & (data['Date'] <= pd.to_datetime(end_date))
        data = data[in_window]
        print("Local Stock Data is enough for requirements, do not need to download")
    else:
        # Download exactly once, whether the cache is missing or too short
        print("Local Stock Data is not enough for requirements, continuing downloading...")
        stock_data = yf.download(ticker, start_date, end_date)
        stock_data.to_csv(file_path)
        # Read the file back so downloaded and cached data share one layout
        data = pd.read_csv(file_path, parse_dates=['Date'])

    # Setting Date as Index (returns a new frame; avoids mutating a filtered slice)
    data = data.set_index('Date')

    # Scale Data
    if scale:
        data, scaler = scale_data(data, scaling_method)
        result["scaler"] = scaler

    result["dataset"] = data

    return result

In [6]:
# Load AAPL for an explicit date window; scale and scaling_method keep their defaults
StockData = Load_Dataset("AAPL", "2010-12-01", "2020-01-01")

Local Stock Data is enough for requirements, do not need to download


In [6]:
# Inspect the index range, column dtypes and non-null counts of the loaded frame
StockData["dataset"].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2286 entries, 2010-12-01 to 2019-12-31
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       2286 non-null   float64
 1   High       2286 non-null   float64
 2   Low        2286 non-null   float64
 3   Close      2286 non-null   float64
 4   Adj Close  2286 non-null   float64
 5   Volume     2286 non-null   float64
dtypes: float64(6)
memory usage: 125.0 KB


In [7]:
# split() calls .values.reshape(...) on each column, so check what .values returns
print(type(StockData["dataset"]["Open"].values))

<class 'numpy.ndarray'>


# Splitting Dataset

In [7]:
def split(dataset: pd.DataFrame, look_up_days=LOOK_UP_DAYS,
        training_ratio=TRAINING_RATIO, split_by_date=SPLIT_BY_DATE,
        random=SPLIT_RANDOMLY, feature_columns=['Open','High','Low','Close','Adj Close','Volume']):
    '''
    Build sliding-window samples for each feature column and split them into
    training and testing sets.

    For every column, sample i consists of look_up_days consecutive values
    (the input window) and the value that immediately follows them (the target).

    dataset: a Pandas Dataframe containing every column in feature_columns
    look_up_days: number of consecutive rows in one input window
    training_ratio: fraction of the samples placed in the training set
                    (defaults to the TRAINING_RATIO constant)
    split_by_date: defaults to the SPLIT_BY_DATE constant
    random: defaults to the SPLIT_RANDOMLY constant
    feature_columns: names of the columns to process

    NOTE(review): split_by_date and random are accepted but not consulted;
    the split is always done in row order (shuffle=False).

    Returns a dictionary with the keys "X_training_set", "Y_training_set",
    "X_testing_set" and "Y_testing_set". Each maps a column name to a
    numpy.ndarray; default columns that were not requested stay None.

    Raises ValueError if a column has too few rows to form a single window.
    '''
    default_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    set_names = ("X_training_set", "Y_training_set", "X_testing_set", "Y_testing_set")

    # result: one {column: None} placeholder dictionary per set
    result = {set_name: dict.fromkeys(default_columns) for set_name in set_names}

    for column in feature_columns:
        values = dataset[column].values      # <class 'numpy.ndarray'>

        if len(values) <= look_up_days:
            raise ValueError(
                f"Column {column!r} has {len(values)} rows, which is not enough "
                f"to build a window of {look_up_days} rows plus a target"
            )

        # Each target position `end` is paired with the look_up_days values before it
        target_positions = range(look_up_days, len(values))
        x_data = [values[end - look_up_days:end] for end in target_positions]
        y_data = [values[end] for end in target_positions]

        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=1-training_ratio, shuffle=False
        )

        # Converting to numpy.array, once per column. The previous nested loop
        # reused the name `column`, redid this work on every iteration and
        # raised KeyError for columns outside the default six.
        result["X_training_set"][column] = np.array(x_train)
        result["Y_training_set"][column] = np.array(y_train)
        result["X_testing_set"][column] = np.array(x_test)
        result["Y_testing_set"][column] = np.array(y_test)

    return result

## Data Processing 1 (FULL)

A function to load and process a dataset with multiple features with the following requirements: 

+ Specify the start date and the end date for the whole dataset as inputs.
+ Handle NaN values in the data.
+ Split the dataset into train/test sets according to a specified ratio.
+ Store the downloaded data on the local machine for future use.
+ Optionally scale the feature columns and store the scalers in a data structure so that they can be accessed later.

In [8]:
def Data_Processing_1():
    '''
    Run the whole data-processing pipeline with the default hyperparameters.

    Loads the dataset with Load_Dataset (which already scales it when
    SCALE_DATA is True), then builds the train/test windows with split.

    The dataset is no longer passed through scale_data a second time: the
    extra pass scaled the data even when SCALE_DATA was False and threw away
    the scaler that had been fitted on the original prices.

    Returns the dictionary produced by split, extended with one extra key:
        "scaler": the scaler returned by Load_Dataset (None when the
                  dataset was not scaled)
    '''
    StockData = Load_Dataset()

    FinalResult = split(dataset=StockData["dataset"])

    # Keep the fitted scaler next to the data so it stays accessible later
    FinalResult["scaler"] = StockData["scaler"]

    print("Loaded Done!\nThe result is a dictionary as below:\n")
    print('''{
        "X_training_set": {
                        'Open': <class 'numpy.ndarray'>,
                        'High': <class 'numpy.ndarray'>,
                        'Low': <class 'numpy.ndarray'>,
                        'Close': <class 'numpy.ndarray'>,
                        'Adj Close': <class 'numpy.ndarray'>,
                        'Volume': <class 'numpy.ndarray'>
                        },
        "Y_training_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        },
        "X_testing_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        },
        "Y_testing_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        }
    }''')
    print('The fitted scaler (None when scaling is disabled) is stored under the extra key "scaler".')

    return FinalResult
    

In [9]:
# Run the full pipeline with the default hyperparameters.
# Note: this rebinds StockData from the Load_Dataset result to the train/test dictionary.
StockData = Data_Processing_1()

Local Stock Data is enough for requirements, do not need to download
Loaded Done!
The result is a dictionary as below:

{
        "X_training_set": {
                        'Open': <class 'numpy.ndarray'>,
                        'High': <class 'numpy.ndarray'>,
                        'Low': <class 'numpy.ndarray'>,
                        'Close': <class 'numpy.ndarray'>,
                        'Adj Close': <class 'numpy.ndarray'>,
                        'Volume': <class 'numpy.ndarray'>
                        },
        "Y_training_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                        'Adj Close': ...,
                        'Volume': ...
                        },
        "X_testing_set": {
                        'Open': ...,
                        'High': ...,
                        'Low': ...,
                        'Close': ...,
                 