# Importing Libraries:

### pandas: used for data manipulation and analysis
### os: used to interact with the operating system
### numpy: used for numerical computing with arrays and matrices
### unittest: used for writing and running unit tests
### train_test_split: used to split the data into training and testing sets
### mean_squared_error: used to calculate the mean squared error metric
### joblib: used for saving and loading Python objects (such as trained models) to and from disk
### xgboost (imported as xgb): gradient-boosted decision tree library used to train the regression model
### pickle: used for object serialization and deserialization
### logging: used for emitting log messages from your code

In [3]:
import pandas as pd
import os
import numpy as np
import unittest
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib
import xgboost as xgb
import pickle
import logging

  from pandas import MultiIndex, Int64Index


# Converting all csv files into a dataframe

### Loading ETF and stock datasets
### Datasets available at: https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset
### Converting all csv files of ETF and stocks into a dataframe and merging that with symbols_valid_meta.csv to get security_name column and saving the dataframe in combined_df


In [22]:
# Build one DataFrame holding every ETF and stock price row, enriched with the
# security name, and persist it as combined.parquet.

# Folders containing one CSV file per symbol (file name == ticker symbol)
folder_paths = ['etfs/','stocks/']
updated_dfs = []

# The symbol -> security name lookup is identical for every folder, so it is
# read once here instead of once per loop iteration.
comparison_file_path = 'symbols_valid_meta.csv'
comparison_df = pd.read_csv(comparison_file_path)

# Only the join key and the security name are needed from the metadata file
comparison_subset = comparison_df[['Symbol', 'Security Name']]

# Looping through the folders of etfs and stocks
for folder_path in folder_paths:
    # Per-folder accumulators: one DataFrame per CSV file, plus the file names read
    dfs = []
    file_names = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined DataFrame reproducible across machines and runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue
        # Full path of the CSV file for this symbol
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        # The ticker symbol is the file name without its extension. splitext
        # removes only the trailing ".csv", unlike str.replace which would
        # also strip a ".csv" appearing in the middle of the name.
        df['Symbol'] = os.path.splitext(filename)[0]
        dfs.append(df)
        # Names of the files read from this folder (not used further below)
        file_names.append(filename)

    # pd.concat([]) raises an opaque "No objects to concatenate"; fail with a
    # message that names the offending folder instead.
    if not dfs:
        raise ValueError(f"No CSV files found in folder: {folder_path}")

    # Combine all per-symbol DataFrames of this folder into a single DataFrame
    df = pd.concat(dfs, ignore_index=True)

    # Left join keeps every price row even when the symbol is missing from
    # the metadata file ('Security Name' is NaN for those rows).
    df = pd.merge(df, comparison_subset, on='Symbol', how='left')
    updated_dfs.append(df)


# updated_dfs follows the order of folder_paths: ETFs first, then stocks
etfs_df = updated_dfs[0]
stocks_df = updated_dfs[1]


combined_df = pd.concat([etfs_df, stocks_df], ignore_index=True)

# Defining the columns so that they will be in the order as defined in the problem.
columns = ['Symbol','Security Name', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

combined_df = combined_df[columns]
combined_df.to_parquet("combined.parquet")


In [23]:
combined_df

Unnamed: 0,Symbol,Security Name,Date,Open,High,Low,Close,Adj Close,Volume
0,AAAU,Perth Mint Physical Gold ETF,2018-08-15,11.84,11.84,11.74,11.74,11.74,27300.0
1,AAAU,Perth Mint Physical Gold ETF,2018-08-16,11.78,11.80,11.74,11.74,11.74,428400.0
2,AAAU,Perth Mint Physical Gold ETF,2018-08-17,11.80,11.82,11.77,11.82,11.82,52400.0
3,AAAU,Perth Mint Physical Gold ETF,2018-08-20,11.88,11.91,11.85,11.90,11.90,28700.0
4,AAAU,Perth Mint Physical Gold ETF,2018-08-21,11.92,11.95,11.89,11.93,11.93,30600.0
...,...,...,...,...,...,...,...,...,...
28148363,ZYXI,"Zynex, Inc. - Common Stock",2020-03-26,10.23,11.43,10.23,11.10,11.10,189500.0
28148364,ZYXI,"Zynex, Inc. - Common Stock",2020-03-27,10.70,10.98,10.06,10.30,10.30,145000.0
28148365,ZYXI,"Zynex, Inc. - Common Stock",2020-03-30,10.16,11.06,10.16,10.80,10.80,162300.0
28148366,ZYXI,"Zynex, Inc. - Common Stock",2020-03-31,10.68,11.14,10.59,11.07,11.07,280400.0


# Feature Engineering

### Calculate the 30-day moving average of the trading volume (Volume) for each stock and ETF, and store it in a new column, vol_moving_avg

### Calculate the 30-day rolling median of Adj Close for each stock and ETF, and store it in a new column, adj_close_rolling_med

In [24]:
# 30-row rolling mean of Volume, computed independently within each Symbol.
# With the default min_periods, the first 29 rows of every symbol are NaN.
grouped_df = combined_df.groupby('Symbol')
moving_avg = (
    grouped_df['Volume']
    .rolling(window=30)
    .mean()
    # groupby().rolling() prepends Symbol as an index level; dropping it
    # leaves the original row index so the result lines up with combined_df.
    .reset_index(level=0, drop=True)
)

# Index-aligned assignment into the new vol_moving_avg column
combined_df['vol_moving_avg'] = moving_avg


In [25]:
# 30-row rolling median of 'Adj Close', computed independently within each Symbol.
# With the default min_periods, the first 29 rows of every symbol are NaN.
grouped_df = combined_df.groupby('Symbol')
rolling_median = (
    grouped_df['Adj Close']
    .rolling(window=30)
    .median()
    # groupby().rolling() prepends Symbol as an index level; dropping it
    # leaves the original row index so the result lines up with combined_df.
    .reset_index(level=0, drop=True)
)

# Index-aligned assignment into the new adj_close_rolling_med column
combined_df['adj_close_rolling_med'] = rolling_median


In [5]:
# Write combined_df to final_df.parquet
combined_df.to_parquet("final_df.parquet")

In [4]:
# Load combined_df from final_df.parquet (presumably so the notebook can be
# resumed from here without recomputing the earlier cells)
combined_df = pd.read_parquet("final_df.parquet")

# Unittest

In [18]:
# Unittest for median:


class RollingMedianTestCase(unittest.TestCase):
    """Check that the rolling median of 'Adj Close' is computed per symbol."""

    def setUp(self):
        # Two symbols with three rows each, so a rolling window that wrongly
        # spans the AAPL/GOOG boundary produces a detectably different value.
        data = {
            'Symbol': ['AAPL', 'AAPL', 'AAPL', 'GOOG', 'GOOG', 'GOOG'],
            'Adj Close': [10, 20, 30, 40, 50, 60]
        }
        self.df = pd.DataFrame(data)

    def test_rolling_median(self):
        # With window=2 and min_periods=1 the first row of each symbol is its
        # own median. The first GOOG row must therefore be 40.0; a value of
        # 35.0 would mean the window leaked across from the last AAPL row.
        expected_result = [10.0, 15.0, 25.0, 40.0, 45.0, 55.0]
        # Group by symbol and roll *within* each group, mirroring the
        # feature-engineering code (only the window size differs).
        grouped_df = self.df.groupby('Symbol')
        rolling_median = grouped_df['Adj Close'].rolling(window=2, min_periods=1).median()
        # Drop the Symbol index level added by groupby().rolling() so the
        # result aligns with the original row index.
        rolling_median = rolling_median.reset_index(level=0, drop=True)
        self.df['adj_close_rolling_med'] = rolling_median

        # Compare the calculated rolling median with the expected result
        result = self.df['adj_close_rolling_med'].tolist()
        self.assertEqual(result, expected_result)

if __name__ == '__main__':
    # argv=[''] and exit=False let the tests run inside a notebook kernel
    # without unittest parsing the kernel's argv or calling sys.exit().
    unittest.main(argv=[''], exit=False)


.
----------------------------------------------------------------------
Ran 1 test in 0.002s

OK


# Training

### Model trained: GBDT (Gradient-Boosted Decision Trees)

In [6]:
# Train an XGBoost regressor that predicts Volume from the two engineered
# features, log train/test MSE to training.log and pickle the fitted model.

# --- Preprocessing ---
# Parse the Date strings into timestamps and use them as the row index
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df.set_index('Date', inplace=True)

# Remove every row that has a missing value in any column
combined_df.dropna(inplace=True)

# --- Features / target ---
feature_columns = ['vol_moving_avg', 'adj_close_rolling_med']
features = combined_df[feature_columns]
target = combined_df['Volume']

# 80/20 random split with a fixed seed; the model is fed plain NumPy arrays
X_train, X_test, y_train, y_test = train_test_split(
    features.values,
    target.values,
    test_size=0.2,
    random_state=42,
)

# --- Logging ---
logging.basicConfig(
    filename='training.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Fit ---
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# --- Metrics ---
# Log the MSE on the training split first, then on the held-out test split.
# After the loop, y_pred and mse hold the test-split values.
metric_runs = (
    ('Training Metrics:', X_train, y_train),
    ('Evaluation Metrics:', X_test, y_test),
)
for header, inputs, actual in metric_runs:
    logging.info(header)
    y_pred = model.predict(inputs)
    mse = mean_squared_error(actual, y_pred)
    logging.info(f'Mean Squared Error (MSE): {mse}')

# --- Persist ---
# Serialize the fitted model to disk in pickle format
model_filename = 'xgb_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

# Prediction

In [20]:
# Predict Volume for a single hand-built observation.

# Column order the model expects. The model was fitted on a bare NumPy array
# (features.values), so inputs are matched by position, not by column name;
# listing the order explicitly keeps the selection below meaningful even if
# the input frame is built with its columns in a different order.
expected_features = ['vol_moving_avg', 'adj_close_rolling_med']

# Creating the dataframe with vol_moving_avg and adj_close_rolling_med as columns
input_data = pd.DataFrame({'vol_moving_avg': [421296.666667], 'adj_close_rolling_med': [11.095]})

# Reorder the columns to the training order and convert to a 2-D array
input_df = input_data[expected_features]
input_array = input_df.values

model.predict(input_array)

array([420898.4], dtype=float32)