In [23]:
# imports

import json
import numpy as np
import pandas as pd

from tqdm import tqdm

In [24]:
# Preprocessing


class GeneralPPS:
    """Stateless preprocessing helpers for the price DataFrame.

    Every helper returns a new DataFrame and leaves its input untouched, so
    notebook cells that call them can be re-run safely.
    """

    @staticmethod
    def map_columns(df, column_dict):
        """Return ``df`` with every column label translated via ``column_dict``.

        Args:
            df: DataFrame whose column labels are all keys of ``column_dict``.
            column_dict: Mapping from current column label to new label.

        Returns:
            A new DataFrame with the translated labels; ``df`` keeps its
            original labels.

        Raises:
            KeyError: If any column of ``df`` has no entry in ``column_dict``.
        """
        missing = [col for col in df.columns if col not in column_dict]
        if missing:
            # Report every unmapped column at once instead of only the first.
            raise KeyError(f"no mapping in column_dict for columns: {missing}")
        new_labels = [column_dict[col] for col in df.columns]
        # set_axis returns a new object, so the caller's frame is not mutated.
        return df.set_axis(new_labels, axis=1)

    @staticmethod
    def drop_zero(df, columns=("volume", "open", "low", "high", "close")):
        """Drop every row that holds a 0 in any of ``columns``.

        Args:
            df: DataFrame containing all of ``columns``.
            columns: Column labels to check; defaults to the volume and
                open/low/high/close columns.

        Returns:
            The rows of ``df`` whose checked columns are all different from 0.
            Missing values compare as "not 0" and are therefore kept.
        """
        nonzero_everywhere = (df[list(columns)] != 0).all(axis=1)
        return df[nonzero_everywhere]

    class DROP_LACK_DATA:
        """Callable filter that removes tickers with comparatively few rows.

        A ticker is kept when its number of non-null ``date`` values is
        strictly greater than ``percentage`` times the largest such count of
        any ticker. Calling the instance returns the filtered DataFrame.
        """

        def __init__(self, df, percentage) -> None:
            """Store the frame (needs ``ticker`` and ``date`` columns) and the ratio."""
            self.df = df
            self.percentage = percentage

        def get_ticker_count_series(self):
            """Return the count of non-null ``date`` values per ticker."""
            # Select the column before counting so only "date" is aggregated.
            return self.df.groupby("ticker")["date"].count()

        def get_available_tickers(self, ticker_count_series):
            """Return the index of tickers whose count exceeds the threshold."""
            threshold = ticker_count_series.max() * self.percentage
            # Strict comparison: a count exactly equal to the threshold is dropped.
            return ticker_count_series[ticker_count_series > threshold].index

        def filter_available_tickers(self, available_tickers):
            """Return the rows of the stored frame whose ticker is in ``available_tickers``."""
            return self.df[self.df["ticker"].isin(available_tickers)]

        def __call__(self):
            """Run count -> threshold -> filter and return the filtered frame."""
            ticker_count_series = self.get_ticker_count_series()
            available_tickers = self.get_available_tickers(ticker_count_series)
            return self.filter_available_tickers(available_tickers)

In [25]:
def get_x_y(arraylist, CFG):
    """Cut a sequence into sliding (input, output) window pairs.

    Window ``k`` uses ``arraylist[k : k + input_window]`` as input and the
    ``output_window`` values that immediately follow as output.

    Args:
        arraylist: Sequence or array sliced along its first axis.
        CFG: Mapping with integer entries ``"input_window"`` and
            ``"output_window"``.

    Returns:
        Tuple ``(x_dataset, y_dataset, final_x)``:
        ``x_dataset`` has shape ``(n, input_window, ...)`` and ``y_dataset``
        has shape ``(n, output_window, ...)``, where
        ``n = max(len(arraylist) - input_window - output_window + 1, 0)``.
        ``final_x`` is ``arraylist[-input_window:]``, which is shorter than
        ``input_window`` when the sequence itself is shorter.
    """
    i_window = CFG["input_window"]
    o_window = CFG["output_window"]

    values = np.asarray(arraylist)
    n_samples = max(len(values) - i_window - o_window + 1, 0)

    if n_samples:
        x_dataset = np.stack(
            [values[start : start + i_window] for start in range(n_samples)]
        )
        y_dataset = np.stack(
            [
                values[start + i_window : start + i_window + o_window]
                for start in range(n_samples)
            ]
        )
    else:
        # Too short for a single window pair: keep the window axis so that
        # callers can still reduce along axis 1 without an axis error.
        trailing_shape = values.shape[1:]
        x_dataset = np.empty((0, i_window) + trailing_shape, dtype=values.dtype)
        y_dataset = np.empty((0, o_window) + trailing_shape, dtype=values.dtype)

    # Slice the caller's object so its type (list / ndarray) is preserved.
    final_x = arraylist[-i_window:]
    return x_dataset, y_dataset, final_x

In [26]:
# model dataset
def sort_dataset_by_date(dataset, column="date"):
    """Return the rows of ``dataset`` ordered ascending by ``column``.

    The input frame is not reordered; the result keeps the original index
    labels attached to their rows.
    """
    return dataset.sort_values(by=column)


def append_price_diff(dataset, open_col="open", close_col="close"):
    """Return ``dataset`` with a ``price_diff`` column added.

    ``price_diff`` is ``(open - close) / open`` per row, so it is positive
    when the close is below the open and negative when it is above.

    Args:
        dataset: DataFrame containing ``open_col`` and ``close_col``.
        open_col: Label of the opening-price column.
        close_col: Label of the closing-price column.

    Returns:
        A new DataFrame with the extra (or overwritten) ``price_diff``
        column; ``dataset`` itself is left unchanged.
    """
    relative_change = (dataset[open_col] - dataset[close_col]) / dataset[open_col]
    # assign() builds a new frame, so slices handed in by the caller are never
    # written to (no SettingWithCopyWarning, no hidden mutation across cells).
    return dataset.assign(price_diff=relative_change)


def get_array_list(dataset, column):
    """Return the ``.values`` of ``dataset[column]`` (the column's underlying array)."""
    return dataset[column].values

In [27]:
# models

# Similarity_model
def get_cosine_similarity(array_1, array_2):
    """Return the cosine similarity of two equally shaped vectors.

    Computed as ``dot(array_1, array_2) / (norm(array_1) * norm(array_2))``.

    Returns:
        The similarity, or ``0.0`` when either vector has zero norm. Without
        that guard the division is 0/0, which yields NaN plus a RuntimeWarning
        and silently poisons any ranking built on the score.
    """
    norm_product = np.linalg.norm(array_1) * np.linalg.norm(array_2)
    if norm_product == 0:
        return 0.0
    return np.dot(array_1, array_2) / norm_product


def get_similarity_df(x_dataset, y_dataset, final_x):
    """Score every historical window against ``final_x``.

    Args:
        x_dataset: Iterable of input windows.
        y_dataset: Iterable of the outcomes paired with ``x_dataset``.
        final_x: Window that each element of ``x_dataset`` is compared with.

    Returns:
        DataFrame with one row per ``(x, y)`` pair and the columns
        ``similarity_score`` (cosine similarity of ``x`` and ``final_x``) and
        ``actual_y`` (the paired outcome). Both columns exist even when the
        inputs are empty, so column lookups downstream do not fail.
    """
    similarity_results = [
        {
            "similarity_score": get_cosine_similarity(x_data, final_x),
            "actual_y": y_data,
        }
        for x_data, y_data in zip(x_dataset, y_dataset)
    ]
    # Pin the columns: a DataFrame built from an empty list has none otherwise.
    return pd.DataFrame(similarity_results, columns=["similarity_score", "actual_y"])


def get_y_pred(similarity_df, top_k=5):
    """Predict from the ``top_k`` most similar historical windows.

    Args:
        similarity_df: DataFrame with ``similarity_score`` and ``actual_y``
            columns.
        top_k: Number of highest-scoring rows to use (default 5).

    Returns:
        The mean of ``similarity_score * actual_y`` over the selected rows.
        Note that this is a plain mean of the products; it is not divided by
        the sum of the scores.
    """
    nearest = similarity_df.nlargest(top_k, "similarity_score")
    weighted_outcomes = nearest["similarity_score"] * nearest["actual_y"]
    return weighted_outcomes.mean()

In [28]:
# ETL

# Load the raw training table and the submission template from disk.
raw_train = pd.read_csv("./data/raw_data/train.csv")
raw_submission = pd.read_csv("./data/raw_data/sample_submission.csv")

# Mapping from the raw CSV column labels to the labels used by the rest of the
# notebook. JSON text is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale encoding.
with open("./data/assets/column_dict.json", "r", encoding="utf-8") as f:
    column_dict = json.load(f)

In [29]:
# Work on independent (deep) copies so the frames loaded from disk stay
# pristine and later cells can be re-run without re-reading the CSV files.
datasets = raw_train.copy(deep=True)
submission = raw_submission.copy(deep=True)

In [30]:
general_pps = GeneralPPS()

# A ticker is kept only if it has more than this fraction of the row count of
# the best-covered ticker.
MIN_HISTORY_RATIO = 0.8

# Rebuild the cleaned frame from ``raw_train`` rather than from the current
# ``datasets``: feeding an already-renamed ``datasets`` back into
# ``map_columns`` raises KeyError, which made this cell fail on a second run.
datasets = general_pps.map_columns(raw_train.copy(), column_dict)
datasets = general_pps.drop_zero(datasets)
datasets = general_pps.DROP_LACK_DATA(datasets, MIN_HISTORY_RATIO)()

In [31]:
# Window lengths, in rows, for the model input and for the summed target.
CFG = {
    "input_window": 15,
    "output_window": 15,
}
tickers = sorted(set(datasets["ticker"]))

# Group once up front; a boolean mask over the whole frame per ticker repeats
# the same full scan for every ticker.
datasets_by_ticker = datasets.groupby("ticker")

datasets_list = list()

for ticker in tqdm(tickers):
    _dataset = datasets_by_ticker.get_group(ticker)

    _dataset = sort_dataset_by_date(_dataset, "date")
    _dataset = append_price_diff(_dataset, "open", "close")

    _price_diff_arraylist = get_array_list(_dataset, "price_diff")

    x_dataset, y_dataset, final_x = get_x_y(_price_diff_arraylist, CFG)
    if len(x_dataset) == 0:
        # Not enough rows for a single (input, output) window pair.
        continue

    # Target = sum of price_diff over the output window.
    y_dataset = y_dataset.sum(axis=1)

    # One row per window: "x" holds the input window array, "y" the target.
    # Building from a dict keeps "y" numeric instead of object dtype.
    _ticker_df = pd.DataFrame({"x": list(x_dataset), "y": y_dataset})

    datasets_list.append(_ticker_df)

# ignore_index gives the pooled frame unique row labels; without it every
# ticker contributes labels 0..n-1 and the index is full of duplicates.
datasets_df = pd.concat(datasets_list, ignore_index=True)

100%|██████████| 1947/1947 [01:16<00:00, 25.32it/s]


In [32]:
# Sampling / neighbour settings for the similarity model.
SAMPLE_SEED = 42  # fixed seed so a full re-run reproduces the predictions
SAMPLE_SIZE = 10000  # candidate windows drawn from the pool per ticker
TOP_K = 5  # most similar candidates used for the prediction

# One generator for the whole loop: each ticker gets a different sample, but
# the sequence of samples is identical on every run of this cell.
sample_rng = np.random.default_rng(SAMPLE_SEED)

ticker_pred_dict = dict()

# Group once instead of masking the whole frame for every ticker.
datasets_by_ticker = datasets.groupby("ticker")

for ticker in tqdm(tickers):
    _dataset = datasets_by_ticker.get_group(ticker)
    _dataset = sort_dataset_by_date(_dataset, "date")
    _dataset = append_price_diff(_dataset, "open", "close")

    _price_diff_arraylist = get_array_list(_dataset, "price_diff")
    # Most recent input window of this ticker.
    final_x = _price_diff_arraylist[-CFG["input_window"] :]

    # Cap the sample size: sampling more rows than exist (without replacement)
    # raises ValueError.
    _datasets_df = datasets_df.sample(
        n=min(SAMPLE_SIZE, len(datasets_df)), random_state=sample_rng
    )

    # assign() returns a new frame instead of writing into the sampled one.
    _datasets_df = _datasets_df.assign(
        score=_datasets_df["x"].apply(lambda x: get_cosine_similarity(x, final_x))
    )
    datasets_main_df = _datasets_df.nlargest(TOP_K, "score")
    # Mean of score * target over the TOP_K most similar windows.
    pred_y = (datasets_main_df["y"] * datasets_main_df["score"]).mean()
    ticker_pred_dict[ticker] = pred_y

100%|██████████| 1947/1947 [02:40<00:00, 12.13it/s]
