In [7]:
# imports
import numpy as np
import pandas as pd
from tqdm import tqdm

In [8]:
# Load the combined price table; later cells read its "ticker", "open" and "close" columns.
datasets = pd.read_csv("./data/pps_data/datasets.csv")

In [9]:
# PPS
def append_price_diff(dataset):
    """Add a ``price_diff`` column holding the relative open-to-close change.

    For every row ``price_diff = (open - close) / open``; with a positive open
    price the value is positive when the close is below the open and negative
    when it is above.

    The column is written onto ``dataset`` itself (the frame is modified in
    place) and that same object is returned.

    Args:
        dataset: DataFrame with ``open`` and ``close`` columns.

    Returns:
        The input ``dataset``, now carrying the ``price_diff`` column.
    """
    open_price = dataset["open"]
    relative_change = (open_price - dataset["close"]) / open_price
    dataset.loc[:, ["price_diff"]] = relative_change
    return dataset

# Add the price_diff column to the full table (append_price_diff returns the frame it was given).
datasets = append_price_diff(datasets)

In [10]:
# Model
def get_cosine_similarity(array_1, array_2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    The result is ``dot(a, b) / (||a|| * ||b||)``, which lies in ``[-1, 1]``
    for finite non-zero inputs.

    Args:
        array_1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        array_2: Second vector, same length as ``array_1``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(array_1) * np.linalg.norm(array_2)
    if norm_product == 0:
        # A zero vector has no direction; dividing would compute 0/0, yielding
        # NaN and a RuntimeWarning. Treat it as "no similarity" instead.
        return 0.0
    return np.dot(array_1, array_2) / norm_product

In [11]:
# Distinct ticker values in sorted order; the all-ticker loop further down iterates over this list.
tickers = sorted(set(datasets["ticker"]))
# Use the first ticker as a worked example for the next few cells.
ticker = tickers[0]

# Rows belonging to the example ticker only.
dataset = datasets[datasets["ticker"] == ticker]

In [12]:
# price_diff values of the example ticker as a NumPy array, in the frame's row order
# (presumably chronological — TODO confirm against how datasets.csv was built).
_arraylist = dataset["price_diff"].values

In [13]:
# Window sizes for the single-ticker walkthrough below.
# input_window: number of consecutive price_diff values that form one pattern.
# output_window: number of values right after a pattern that are summed into its target.
# NOTE: the all-ticker cell further down reassigns input_window to 25.
input_window = 15
output_window = 15

In [14]:
# Slide over the series: each start position yields a pattern of `input_window`
# values and, as its target, the sum of the `output_window` values that follow.
window_starts = range(len(_arraylist) - input_window - output_window + 1)

x_dataset = np.array(
    [_arraylist[start : start + input_window] for start in window_starts]
)
y_dataset = np.array(
    [
        _arraylist[start + input_window : start + input_window + output_window].sum()
        for start in window_starts
    ]
)

# The most recent `input_window` values: the pattern whose continuation we want to predict.
final_x = _arraylist[-input_window:]

In [15]:
# Score every historical pattern against the latest one and keep its target alongside.
result_list = [
    {"score": get_cosine_similarity(past_window, final_x), "y": target}
    for past_window, target in zip(x_dataset, y_dataset)
]

In [16]:
# Keep the five historical patterns most similar to the latest one.
result_df = pd.DataFrame(result_list)
result_main_df = result_df.nlargest(n=5, columns="score")

# Weight each neighbour's target by exp(similarity) and add the weighted targets up.
# The weights are not normalised, so this is a weighted sum rather than a weighted mean.
pred_y = (
    result_main_df["score"]
    .apply(lambda score: np.exp(score))
    .mul(result_main_df["y"])
    .sum()
)

In [17]:
def predict_ticker_score(price_diffs, input_window, output_window, top_k=5):
    """Score one price_diff series from its historical windows most similar to the latest one.

    Every run of ``input_window`` consecutive values that is followed by at
    least ``output_window`` more values is compared (cosine similarity) with
    the last ``input_window`` values of the series. The ``top_k`` most similar
    runs are kept, and the sums of the ``output_window`` values following each
    of them are combined as ``sum(exp(similarity) * following_sum)``. The
    weights are not normalised, so the result is a weighted sum, not a mean.

    Args:
        price_diffs: 1-D NumPy array of price_diff values for a single ticker
            (assumed to be in chronological order — TODO confirm).
        input_window: Length of the pattern that is matched.
        output_window: Number of values after a pattern that are summed.
        top_k: How many of the most similar patterns contribute to the score.

    Returns:
        The combined score, or ``None`` when the series holds fewer than
        ``input_window + output_window`` values and no pattern can be built.
    """
    n_windows = len(price_diffs) - input_window - output_window + 1
    if n_windows <= 0:
        # Without this guard the DataFrame below would be empty, have no
        # "score" column, and nlargest() would raise KeyError.
        return None

    final_x = price_diffs[-input_window:]
    result_list = []
    for start in range(n_windows):
        split = start + input_window
        result_list.append(
            {
                "score": get_cosine_similarity(price_diffs[start:split], final_x),
                "y": price_diffs[split : split + output_window].sum(),
            }
        )

    result_main_df = pd.DataFrame(result_list).nlargest(top_k, "score")
    return (np.exp(result_main_df["score"]) * result_main_df["y"]).sum()


# Window sizes for the full run; input_window replaces the 15 used in the
# single-ticker walkthrough above.
ticker_score_dict = dict()
input_window = 25
output_window = 15

for ticker in tqdm(tickers):
    dataset = datasets[datasets["ticker"] == ticker]
    _arraylist = dataset["price_diff"].values

    pred_y = predict_ticker_score(_arraylist, input_window, output_window)
    if pred_y is None:
        # Too little history for this ticker: leave it out of the dict, so that
        # make_submission_df's fillna(0) gives it the neutral score 0.
        continue

    ticker_score_dict[ticker] = pred_y

  0%|          | 0/1947 [00:00<?, ?it/s]

100%|██████████| 1947/1947 [00:05<00:00, 333.02it/s]


In [18]:
# Submission template; make_submission_df reads its "종목코드" column to decide which rows to rank.
sample_submission = pd.read_csv("./data/raw_data/sample_submission.csv")

In [19]:
def make_submission_df(sample_submission, ticker_score_dict):
    """Build the submission table by ranking tickers on their predicted score.

    Each value in the ``종목코드`` column is looked up in ``ticker_score_dict``;
    codes missing from the dict get a score of 0. Rank 1 goes to the highest
    score, and ties are broken by row order (``method="first"``).

    ``sample_submission`` is left untouched: the result is a new DataFrame,
    so re-running the cell never sees leftover ``score`` / ``순위`` columns.

    Args:
        sample_submission: DataFrame with a ``종목코드`` column.
        ticker_score_dict: Mapping from ticker code to predicted score.

    Returns:
        New DataFrame with the columns ``종목코드`` and ``순위`` (integer rank),
        sharing the index of ``sample_submission``.
    """
    score = sample_submission["종목코드"].map(ticker_score_dict).fillna(0)
    rank = score.rank(method="first", ascending=False).astype(int)

    # .loc with a column list already yields a new frame; copy() makes that
    # explicit so the assignment below can never write back into the input.
    submission_result = sample_submission.loc[:, ["종목코드"]].copy()
    submission_result["순위"] = rank
    return submission_result


# Rank every ticker in the template by its predicted score (highest score gets rank 1).
submission_result = make_submission_df(sample_submission, ticker_score_dict)

In [21]:
# Write the ranking to disk; the target directory must already exist.
# NOTE(review): to_csv defaults to index=True, so the row index is written as an extra
# unnamed first column — confirm the submission format expects that, else pass index=False.
submission_result.to_csv("./data/result/similarity_all_25_15.csv")