In [27]:
# Naming convention
""" 
datasets_df
    - 전체 주식 데이터 <=> train.csv
dataset_df 
    - 특정 ticker에 대한 데이터 
        <=> datasets_df[datasets_df['ticker_code'] == ticker_code]

"""
""""""

''

In [48]:
# External libs
import numpy as np
import pandas as pd
from tqdm import tqdm

In [49]:
# Load the raw input files: the sample submission and the training data.
submission_raw = pd.read_csv("./data/raw_data/sample_submission.csv")
datasets_df_raw = pd.read_csv("./data/raw_data/train.csv")

# Maps the Korean column headers to the English column names that the
# preprocessing and modelling cells below refer to.
column_dict = {
    "일자": "date",
    "종목코드": "ticker_code",
    "종목명": "ticker_name",
    "거래량": "volume",
    "시가": "open",
    "고가": "high",
    "저가": "low",
    "종가": "close",
}

In [30]:
# Work on copies so the frames loaded from disk can be reused without re-reading the CSVs.
datasets_df = datasets_df_raw.copy()
# NOTE(review): submission_df is not referenced again in the visible cells.
submission_df = submission_raw.copy()

In [31]:
# Preprocessing
## 1. Column 이름 매핑
def map_column_names(datasets_df, column_dict):
    """Return a copy of ``datasets_df`` with its columns renamed via ``column_dict``.

    Args:
        datasets_df: DataFrame whose column labels should be translated.
        column_dict: Mapping from existing column label to new column label.

    Returns:
        A new DataFrame with the renamed columns. The input frame is left
        untouched, and labels that are not keys of ``column_dict`` are kept
        as they are, so calling this again on an already-renamed frame is a
        no-op instead of raising ``KeyError``.
    """
    return datasets_df.rename(columns=column_dict)


## 2. 특이값 제거
### ["volume", "open", "low", "high", "close"] 이 0값이 row 제거
def drop_zero(datasets_df):
    """Drop every row that has a 0 in any of the volume/price columns.

    Args:
        datasets_df: DataFrame containing the columns "volume", "open",
            "low", "high" and "close".

    Returns:
        The rows of ``datasets_df`` in which none of those columns equals 0,
        with their original index labels.
    """
    checked_columns = ["volume", "open", "low", "high", "close"]
    # A row is kept only when all checked columns are non-zero.
    keep_row = (datasets_df[checked_columns] != 0).all(axis=1)
    return datasets_df[keep_row]


## 3. 특정 한계 이하로 row가 적은 ticker 제거
class DROP_LACK_DATA:
    """Callable filter that removes tickers with comparatively few rows.

    A ticker is kept when its row count is strictly greater than
    ``percentage`` times the largest per-ticker row count in ``df``.
    """

    def __init__(self, df, percentage) -> None:
        """Store the frame to filter and the relative row-count threshold."""
        self.df = df
        self.percentage = percentage

    def get_ticker_count_series(self):
        """Return the number of non-null "date" values per "ticker_code"."""
        return self.df.groupby("ticker_code")["date"].count()

    def get_available_tickers(self, ticker_count_series):
        """Return the index of tickers whose count exceeds the threshold."""
        min_rows = ticker_count_series.max() * self.percentage
        has_enough_rows = ticker_count_series > min_rows
        return ticker_count_series[has_enough_rows].index

    def filter_available_tickers(self, available_tickers):
        """Return the rows of ``self.df`` whose ticker is in ``available_tickers``."""
        keep_row = self.df["ticker_code"].isin(available_tickers)
        return self.df[keep_row]

    def __call__(self):
        """Run count -> threshold -> filter and return the filtered frame."""
        counts = self.get_ticker_count_series()
        kept_tickers = self.get_available_tickers(counts)
        return self.filter_available_tickers(kept_tickers)
    

# Application: run the three preprocessing steps in order, rebinding datasets_df after each one.
datasets_df = map_column_names(datasets_df, column_dict)
datasets_df = drop_zero(datasets_df)
# 0.8 is the `percentage` argument: keep only tickers whose row count is
# greater than 80% of the largest per-ticker row count.
datasets_df = DROP_LACK_DATA(datasets_df, 0.8)()

In [32]:
# Utils
def get_ticker_codes(datasets_df):
    """Return the distinct values of the "ticker_code" column as an ascending list."""
    unique_codes = list(set(datasets_df["ticker_code"]))
    unique_codes.sort()
    return unique_codes


def sort_dataset_df(dataset_df, column):
    """Return a copy of ``dataset_df`` sorted in ascending order of ``column``.

    The input frame is not reordered; row index labels travel with their rows.
    """
    return dataset_df.sort_values(by=column)


# Sorted list of the distinct ticker codes present in datasets_df; iterated by the prediction loops below.
ticker_codes = get_ticker_codes(datasets_df)

In [33]:
# tmp: single-ticker walkthrough using the first ticker code in sorted order.
ticker_code = ticker_codes[0]

# Rows belonging to that ticker only, ordered by the "date" column.
dataset_df = datasets_df[datasets_df["ticker_code"] == ticker_code]
dataset_df = sort_dataset_df(dataset_df, "date")

In [34]:
dataset_df

Unnamed: 0,date,ticker_code,ticker_name,volume,open,high,low,close
502,20210601,A000020,동화약품,114966,14700,14700,14450,14600
2502,20210602,A000020,동화약품,109559,14700,14700,14450,14500
4502,20210603,A000020,동화약품,96158,14550,14650,14450,14600
6502,20210604,A000020,동화약품,133900,14600,14800,14550,14700
8502,20210607,A000020,동화약품,511140,14800,15550,14750,15150
...,...,...,...,...,...,...,...,...
978502,20230523,A000020,동화약품,641524,9160,9900,9160,9770
980502,20230524,A000020,동화약품,205243,9770,9820,9550,9740
982502,20230525,A000020,동화약품,398326,9660,10180,9660,10040
984502,20230526,A000020,동화약품,196257,10050,10150,9850,9850


In [35]:
# Model Preprocessing
def append_price_diff(dataset, open_col, close_col):
    """Return ``dataset`` with an added "price_diff" column.

    ``price_diff`` is ``(open - close) / open`` computed row by row.

    NOTE(review): with this formula the value is positive when the close is
    below the open, i.e. the sign is the opposite of a conventional return.
    Confirm that this is the intended direction for the score that is later
    ranked in descending order.

    Args:
        dataset: DataFrame holding the two price columns.
        open_col: Name of the opening-price column.
        close_col: Name of the closing-price column.

    Returns:
        A new DataFrame with the extra "price_diff" column. The input frame
        is not modified, so passing a filtered slice of a larger frame does
        not write into (or warn about) that slice.
    """
    price_diff = (dataset[open_col] - dataset[close_col]) / dataset[open_col]
    return dataset.assign(price_diff=price_diff)

def get_array_list(dataset, column):
    """Return the contents of ``dataset[column]`` through the Series ``.values`` attribute."""
    selected = dataset[column]
    return selected.values

In [36]:
dataset_df = append_price_diff(dataset_df, "open", "close")
price_diff_arraylist = get_array_list(dataset_df, "price_diff")

In [37]:
# Format dataset


def get_x_y_dataset(arraylist, CFG):
    """Slice a sequence into sliding (input window, output window) pairs.

    For every start position ``s`` the input is ``arraylist[s : s + input_window]``
    and the target is the ``output_window`` values immediately after it.

    Args:
        arraylist: Sliceable sequence of values (e.g. a 1-D numpy array).
        CFG: Dict with integer entries "input_window" and "output_window".

    Returns:
        Tuple ``(x_dataset, y_dataset)`` of numpy arrays with one row per
        start position. When ``arraylist`` is shorter than
        ``input_window + output_window`` the arrays are empty but still
        two-dimensional, with shapes ``(0, input_window)`` and
        ``(0, output_window)``, so that ``y_dataset.sum(axis=1)`` keeps
        working instead of raising on a 1-D empty array.
    """
    i_window = CFG["input_window"]
    o_window = CFG["output_window"]

    n_samples = len(arraylist) - i_window - o_window + 1
    if n_samples <= 0:
        # Too short for a single pair: return well-shaped empty arrays.
        return np.empty((0, i_window)), np.empty((0, o_window))

    x_dataset = [arraylist[start : start + i_window] for start in range(n_samples)]
    y_dataset = [
        arraylist[start + i_window : start + i_window + o_window]
        for start in range(n_samples)
    ]
    return np.array(x_dataset), np.array(y_dataset)

In [38]:
# Window sizes for the sliding-window dataset: 8 input values followed by 15 target values.
# Note that CFG is redefined with a different input_window in a later cell.
CFG = {
    "input_window": 8,
    "output_window": 15,
}

x_dataset, y_dataset = get_x_y_dataset(price_diff_arraylist, CFG)
# Collapse every target window into one number by summing its values.
y_dataset = y_dataset.sum(axis=1)
# The last input_window values of the series.
final_x = price_diff_arraylist[-CFG["input_window"] :]

In [39]:
# Model
## Get similarity_score
def get_cosine_similarity(array_1, array_2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        array_1: First vector.
        array_2: Second vector, same length as ``array_1``.

    Returns:
        ``dot(array_1, array_2) / (||array_1|| * ||array_2||)``. When either
        vector has zero norm the cosine is undefined; 0.0 is returned in that
        case instead of dividing by zero, which would yield NaN together
        with a RuntimeWarning.
    """
    norm_product = np.linalg.norm(array_1) * np.linalg.norm(array_2)
    if norm_product == 0:
        return 0.0
    return np.dot(array_1, array_2) / norm_product


def get_similarity_df(x_dataset, y_dataset, final_x):
    """Score every input window against ``final_x`` and tabulate the results.

    Args:
        x_dataset: Iterable of input windows.
        y_dataset: Iterable of target values, paired positionally with ``x_dataset``.
        final_x: The window every entry of ``x_dataset`` is compared with.

    Returns:
        DataFrame with one row per (window, target) pair and the columns
        "similarity_score" (cosine similarity between the window and
        ``final_x``) and "actual_y" (the paired target).
    """
    records = [
        {
            "similarity_score": get_cosine_similarity(window, final_x),
            "actual_y": target,
        }
        for window, target in zip(x_dataset, y_dataset)
    ]
    return pd.DataFrame(records)


def get_similarity_main_df(x_dataset, y_dataset, final_x, n):
    """Return the ``n`` windows that are most similar to ``final_x``.

    The scoring itself is delegated to ``get_similarity_df`` so that both
    functions share a single implementation instead of two copies of the
    same loop.

    Args:
        x_dataset: Iterable of input windows.
        y_dataset: Iterable of target values, paired positionally with ``x_dataset``.
        final_x: The window every entry of ``x_dataset`` is compared with.
        n: Number of top-scoring rows to keep.

    Returns:
        DataFrame with the columns "similarity_score" and "actual_y",
        restricted to the ``n`` rows with the largest "similarity_score".
    """
    similarity_df = get_similarity_df(x_dataset, y_dataset, final_x)
    return similarity_df.nlargest(n, "similarity_score")


def get_pred_y(similarity_df):
    """Return the mean of ``similarity_score * actual_y`` over all rows.

    Args:
        similarity_df: DataFrame with the columns "similarity_score" and "actual_y".

    Returns:
        The average of the row-wise products. The sum of products is divided
        by the number of rows, not by the sum of the scores.
    """
    weighted_targets = similarity_df["similarity_score"].mul(similarity_df["actual_y"])
    return weighted_targets.mean()

In [40]:
# Window sizes for this run: 20 input values followed by 15 target values.
CFG = {
    "input_window": 20,
    "output_window": 15,
}


x_dataset, y_dataset = get_x_y_dataset(price_diff_arraylist, CFG)
# Collapse every target window into one number by summing its values.
y_dataset = y_dataset.sum(axis=1)
# The last input_window values of the series, used as the query window.
final_x = price_diff_arraylist[-CFG["input_window"] :]

# Predict from the 5 historical windows most similar to final_x.
similarity_main_df = get_similarity_main_df(x_dataset, y_dataset, final_x, 5)
pred_y = get_pred_y(similarity_main_df)

# Alternative kept for reference: predict from every window instead of the top 5.
# similarity_df = get_similarity_df(x_dataset, y_dataset, final_x)
# pred_y_v1 = get_pred_y(similarity_df)

In [41]:
# Window sizes for this run: 20 input values followed by 15 target values.
CFG = {
    "input_window": 20,
    "output_window": 15,
}

# Group the frame once up front. Filtering the full frame with a boolean mask
# inside the loop would re-scan every row for each ticker.
rows_by_ticker = datasets_df.groupby("ticker_code")

# ticker_code -> predicted score from the 5 most similar historical windows.
ticker_pred_dict = dict()
for ticker_code in tqdm(ticker_codes):
    # Same rows, in the same order, as datasets_df[datasets_df["ticker_code"] == ticker_code].
    dataset_df = rows_by_ticker.get_group(ticker_code)
    dataset_df = sort_dataset_df(dataset_df, "date")
    dataset_df = append_price_diff(dataset_df, "open", "close")

    price_diff_arraylist = get_array_list(dataset_df, "price_diff")

    x_dataset, y_dataset = get_x_y_dataset(price_diff_arraylist, CFG)
    # Collapse every target window into one number by summing its values.
    y_dataset = y_dataset.sum(axis=1)

    # The last input_window values of this ticker's series, used as the query window.
    final_x = price_diff_arraylist[-CFG["input_window"] :]

    similarity_main_df = get_similarity_main_df(x_dataset, y_dataset, final_x, 5)
    pred_y = get_pred_y(similarity_main_df)
    ticker_pred_dict[ticker_code] = pred_y

100%|██████████| 1947/1947 [01:11<00:00, 27.24it/s]


In [42]:
def make_submission_df(submission, ticker_score_dict):
    """Build the ranking table for a submission from per-ticker scores.

    Args:
        submission: DataFrame with a "종목코드" (ticker code) column.
        ticker_score_dict: Mapping from ticker code to score.

    Returns:
        A new DataFrame with the columns "종목코드" and "순위" (rank). Tickers
        missing from ``ticker_score_dict`` get a score of 0. Rank 1 goes to
        the highest score, and ties are broken by row order
        (``method="first"``). ``submission`` itself is not modified: no
        "score" or "순위" column is written into the caller's frame.
    """
    scores = submission["종목코드"].map(ticker_score_dict).fillna(0)
    ranks = scores.rank(method="first", ascending=False).astype(int)
    # assign() returns a new frame, so the caller's DataFrame stays as it was.
    ranked = submission.assign(**{"score": scores, "순위": ranks})
    return ranked.loc[:, ["종목코드", "순위"]]


In [43]:
# Rank on the copy, not on submission_raw, so the frame loaded from disk is
# never handed to make_submission_df and stays as it was read.
submission = submission_raw.copy()
submission_result = make_submission_df(submission, ticker_pred_dict)

In [45]:
# Write the ranking produced from the top-5 similarity predictions.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first column by default;
# confirm the expected submission format, and pass index=False if that column is unwanted.
submission_result.to_csv("./data/final_result/similarity_top5_window2015.csv")

In [50]:
# Window sizes for this run: 20 input values followed by 15 target values.
CFG = {
    "input_window": 20,
    "output_window": 15,
}

# Group the frame once up front. Filtering the full frame with a boolean mask
# inside the loop would re-scan every row for each ticker.
rows_by_ticker = datasets_df.groupby("ticker_code")

# ticker_code -> predicted score averaged over every historical window.
ticker_pred_dict = dict()
for ticker_code in tqdm(ticker_codes):
    # Same rows, in the same order, as datasets_df[datasets_df["ticker_code"] == ticker_code].
    dataset_df = rows_by_ticker.get_group(ticker_code)
    dataset_df = sort_dataset_df(dataset_df, "date")
    dataset_df = append_price_diff(dataset_df, "open", "close")

    price_diff_arraylist = get_array_list(dataset_df, "price_diff")

    x_dataset, y_dataset = get_x_y_dataset(price_diff_arraylist, CFG)
    # Collapse every target window into one number by summing its values.
    y_dataset = y_dataset.sum(axis=1)

    # The last input_window values of this ticker's series, used as the query window.
    final_x = price_diff_arraylist[-CFG["input_window"] :]

    similarity_df = get_similarity_df(x_dataset, y_dataset, final_x)
    pred_y = get_pred_y(similarity_df)
    ticker_pred_dict[ticker_code] = pred_y

100%|██████████| 1947/1947 [01:11<00:00, 27.26it/s]


In [51]:
# Build the ranking on a copy of the sample submission, using the all-windows predictions.
submission = submission_raw.copy()
submission_result = make_submission_df(submission, ticker_pred_dict)

In [53]:
# Write the ranking produced from the all-windows similarity predictions.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first column by default;
# confirm the expected submission format, and pass index=False if that column is unwanted.
submission_result.to_csv("./data/final_result/similarity_all_window2015.csv")