In [39]:
# imports

import json
import numpy as np
import pandas as pd


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [40]:
# ETL: load the raw competition files and the column-name mapping.

raw_train = pd.read_csv("./data/raw_data/train.csv")
raw_submission = pd.read_csv("./data/raw_data/sample_submission.csv")

# Decode the mapping explicitly as UTF-8 (the encoding pd.read_csv uses by
# default for the CSVs above) so that non-ASCII column names do not depend on
# the platform's locale encoding.
with open("./data/assets/column_dict.json", "r", encoding="utf-8") as f:
    column_dict = json.load(f)

In [41]:
# Working copies of the loaded frames; `datasets` is the one that the
# preprocessing cells below transform.
datasets = raw_train.copy()
submission = raw_submission.copy()

In [42]:
# Preprocessing

class GeneralPPS:
    """Preprocessing helpers for a long-format price table.

    The helpers expect (after ``map_columns``) the columns ``ticker``, ``date``,
    ``volume``, ``open``, ``low``, ``high`` and ``close``. None of them keeps
    state on the ``GeneralPPS`` instance, so they can be called on the class or
    on an instance alike.
    """

    @staticmethod
    def map_columns(df, column_dict):
        """Return a copy of ``df`` with its columns renamed through ``column_dict``.

        Args:
            df: Frame whose column names are all keys of ``column_dict``.
            column_dict: Mapping from current column name to new column name.

        Returns:
            A new DataFrame with the renamed columns; ``df`` is left unchanged.

        Raises:
            KeyError: If a column of ``df`` is missing from ``column_dict``.
        """
        # Rename on a copy so the caller's frame is not altered behind its back.
        renamed = df.copy()
        renamed.columns = [column_dict[col] for col in df.columns]
        return renamed

    @staticmethod
    def drop_zero(df, columns=None):
        """Drop every row in which at least one of ``columns`` equals zero.

        Args:
            df: Frame containing all of ``columns``.
            columns: Column names to check. Defaults to
                ``["volume", "open", "low", "high", "close"]``.

        Returns:
            The rows of ``df`` where all checked columns are non-zero.
        """
        if columns is None:
            columns = ["volume", "open", "low", "high", "close"]
        # One combined mask instead of re-filtering the frame once per column.
        keep = (df[list(columns)] != 0).all(axis=1)
        return df[keep]

    class DROP_LACK_DATA:
        """Callable filter that removes tickers with too few rows.

        A ticker is kept when its count of non-null ``date`` values is strictly
        greater than ``percentage`` times the largest such count of any ticker.
        Usage: ``DROP_LACK_DATA(df, 0.8)()``.
        """

        def __init__(self, df, percentage) -> None:
            self.df = df
            self.percentage = percentage

        def get_ticker_count_series(self):
            """Return the number of non-null ``date`` values per ticker."""
            # Select the column before counting so only ``date`` is aggregated.
            return self.df.groupby("ticker")["date"].count()

        def get_available_tickers(self, ticker_count_series):
            """Return the index of tickers whose count exceeds the threshold."""
            threshold = ticker_count_series.max() * self.percentage
            return ticker_count_series[ticker_count_series > threshold].index

        def filter_available_tickers(self, available_tickers):
            """Return the rows of the stored frame belonging to ``available_tickers``."""
            return self.df[self.df["ticker"].isin(available_tickers)]

        def __call__(self):
            """Run the full filter and return the reduced frame."""
            ticker_count_series = self.get_ticker_count_series()
            available_tickers = self.get_available_tickers(ticker_count_series)
            return self.filter_available_tickers(available_tickers)

In [43]:
general_pps = GeneralPPS()

# Preprocessing pipeline: rename the columns, drop rows with a zero in any
# price/volume field, then keep tickers with more than 80% of the maximum
# per-ticker row count.
datasets = (
    datasets.pipe(general_pps.map_columns, column_dict)
    .pipe(general_pps.drop_zero)
    .pipe(lambda frame: general_pps.DROP_LACK_DATA(frame, 0.8)())
)

In [44]:
# Snapshot of the preprocessed frame, taken before the modelling cells run.
raw_datasets = datasets.copy()

In [45]:
# Model

## Linear Coef model

In [46]:
from sklearn.linear_model import LinearRegression


def sort_dataset_by_date(dataset, column="date"):
    """Return a new frame with the rows of ``dataset`` in ascending ``column`` order."""
    return dataset.sort_values(by=column)


def append_price_diff(dataset, open_col="open", close_col="close"):
    """Return a copy of ``dataset`` with an added ``price_diff`` column.

    ``price_diff`` is ``(open - close) / open`` per row, so it is positive when
    the close is below the open.

    Args:
        dataset: Frame containing ``open_col`` and ``close_col``.
        open_col: Name of the opening-price column.
        close_col: Name of the closing-price column.

    Returns:
        A new DataFrame with the extra ``price_diff`` column; ``dataset`` itself
        is not modified.
    """
    # Write into a copy: the argument is often a slice of a larger frame, and
    # mutating it would silently change (or warn about) the caller's data.
    result = dataset.copy()
    result["price_diff"] = (result[open_col] - result[close_col]) / result[open_col]
    return result


def get_array_list(dataset, column):
    """Return the contents of ``dataset[column]`` through its ``.values`` attribute."""
    return dataset[column].values


def get_x_y(arraylist, CFG):
    """Cut a sequence into overlapping (input, target) windows with stride 1.

    Sample ``k`` uses ``arraylist[k : k + input_window]`` as input and the
    ``output_window`` values that immediately follow it as target.

    Args:
        arraylist: Sequence of observations, indexed along its first axis.
        CFG: Mapping providing the integers ``"input_window"`` and
            ``"output_window"``.

    Returns:
        A tuple ``(x, y)`` of arrays with shapes ``(n_samples, input_window)``
        and ``(n_samples, output_window)``. When the sequence is shorter than
        ``input_window + output_window`` both arrays have zero rows but keep
        their second dimension, so axis-wise reductions such as
        ``y.sum(axis=1)`` still work.
    """
    i_window = CFG["input_window"]
    o_window = CFG["output_window"]

    values = np.asarray(arraylist)
    n_samples = max(len(values) - i_window - o_window + 1, 0)

    if n_samples == 0:
        # np.array([]) would be 1-D; keep the 2-D layout callers rely on.
        trailing_shape = values.shape[1:]
        empty_x = np.empty((0, i_window) + trailing_shape, dtype=values.dtype)
        empty_y = np.empty((0, o_window) + trailing_shape, dtype=values.dtype)
        return empty_x, empty_y

    _x_dataset = np.array(
        [values[start : start + i_window] for start in range(n_samples)]
    )
    _y_dataset = np.array(
        [
            values[start + i_window : start + i_window + o_window]
            for start in range(n_samples)
        ]
    )
    return _x_dataset, _y_dataset

In [47]:
# Window sizes shared by the modelling cells.
CFG = {
    # Number of consecutive price_diff observations each regression is built from.
    "dataset_window": 100,
    # Length of each input (feature) window produced by get_x_y.
    "input_window": 10,
    # Length of each target window produced by get_x_y; the modelling cells
    # sum it into a single target value.
    "output_window": 15,
}

In [35]:
# Sorted unique ticker identifiers present in the preprocessed frame.
tickers = sorted(set(datasets["ticker"]))

In [36]:
# Rolling evaluation of the linear model on the first ticker only.
tickers = sorted(set(datasets["ticker"]))
ticker = tickers[0]

# Build the date-ordered price_diff series for that ticker.
_dataset = datasets[datasets["ticker"] == ticker]
_dataset = sort_dataset_by_date(_dataset, "date")
_dataset = append_price_diff(_dataset, "open", "close")
_price_diff_arraylist = get_array_list(_dataset, "price_diff")

result_list = list()
# Slide a window of CFG["dataset_window"] observations over the series, one
# position at a time, and fit a fresh regression inside every window.
for dataset_idx in range(len(_price_diff_arraylist) - CFG["dataset_window"] + 1):
    _price_diff_arraylist_dataset = _price_diff_arraylist[
        dataset_idx : dataset_idx + CFG["dataset_window"]
    ]

    x_dataset, y_dataset = get_x_y(_price_diff_arraylist_dataset, CFG)
    # Collapse each target window into one value: the sum of its price_diff entries.
    y_dataset = y_dataset.sum(axis=1)
    # Hold out the last sample of the window and fit on all earlier samples.
    # NOTE(review): samples are one step apart, so with output_window > 1 the
    # last training target shares observations with the held-out target —
    # confirm that this overlap is acceptable for the evaluation.
    x_train, x_test, y_train, y_test = (
        x_dataset[:-1],
        x_dataset[-1],
        y_dataset[:-1],
        y_dataset[-1],
    )

    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)

    # predict expects a 2-D input, hence the reshape of the single held-out sample.
    y_pred = linear_regression.predict(x_test.reshape(1, -1))[0]
    result_list.append({"y_test": y_test, "y_pred": y_pred})

# One row per window position: held-out target vs. prediction.
linear_regression_df = pd.DataFrame(result_list)
linear_regression_df["ticker"] = ticker

In [48]:
from tqdm import tqdm

In [50]:
# Fit one regression per ticker on its most recent CFG["dataset_window"]
# observations and score the ticker with the prediction made from its latest
# CFG["input_window"] observations.
tickers = sorted(set(datasets["ticker"]))
ticker_score_dict = dict()
# Split the frame into per-ticker groups in a single groupby pass instead of
# re-scanning every row with a boolean mask for each ticker. groupby sorts the
# keys by default, so the iteration order matches `tickers`.
for ticker, _dataset in tqdm(datasets.groupby("ticker"), total=len(tickers)):
    _dataset = sort_dataset_by_date(_dataset, "date")
    _dataset = append_price_diff(_dataset, "open", "close")

    _price_diff_arraylist = get_array_list(_dataset, "price_diff")

    _price_diff_arraylist_dataset = _price_diff_arraylist[-CFG["dataset_window"] :]

    x_dataset, y_dataset = get_x_y(_price_diff_arraylist_dataset, CFG)
    # Target of each sample: the sum of the price_diff values in its output window.
    y_dataset = y_dataset.sum(axis=1)

    linear_regression = LinearRegression()
    linear_regression.fit(x_dataset, y_dataset)
    # predict expects a 2-D input, hence the reshape of the latest input window.
    y_pred = linear_regression.predict(
        _price_diff_arraylist_dataset[-CFG["input_window"] :].reshape(1, -1)
    )[0]
    ticker_score_dict[ticker] = y_pred

100%|██████████| 1947/1947 [01:07<00:00, 28.86it/s]


In [51]:
def make_submission_df(sample_submission, ticker_score_dict):
    """Turn per-ticker scores into a ranking in the submission layout.

    Args:
        sample_submission: Frame with at least the column "종목코드", whose
            values are looked up in ``ticker_score_dict``.
        ticker_score_dict: Mapping from ticker code to numeric score. Codes
            missing from the mapping are scored 0.

    Returns:
        A new DataFrame with the columns "종목코드" and "순위", where "순위" is
        the 1-based rank of the score in descending order (ties are broken by
        row order). ``sample_submission`` itself is not modified.
    """
    # Work on a copy so repeated calls with the same template frame do not
    # leave "score" / "순위" columns behind in the caller's data.
    scored = sample_submission.copy()
    scored["score"] = scored["종목코드"].map(ticker_score_dict).fillna(0)
    scored["순위"] = scored["score"].rank(method="first", ascending=False).astype(int)
    return scored.loc[:, ["종목코드", "순위"]]


In [52]:
# "Forward" submission: rank tickers by the predicted score as-is (highest first).
submission_result = make_submission_df(raw_submission, ticker_score_dict)
submission_result.to_csv("./data/result/linear_reg_forward.csv")
# Recorded score: -0.0203316771 (presumably the evaluation result for this file — confirm)

In [54]:
# "Reversed" submission: negate every score so the ranking order is flipped.
ticker_score_reversed_dict = {i: -j for i, j in ticker_score_dict.items()}
submission_result = make_submission_df(raw_submission, ticker_score_reversed_dict)
submission_result.to_csv("./data/result/linear_reg_reversed.csv")
# Recorded score: -0.0792255482 (presumably the evaluation result for this file — confirm)