In [None]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from tqdm import tqdm
from joblib import load, Parallel, delayed

In [None]:
# Input directory, given relative to the notebook's working directory.
data_path = "../../inputs/ubiquant-market-prediction"

# NOTE(review): unpickling can execute arbitrary code stored in the file;
# only load pickles that were produced by a trusted pipeline.
train_pickle_path = os.path.join(data_path, "train.pkl")
train = pd.read_pickle(train_pickle_path)

In [None]:
# Sorted unique ids. Later cells use these lists as the column order
# (investments) and the row order (time ids) of the wide target tables.
all_investment_ids = sorted(set(train["investment_id"]))
all_time_ids = sorted(set(train["time_id"]))

In [None]:
# Preview the first five rows of the loaded frame.
train.head(5)

# Get 2D targets (one row per time_id, one column per investment_id)

In [None]:
def get_targets_per_investment(data_df, investment_id, time_ids=None):
    """Align one investment's targets to a common list of time ids.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Rows belonging to a single investment; must contain a ``time_id``
        column and a ``target`` column.
    investment_id : hashable
        Used as the name of the returned series.
    time_ids : sequence, optional
        Time ids that define the length and order of the result. When
        omitted, the notebook-level ``all_time_ids`` list is used, which
        keeps existing two-argument calls working unchanged.

    Returns
    -------
    pandas.Series
        One value per entry of ``time_ids`` (same order), named
        ``investment_id``; NaN where ``data_df`` has no row for that time id.

    Raises
    ------
    pandas.errors.MergeError
        If ``time_id`` is duplicated on either side. Without this check a
        duplicate would silently produce a longer series that no longer
        lines up with the other investments when concatenated column-wise.
    """
    if time_ids is None:
        time_ids = all_time_ids

    time_axis = pd.DataFrame({"time_id": list(time_ids)})
    # Left merge keeps every requested time id, in order, and leaves NaN
    # where this investment has no observation.
    aligned = pd.merge(
        time_axis,
        data_df,
        on=["time_id"],
        how="left",
        validate="one_to_one",
    )

    return aligned.rename(columns={"target": investment_id})[investment_id]

# Build the wide target table in a single pass: one row per time_id, one
# column per investment_id, NaN where an investment has no row for a time id.
# This replaces filtering the whole `train` frame once per investment and
# concatenating thousands of single-column results.
# `pivot` raises ValueError if a (time_id, investment_id) pair occurs twice.
targets = train.pivot(index="time_id", columns="investment_id", values="target")

# Pin row/column order to the id lists used by every later cell.
targets = targets.reindex(index=all_time_ids, columns=all_investment_ids)

# Expose time_id as the first regular column, followed by the investment columns.
targets.index.name = "time_id"
targets.columns.name = None
targets = targets.reset_index()

# Get 2D avg targets (weighted combination of each target with its 1-4 step shifted copies)

In [None]:
# Four independent copies of the wide target table, one per shift distance.
targets_lag_1, targets_lag_2, targets_lag_3, targets_lag_4 = (
    targets.copy() for _ in range(4)
)

# shift(-k) moves every investment column up by k rows, so row i receives the
# value stored k rows later (despite the "lag" in the variable names).
# ffill() then replaces NaNs, including the k trailing rows opened up by the
# shift, with the closest preceding non-NaN value in the same column.
for shift_steps, shifted_frame in enumerate(
    (targets_lag_1, targets_lag_2, targets_lag_3, targets_lag_4), start=1
):
    shifted_values = shifted_frame[all_investment_ids].shift(-shift_steps)
    shifted_frame[all_investment_ids] = shifted_values.ffill()

In [None]:
# Linearly decaying weights: 5 for the unshifted target down to 1 for the
# copy shifted by four rows.
# NOTE(review): the weights add up to 15 while the divisor is 5, so the result
# is three times a true weighted mean - confirm this scaling is intended.
weighted_frames = (targets, targets_lag_1, targets_lag_2, targets_lag_3, targets_lag_4)
frame_weights = (5, 4, 3, 2, 1)

# Accumulate left to right so the floating-point summation order is fixed.
weighted_sum = None
for frame_weight, weighted_frame in zip(frame_weights, weighted_frames):
    weighted_term = frame_weight * weighted_frame[all_investment_ids].values
    weighted_sum = weighted_term if weighted_sum is None else weighted_sum + weighted_term

# A NaN in any of the five inputs makes the corresponding output cell NaN.
avg_targets = targets.copy()
avg_targets[all_investment_ids] = weighted_sum / 5

In [None]:
# Preview the first five rows of the weighted-average table.
avg_targets.head(5)

In [None]:
# Preview the first five rows of the raw wide target table for comparison.
targets.head(5)

In [None]:
# Per-investment (column-wise) statistics, NaNs skipped.
# The axis and ddof are spelled out on purpose: np.mean(df) / np.std(df) pass
# axis=None through to pandas, and recent pandas versions reduce over BOTH
# axes in that case, which would silently replace the per-column statistics
# with a single global scalar. ddof=0 matches np.std's population default.
avg_block = avg_targets[all_investment_ids]
raw_block = targets[all_investment_ids]

avg_column_mean = avg_block.mean(axis=0)
avg_column_std = avg_block.std(axis=0, ddof=0)
raw_column_mean = raw_block.mean(axis=0)
raw_column_std = raw_block.std(axis=0, ddof=0)

# Start every output from a copy so the time_id column is carried along.
avg_targets_normalized = avg_targets.copy()
targets_normalized = targets.copy()
avg_targets_demean_normalized = avg_targets.copy()
targets_demean_normalized = targets.copy()

# Z-score variants: subtract the column mean, then divide by the column std.
avg_targets_demean_normalized[all_investment_ids] = (avg_block - avg_column_mean) / avg_column_std
targets_demean_normalized[all_investment_ids] = (raw_block - raw_column_mean) / raw_column_std

# Scale-only variants: divide by the column std without centring.
# A column with zero std yields inf/NaN here, exactly as before.
avg_targets_normalized[all_investment_ids] = avg_block / avg_column_std
targets_normalized[all_investment_ids] = raw_block / raw_column_std

In [None]:
# Pearson correlation between raw targets and weighted-average targets,
# computed over every (time_id, investment) cell where the average is not NaN.
avg_targets_values = avg_targets[all_investment_ids].to_numpy().ravel()
targets_values = targets[all_investment_ids].to_numpy().ravel()

# One shared mask keeps both flattened arrays aligned cell for cell.
valid_cells = np.logical_not(np.isnan(avg_targets_values))
targets_values, avg_targets_values = targets_values[valid_cells], avg_targets_values[valid_cells]

corr, _ = pearsonr(targets_values, avg_targets_values)

print(corr)

In [None]:
# Pearson correlation between raw targets and std-scaled average targets,
# restricted to cells where the scaled average is not NaN.
avg_targets_normalized_values = avg_targets_normalized[all_investment_ids].to_numpy().ravel()
targets_values = targets[all_investment_ids].to_numpy().ravel()

# Build the mask once and apply it to both arrays so they stay aligned.
valid_cells = np.logical_not(np.isnan(avg_targets_normalized_values))
targets_values = targets_values[valid_cells]
avg_targets_normalized_values = avg_targets_normalized_values[valid_cells]

corr, _ = pearsonr(targets_values, avg_targets_normalized_values)

print(corr)

In [None]:
# Pearson correlation between raw targets and z-scored average targets,
# restricted to cells where the z-scored average is not NaN.
avg_targets_demean_normalized_values = (
    avg_targets_demean_normalized[all_investment_ids].to_numpy().ravel()
)
targets_values = targets[all_investment_ids].to_numpy().ravel()

# Build the mask once and apply it to both arrays so they stay aligned.
valid_cells = np.logical_not(np.isnan(avg_targets_demean_normalized_values))
targets_values = targets_values[valid_cells]
avg_targets_demean_normalized_values = avg_targets_demean_normalized_values[valid_cells]

corr, _ = pearsonr(targets_values, avg_targets_demean_normalized_values)

print(corr)

In [None]:
# Pearson correlation between raw targets and their std-scaled version,
# restricted to cells where the scaled value is not NaN.
targets_normalized_values = targets_normalized[all_investment_ids].to_numpy().ravel()
targets_values = targets[all_investment_ids].to_numpy().ravel()

# Build the mask once and apply it to both arrays so they stay aligned.
valid_cells = np.logical_not(np.isnan(targets_normalized_values))
targets_values = targets_values[valid_cells]
targets_normalized_values = targets_normalized_values[valid_cells]

corr, _ = pearsonr(targets_values, targets_normalized_values)

print(corr)

In [None]:
# Pearson correlation between raw targets and their z-scored version,
# restricted to cells where the z-scored value is not NaN.
targets_demean_normalized_values = targets_demean_normalized[all_investment_ids].to_numpy().ravel()
targets_values = targets[all_investment_ids].to_numpy().ravel()

# Build the mask once and apply it to both arrays so they stay aligned.
valid_cells = np.logical_not(np.isnan(targets_demean_normalized_values))
targets_values = targets_values[valid_cells]
targets_demean_normalized_values = targets_demean_normalized_values[valid_cells]

corr, _ = pearsonr(targets_values, targets_demean_normalized_values)

print(corr)

# Append avg and normalized targets to train

In [None]:
# The other derived columns (avg_target, avg_target_normalized,
# target_normalized, avg_target_demean_normalized, target_demean_normalized)
# are filled from the looked-up arrays in the cells below, so they need no
# placeholder here.
# NOTE(review): within this notebook the column created below stays an
# unmodified copy of `target` - confirm whether something else fills it later.
train["target_demodel_demean_normalized"] = train["target"].copy(deep=True)

In [None]:
def _lookup_wide_values(wide_df, row_time_ids, col_pos):
    """Fetch one cell of a wide table per train row, as a 1-D numpy array.

    ``wide_df`` has a ``time_id`` column plus one column per investment id.
    ``row_time_ids`` holds the time id of every train row and ``col_pos`` the
    position of every train row's investment within ``all_investment_ids``.
    Raises KeyError if a requested time id is absent from ``wide_df``.
    """
    row_pos = pd.Index(wide_df["time_id"]).get_indexer(row_time_ids)
    # get_indexer marks misses with -1, which would otherwise silently index
    # the last row, so fail loudly instead.
    if (row_pos < 0).any():
        raise KeyError("train contains time_id values missing from the wide table")
    return wide_df[all_investment_ids].to_numpy()[row_pos, col_pos]


# Column position of each train row's investment, computed once and shared by
# all five lookups. This replaces a Python loop that ran five boolean-mask
# scans of a wide table for every single train row.
investment_col_pos = pd.Index(all_investment_ids).get_indexer(train["investment_id"])
if (investment_col_pos < 0).any():
    raise KeyError("train contains investment_id values missing from all_investment_ids")

row_time_ids = train["time_id"]

# Each array has one entry per train row, in train's row order.
avg_target_arr = _lookup_wide_values(avg_targets, row_time_ids, investment_col_pos)
avg_target_normalized_arr = _lookup_wide_values(avg_targets_normalized, row_time_ids, investment_col_pos)
target_normalized_arr = _lookup_wide_values(targets_normalized, row_time_ids, investment_col_pos)
avg_target_demean_normalized_arr = _lookup_wide_values(avg_targets_demean_normalized, row_time_ids, investment_col_pos)
target_demean_normalized_arr = _lookup_wide_values(targets_demean_normalized, row_time_ids, investment_col_pos)

In [None]:
# Attach the looked-up arrays as new columns; dict order fixes column order.
new_target_columns = {
    "avg_target": avg_target_arr,
    "avg_target_normalized": avg_target_normalized_arr,
    "target_normalized": target_normalized_arr,
    "avg_target_demean_normalized": avg_target_demean_normalized_arr,
    "target_demean_normalized": target_demean_normalized_arr,
}
for new_column_name, new_column_values in new_target_columns.items():
    train[new_column_name] = new_column_values

# Shape before dropping incomplete rows.
train.shape

In [None]:
# Remove every row that has a NaN in ANY column (not only the new target
# columns). Infinite values are not treated as missing and are kept.
train = train.dropna(axis="index", how="any")

# Shape after dropping incomplete rows.
train.shape

# save new train

In [None]:
# Write the augmented frame next to the original input pickle.
train_new_pickle_path = os.path.join(data_path, "train_new.pkl")
train.to_pickle(train_new_pickle_path)

In [None]:
# Scratch notes kept from exploration (written as comments so the cell runs):
# 2 peaks: 298, 294, 291, 287
# 3 peaks: 296