In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from tqdm import tqdm
from joblib import load, Parallel, delayed

In [2]:
data_path = "../../inputs/ubiquant-market-prediction"

train = pd.read_pickle(os.path.join(data_path, "train.pkl"))

In [3]:
all_investment_ids = sorted(list(dict.fromkeys(train["investment_id"])))
all_time_ids = sorted(list(dict.fromkeys(train["time_id"])))

In [4]:
train.head()

Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0_1,0,1,-0.300875,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0_2,0,2,-0.23104,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0_6,0,6,0.568807,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0_7,0,7,-1.06478,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0_8,0,8,-0.53194,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624


# get 2d targets

In [5]:
def get_targets_per_investment(data_df, investment_id):
    
    targets_per_investment = pd.DataFrame({"time_id": all_time_ids})
    targets_per_investment = pd.merge(targets_per_investment, data_df, on=["time_id"], how="left").rename(columns={"target": investment_id})
    
    return targets_per_investment[investment_id]

targets = Parallel(n_jobs=12, verbose=1)(delayed(get_targets_per_investment)(
    train.loc[train["investment_id"] == investment_id, ["time_id", "target"]].reset_index(drop=True), investment_id) for investment_id in all_investment_ids)

targets = [pd.DataFrame({"time_id" : all_time_ids})] + targets
targets = pd.concat(targets, axis=1)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    5.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.8s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   13.9s
[Parallel(n_jobs=12)]: Done 1226 tasks      | elapsed:   20.6s
[Parallel(n_jobs=12)]: Done 1776 tasks      | elapsed:   28.6s
[Parallel(n_jobs=12)]: Done 2426 tasks      | elapsed:   38.4s
[Parallel(n_jobs=12)]: Done 3176 tasks      | elapsed:   49.3s
[Parallel(n_jobs=12)]: Done 3579 out of 3579 | elapsed:   54.7s finished


# get 2d avg targets

In [6]:
targets_lag_1 = targets.copy()
targets_lag_2 = targets.copy()
targets_lag_3 = targets.copy()
targets_lag_4 = targets.copy()

targets_lag_1[all_investment_ids] = targets_lag_1[all_investment_ids].shift(-1).ffill()
targets_lag_2[all_investment_ids] = targets_lag_2[all_investment_ids].shift(-2).ffill()
targets_lag_3[all_investment_ids] = targets_lag_3[all_investment_ids].shift(-3).ffill()
targets_lag_4[all_investment_ids] = targets_lag_4[all_investment_ids].shift(-4).ffill()

In [7]:
avg_targets = targets.copy()

avg_targets[all_investment_ids] = (
    5 * targets[all_investment_ids].values + \
    4 * targets_lag_1[all_investment_ids].values + \
    3 * targets_lag_2[all_investment_ids].values + \
    2 * targets_lag_3[all_investment_ids].values + \
    1 * targets_lag_4[all_investment_ids].values
) / 5

In [8]:
avg_targets.head()

Unnamed: 0,time_id,0,1,2,3,4,6,7,8,9,...,3763,3764,3765,3766,3767,3768,3769,3770,3772,3773
0,0,,-0.953181,-0.959555,,,1.417428,-1.772503,-1.147566,,...,,,-0.752403,2.144953,,-0.911327,-1.683036,-0.834823,0.10637,
1,1,,-0.437709,-1.133906,,,1.13104,-1.099631,-1.024091,5.321261,...,,,-1.793275,3.298036,0.495686,-0.893175,-0.758494,-0.454146,-0.134663,
2,2,,0.601001,-1.101686,,,1.418084,-0.980645,-1.301886,6.917754,...,,,-1.669977,4.296007,-1.672579,-1.091533,0.365351,0.820528,0.035319,
3,3,,1.481731,-0.863963,,,1.316396,-1.592388,-1.476737,9.983584,...,,,-1.940256,2.919597,-2.548315,-1.45023,1.041055,1.826463,-0.541516,
4,4,,2.138271,-1.353537,,,-0.343756,-1.092243,-1.25234,,...,,,-1.705071,2.554506,-3.838545,-0.964129,1.576638,1.403292,-1.012362,


In [9]:
targets.head()

Unnamed: 0,time_id,0,1,2,3,4,6,7,8,9,...,3763,3764,3765,3766,3767,3768,3769,3770,3772,3773
0,0,,-0.300875,-0.23104,,,0.568807,-1.06478,-0.53194,1.505904,...,,,0.302557,0.003156,,-0.392297,-0.877746,-0.284696,0.202003,
1,1,,-0.917045,-0.472108,,,-0.147971,-0.372692,-0.105693,0.6225,...,,,-0.560079,0.250396,1.318857,-0.227782,-0.684049,-0.894825,-0.286612,
2,2,,-0.480234,-0.661659,,,0.243674,0.318899,-0.260137,-0.610705,...,,,-0.305467,2.031675,-0.040981,-0.018971,-0.250995,-0.3238,0.300915,
3,3,,-0.323562,-0.055215,,,1.816745,-0.711446,-0.640987,5.271096,...,,,-0.730791,0.857357,0.386379,-0.708491,-0.165561,0.836601,0.076417,
4,4,,2.494479,0.341267,,,0.470476,-1.58165,-0.59297,,...,,,-0.656495,1.897659,-1.476258,-0.210125,-0.206145,0.126859,-0.387297,


In [19]:
avg_targets_normalized = avg_targets.copy()
targets_normalized = targets.copy()
avg_targets_demean_normalized = avg_targets.copy()
targets_demean_normalized = targets.copy()

avg_targets_demean_normalized[all_investment_ids] = (avg_targets_normalized[all_investment_ids] - np.mean(avg_targets_normalized[all_investment_ids])) / np.std(avg_targets_normalized[all_investment_ids])
targets_demean_normalized[all_investment_ids] = (targets_normalized[all_investment_ids] - np.mean(targets_normalized[all_investment_ids])) / np.std(targets_normalized[all_investment_ids])
avg_targets_normalized[all_investment_ids] = avg_targets_normalized[all_investment_ids] / np.std(avg_targets_normalized[all_investment_ids])
targets_normalized[all_investment_ids] = targets_normalized[all_investment_ids] / np.std(targets_normalized[all_investment_ids])

In [10]:
avg_targets_values = avg_targets[all_investment_ids].values.flatten()
targets_values = targets[all_investment_ids].values.flatten()

targets_values = targets_values[~np.isnan(avg_targets_values)]
avg_targets_values = avg_targets_values[~np.isnan(avg_targets_values)]

corr, _  = pearsonr(targets_values, avg_targets_values)

print(corr)

0.6788280163864945


In [21]:
avg_targets_normalized_values = avg_targets_normalized[all_investment_ids].values.flatten()
targets_values = targets[all_investment_ids].values.flatten()

targets_values = targets_values[~np.isnan(avg_targets_normalized_values)]
avg_targets_normalized_values = avg_targets_normalized_values[~np.isnan(avg_targets_normalized_values)]

corr, _  = pearsonr(targets_values, avg_targets_normalized_values)

print(corr)

0.6674379035418238


In [20]:
avg_targets_demean_normalized_values = avg_targets_demean_normalized[all_investment_ids].values.flatten()
targets_values = targets[all_investment_ids].values.flatten()

targets_values = targets_values[~np.isnan(avg_targets_demean_normalized_values)]
avg_targets_demean_normalized_values = avg_targets_demean_normalized_values[~np.isnan(avg_targets_demean_normalized_values)]

corr, _  = pearsonr(targets_values, avg_targets_demean_normalized_values)

print(corr)

0.6669153558090682


In [24]:
targets_normalized_values = targets_normalized[all_investment_ids].values.flatten()
targets_values = targets[all_investment_ids].values.flatten()

targets_values = targets_values[~np.isnan(targets_normalized_values)]
targets_normalized_values = targets_normalized_values[~np.isnan(targets_normalized_values)]

corr, _  = pearsonr(targets_values, targets_normalized_values)

print(corr)

0.9819896072036737


In [23]:
targets_demean_normalized_values = targets_demean_normalized[all_investment_ids].values.flatten()
targets_values = targets[all_investment_ids].values.flatten()

targets_values = targets_values[~np.isnan(targets_demean_normalized_values)]
targets_demean_normalized_values = targets_demean_normalized_values[~np.isnan(targets_demean_normalized_values)]

corr, _  = pearsonr(targets_values, targets_demean_normalized_values)

print(corr)

0.9814238729954026


# append avg targets

In [64]:
train["avg_target"] = train["target"].copy()
train["avg_target_normalized"] = train["target"].copy()
train["target_normalized"] = train["target"].copy()
train["avg_target_demean_normalized"] = train["target"].copy()
train["target_demean_normalized"] = train["target"].copy()

In [25]:
avg_target_arr = []
avg_target_normalized_arr = []
target_normalized_arr = []
avg_target_demean_normalized_arr = []
target_demean_normalized_arr = []

time_id_list, investment_id_list = train["time_id"].tolist(), train["investment_id"].tolist()

for row_idx in tqdm(range(len(time_id_list))):
    
    time_id, investment_id = time_id_list[row_idx], investment_id_list[row_idx]
    
    avg_target_value = avg_targets.loc[avg_targets["time_id"] == time_id, investment_id].values[0]
    avg_target_arr.append(avg_target_value)
    
    avg_target_normalized_value = avg_targets_normalized.loc[avg_targets_normalized["time_id"] == time_id, investment_id].values[0]
    avg_target_normalized_arr.append(avg_target_normalized_value)
    
    target_normalized_value = targets_normalized.loc[targets_normalized["time_id"] == time_id, investment_id].values[0]
    target_normalized_arr.append(target_normalized_value)
    
    avg_target_demean_normalized_value = avg_targets_demean_normalized.loc[avg_targets_demean_normalized["time_id"] == time_id, investment_id].values[0]
    avg_target_demean_normalized_arr.append(avg_target_demean_normalized_value)
    
    target_demean_normalized_value = targets_demean_normalized.loc[targets_demean_normalized["time_id"] == time_id, investment_id].values[0]
    target_demean_normalized_arr.append(target_demean_normalized_value)
    
avg_target_arr = np.array(avg_target_arr)
avg_target_normalized_arr = np.array(avg_target_normalized_arr)
target_normalized_arr = np.array(target_normalized_arr)
avg_target_demean_normalized_arr = np.array(avg_target_demean_normalized_arr)
target_demean_normalized_arr = np.array(target_demean_normalized_arr)

100%|██████████████████████████████████████████████████████████████████████| 3141410/3141410 [50:00<00:00, 1047.07it/s]


In [26]:
train["avg_target"] = avg_target_arr
train["avg_target_normalized"] = avg_target_normalized_arr
train["target_normalized"] = target_normalized_arr
train["avg_target_demean_normalized"] = avg_target_demean_normalized_arr
train["target_demean_normalized"] = target_demean_normalized_arr
train.shape

(3141410, 309)

In [27]:
train = train.dropna(axis=0)
train.shape

(3141291, 309)

# save new train

In [28]:
train.to_pickle(os.path.join(data_path, "train_new.pkl"))

In [None]:
2 peaks 298, 294, 291, 287
3 peaks 296,