# Featurize processed data

In [1]:
import os

# Root of the local repository checkout. The default is the original
# hard-coded location; set the DECISION_MAKING_REPO environment variable to run
# the notebook from a different checkout without editing this cell.
repo_path = os.environ.get("DECISION_MAKING_REPO", "/Users/johnzhou/research/decision-making")

# Experiment outputs and data are addressed relative to the repository root.
expt_dir = f"{repo_path}/experiments"
data_dir = f"{repo_path}/data"

## Import packages

In [2]:
import sys

# Put the repository root on the module search path so packages that live
# under it can be imported by the cells below. The membership check keeps the
# cell idempotent: re-running it does not add duplicate sys.path entries.
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import torch

from src.utils import normalize_features

from src.data.blocks import RealDataset, SynthDataset
from src.models.agentnet import AgentNet
from src.models.sigmoidnet import SigmoidNet
from src.models.train import train

  from .autonotebook import tqdm as notebook_tqdm


Sigmoid fitting with MSE loss seems to be empirically more sensitive.

In [4]:
expt_name = "unbounded_a"

# Experiment configuration, grouped into model / data / trainer sections.
# The feature and label arrays are read from this experiment's folder under
# `{data_dir}/processed`.
config = OmegaConf.create(dict(
    name=expt_name,
    random_seed=4995,
    model=dict(
        in_features=19,
        linear_layers=[32, 8, 4],
        use_batch_norm=False,
    ),
    learning_rate=1e-4,
    data=dict(
        feature_path=f"{data_dir}/processed/{expt_name}/synth_feats.npy",
        label_path=f"{data_dir}/processed/{expt_name}/synth_labels.npy",
        train_proportion=0.8,
        train_batch_size=64,
        val_batch_size=64,
    ),
    trainer=dict(
        gpus=0,
        max_epochs=1000,
    ),
))

# Write the configuration to YAML inside the repository's config folder.
OmegaConf.save(
    config=config,
    f=f"{repo_path}/configs/model_configs/sigmoidnet_train.yaml",
)

In [8]:
# Reload the TensorBoard notebook extension.
%reload_ext tensorboard
# Open TensorBoard on this experiment's `lightning_logs` directory; IPython
# expands `$expt_dir` and `$expt_name` from the notebook's Python variables.
%tensorboard --logdir=$expt_dir/$expt_name/lightning_logs

In [9]:
# `train` is handed the configuration as plain Python containers rather than
# as an OmegaConf object, and returns a (system, trainer) pair.
plain_config = OmegaConf.to_container(config)

system, trainer = train(
    SigmoidNet,
    plain_config,
    checkpoint_name="model",
    experiment_dir=expt_dir,
)

Global seed set to 4995


LinearEmbedder(
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=32, bias=True)
    (1): LeakyReLU(negative_slope=0.05)
    (2): Linear(in_features=32, out_features=8, bias=True)
    (3): LeakyReLU(negative_slope=0.05)
    (4): Linear(in_features=8, out_features=4, bias=True)
    (5): LeakyReLU(negative_slope=0.05)
    (6): Linear(in_features=4, out_features=2, bias=True)
  )
)


  rank_zero_deprecation(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name  | Type           | Params
-----------------------------------------
0 | loss  | SupConLoss     | 0     
1 | model | LinearEmbedder | 950   
-----------------------------------------
950       Trainable params
0         Non-trainable params
950       Total params
0.004     Total estimated model params size (MB)


                                                                                      

  rank_zero_warn(
  rank_zero_warn(


Epoch 0:  72%|████████████▎    | 100/138 [00:00<00:00, 226.68it/s, loss=4.27, v_num=1]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                              | 0/28 [00:00<?, ?it/s][A
Epoch 0:  87%|██████████████▊  | 120/138 [00:00<00:00, 218.61it/s, loss=4.27, v_num=1][A
Validation DataLoader 0:  71%|████████████████▍      | 20/28 [00:00<00:00, 217.94it/s][A
Epoch 0: 100%|█| 138/138 [00:00<00:00, 204.98it/s, loss=4.33, v_num=1, val_loss=4.300][A
Epoch 1:  72%|▋| 100/138 [00:00<00:00, 162.33it/s, loss=4.17, v_num=1, val_loss=4.300,[A
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                              | 0/28 [00:00<?, ?it/s][A
Epoch 1:  87%|▊| 120/138 [00:00<00:00, 164.77it/s, loss=4.17, v_num=1, val_loss=4.300,[A
Validation DataLoader 0:  71%|████████████████▍      | 20/28 [00:00<00:00, 246.76it/s][A
Epoch 1: 100%|█| 138/138 [00:00<00:00, 175.23it/s, loss=4.13, v_num=1, val_loss=4.150,[A
Epoch 2:  72%|▋| 100/138 [00:00<00:

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [7]:
# Feature array that the cells below embed with the restored model.
X_fname = f"{data_dir}/processed/{expt_name}/synth_feats.npy"

# NOTE(review): this checkpoint path is hard-coded to the "linear2D"
# experiment (file "model-v13.ckpt") instead of being derived from
# `expt_name`; the saved output of this cell is a FileNotFoundError for exactly
# this path. Point it at a checkpoint that exists before running.
model_fname = f"{expt_dir}/linear2D/model-v13.ckpt"
system = SigmoidNet(config)

# `load_from_checkpoint` is a classmethod on PyTorch Lightning modules, so it
# is called on the class rather than on the `system` instance: the result is
# the same on older Lightning releases, and newer releases refuse the
# instance-call form. Assumes SigmoidNet is a LightningModule (the training
# output in this notebook is Lightning's) -- TODO confirm.
model = SigmoidNet.load_from_checkpoint(model_fname)

LinearEmbedder(
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=32, bias=True)
    (1): LeakyReLU(negative_slope=0.05)
    (2): Linear(in_features=32, out_features=8, bias=True)
    (3): LeakyReLU(negative_slope=0.05)
    (4): Linear(in_features=8, out_features=4, bias=True)
    (5): LeakyReLU(negative_slope=0.05)
    (6): Linear(in_features=4, out_features=2, bias=True)
  )
)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/johnzhou/research/decision-making/experiments/linear2D/model-v13.ckpt'

0.9 prew

In [None]:
# Embed the saved feature array with the restored model: add a singleton axis
# at position 1 before the forward pass, squeeze singleton axes from the
# output, and convert the result to a NumPy array.
feats_tensor = torch.from_numpy(np.load(X_fname)).float()
embedded_tensor = model(torch.unsqueeze(feats_tensor, 1))
X_embedded_mse = torch.squeeze(embedded_tensor).detach().numpy()
print(X_embedded_mse.shape)

# Rows before `mse_boundary` are drawn as "Synth MF", the remaining rows as
# "Synth MB"; `border` is the height of the horizontal reference line.
mse_boundary = 3995
border = -0.027

plt.figure()

mf_rows = X_embedded_mse[:mse_boundary]
mb_rows = X_embedded_mse[mse_boundary:]
mf = plt.scatter(mf_rows[:, 0], mf_rows[:, 1], color='blue', s=20)
mb = plt.scatter(mb_rows[:, 0], mb_rows[:, 1], color='red', s=10)

plt.legend((mf, mb), ('Synth MF', 'Synth MB'), loc='upper right')
plt.hlines(border, -.55, -.2)

plt.show()

In [None]:

# Separation statistics along embedding column 1 relative to `border`.
# The printed values are fractions (counts divided by counts); they are not
# multiplied by 100 even though the labels say "%".
mf_col1 = X_embedded_mse[:mse_boundary, 1]
mb_col1 = X_embedded_mse[mse_boundary:, 1]
all_col1 = X_embedded_mse[:, 1]

mf_over_count = np.sum(mf_col1 > border)
mb_under_count = np.sum(mb_col1 < border)

# Share of each group that lies on its expected side of the border.
print("% of MF that are over:", mf_over_count / mf_col1.size)
print("% of MB that are under:", mb_under_count / mb_col1.size)

# Share of each side of the border that belongs to the expected group.
print("% of over that are MF:", mf_over_count / np.sum(all_col1 > border))
print("% of under that are MB:", mb_under_count / np.sum(all_col1 < border))



1.0 prew

In [None]:
# Recompute the embedding of the saved feature array: singleton axis added at
# position 1 for the forward pass, singleton axes squeezed from the output,
# result converted to NumPy.
input_tensor = torch.from_numpy(np.load(X_fname)).float()
output_tensor = model(torch.unsqueeze(input_tensor, 1))
X_embedded_mse = torch.squeeze(output_tensor).detach().numpy()
print(X_embedded_mse.shape)

# Rows before `mse_boundary` are drawn as "Synth MF", the rest as "Synth MB".
mse_boundary = 42171
# x-positions of the vertical reference lines drawn over the scatter.
borders = [-0.585, -0.57, -0.555, -0.51, -0.45, -0.35, -0.277]

# X_embedded_mse = normalize_features(X_embedded_mse)

plt.figure()

mf_part = X_embedded_mse[:mse_boundary]
mb_part = X_embedded_mse[mse_boundary:]
mf = plt.scatter(mf_part[:, 0], mf_part[:, 1], color='blue', s=20)
mb = plt.scatter(mb_part[:, 0], mb_part[:, 1], color='red', s=10)
# real = plt.scatter(X_embedded_mse[mse_boundary1:, 0], X_embedded_mse[mse_boundary1:, 1], color='yellow', s=5)

plt.legend((mf, mb), ('Synth MF', 'Synth MB'), loc='upper right')
plt.vlines(borders, -0.6, 0, color='black')

plt.show()

In [None]:
# Separation statistics along embedding column 0, reported for every vertical
# threshold in `borders` (the x-positions drawn as vertical lines in this
# section's scatter plot). Previously this cell compared against the scalar
# `border`, which is only assigned in the earlier section as the height of a
# horizontal line, so the numbers depended on stale state from another cell.
# The printed values are fractions, not multiplied by 100 despite the "%".
mf_col0 = X_embedded_mse[:mse_boundary, 0]
mb_col0 = X_embedded_mse[mse_boundary:, 0]
all_col0 = X_embedded_mse[:, 0]

for x_border in borders:
    mf_left_count = np.sum(mf_col0 < x_border)
    mb_right_count = np.sum(mb_col0 > x_border)

    print(f"--- border = {x_border} ---")
    # Share of each group that lies on its expected side of the threshold.
    print("% of MF that are left:", mf_left_count / mf_col0.size)
    print("% of MB that are right:", mb_right_count / mb_col0.size)

    # Share of each side of the threshold that belongs to the expected group.
    print("% of left that are MF:", mf_left_count / np.sum(all_col0 < x_border))
    print("% of right that are MB:", mb_right_count / np.sum(all_col0 > x_border))




In [None]:
# Scatter of the two embedding columns without threshold lines. Rows before
# `mse_boundary` are drawn as "Synth MF", the remaining rows as "Synth MB".
mse_boundary = 42171

plt.figure()

mf_block = X_embedded_mse[:mse_boundary]
mb_block = X_embedded_mse[mse_boundary:]
mf = plt.scatter(mf_block[:, 0], mf_block[:, 1], color='blue', s=15)
mb = plt.scatter(mb_block[:, 0], mb_block[:, 1], color='red', s=10)
# real = plt.scatter(X_embedded_mse[mse_boundary1:, 0], X_embedded_mse[mse_boundary1:, 1], color='yellow', s=5)

plt.legend((mf, mb), ('Synth MF', 'Synth MB'), loc='upper right')

plt.show()