In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import logging
import os
import re
import json
import sys
import pickle
from tqdm import tqdm

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# HuggingFace packages
import transformers
import tokenizers
import torch

# more torch imports
import torchvision
import torchvision.transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from pathlib import Path
# Ask git for the repository's top-level directory. The IPython `!` capture
# yields the command output as a list of lines, so element 0 is the path.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
# Bare last expression: the notebook displays the resolved path.
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
# Project configuration object; later cells read paths such as
# config.metadata_filepath and config.torch_experiments_dir from it.
config = cbrec.genconfig.Config()
# Disabled overrides that would append "_old" to two of the configured file paths.
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.data
import cbrec.reccontext
import cbrec.evaluation
import cbrec.torchmodel
import cbrec.text.embeddingdb
import cbrec.text.journalid

In [None]:
import cbrec.logutils
cbrec.logutils.set_up_logging()

In [None]:
# turn off matplotlib logging
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
import sys
# NOTE(review): hard-coded, user-specific absolute path; this cell only works
# where the caringbridge_core checkout lives at exactly this location.
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
# Put the checkout on sys.path so the cbcore package below can be imported.
sys.path.append(caringbridge_core_path)
import cbcore.data.paths

In [None]:
# Load the cached feature matrices and label vectors for the train and test splits.
feature_cache_dir = os.path.join(
    cbcore.data.paths.projects_data_dir,
    'recsys-peer-match',
    'torch_experiments',
    'feature_cache',
)
# (features file, labels file) pairs: entry 0 is used as train, entry 1 as test
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]


def get_features(x_filename, y_filename):
    """
    Unpickle one features/labels pair from feature_cache_dir.

    x_filename: file name of the pickled feature object
    y_filename: file name of the pickled label object
    Returns the tuple (X, y) in that order.
    """
    loaded = []
    for cache_filename in (x_filename, y_filename):
        with open(os.path.join(feature_cache_dir, cache_filename), 'rb') as infile:
            loaded.append(pickle.load(infile))
    X, y = loaded
    return X, y


X_train, y_train = get_features(*filenames[0])

x_filename, y_filename = filenames[1]
X_test, y_test = get_features(x_filename, y_filename)
# Bare last expression: display the four shapes as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [None]:
# Display the first feature column of the training matrix.
X_train[:,0]

In [None]:
# Shuffle the training rows. A seeded generator is used so that a fresh
# Restart & Run All reproduces the same row order.
SHUFFLE_SEED = 0
inds = np.random.default_rng(SHUFFLE_SEED).permutation(len(X_train))
# index features and labels with the same permutation so rows stay aligned
X_train = X_train[inds]
y_train = y_train[inds]

In [None]:
# Standardise the features to speed up convergence. The scaler is fitted on
# the training split only and then applied unchanged to the test split.
import sklearn.preprocessing
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
class LinearNet(nn.Module):
    """
    Small feed-forward network: two ReLU hidden layers and a one-unit output.

    Dropout follows each hidden activation. The output is a raw logit (no
    sigmoid), intended for a loss that applies the sigmoid itself.
    """
    def __init__(self, n_input, n_hidden, dropout_p=0.2):
        """
        n_input: size of the last dimension of the input
        n_hidden: width of both hidden layers
        dropout_p: dropout probability applied after each hidden layer
        """
        super().__init__()
        # attribute names double as state_dict keys, so they are kept as-is
        self.fc1 = nn.Linear(n_input, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, 1, bias=False)  # output layer has no bias term
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.dropout2 = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return the raw logits for x; the last dimension of the result has size 1."""
        hidden1 = self.dropout1(F.relu(self.fc1(x)))
        hidden2 = self.dropout2(F.relu(self.fc2(hidden1)))
        return self.fc3(hidden2)

In [None]:
# Train a LinearNet on (X_train, y_train) with Adam + a one-cycle LR schedule,
# validating against (X_test, y_test) at a fixed epoch interval.
# Produces train_metrics / test_metrics arrays for the plotting cell below.
logger = logging.getLogger("notebook.ZentTorchExperiment")

n_train = len(y_train)
n_test = len(y_test)

# ---- hyperparameters ----
verbose = True
n_hidden = 100
n_epochs = 100
lr_init = 0.01
max_lr = 0.02  # 0.0155
dropout_p = 0.1
minibatch_size = len(y_train)  # full-batch by default
minibatch_size = min(n_train, minibatch_size) # if minibatch_size is larger than n_train, force it to n_train
n_minibatches = int(np.ceil(n_train / minibatch_size))

validation_rate = 0.1 # (vr) we will compute loss and accuracy against the validation set on vr of the epochs
# Work with an integer epoch interval: a float modulo such as epoch % (1/vr)
# only hits zero when 1/vr happens to be an exact integer.
validation_interval = max(1, int(round(1 / validation_rate)))
n_validations = int(np.ceil(n_epochs / validation_interval))

n_input = X_train.shape[1]
# NOTE(review): presumably 27 non-text features + 2 x 768 text features
# (source and candidate) — confirm against X_train.shape
net = LinearNet(n_input, n_hidden, dropout_p)

#optimizer = optim.SGD(net.parameters(), lr=lr_init, momentum=0.9)
optimizer = optim.Adam(net.parameters(), lr=lr_init)
# the scheduler is stepped once per minibatch, hence steps_per_epoch=n_minibatches
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=max_lr,
    steps_per_epoch=n_minibatches,
    epochs=n_epochs,
)

criterion = nn.BCEWithLogitsLoss()  # pointwise loss function; applies the sigmoid internally


X_test_tensor = torch.from_numpy(X_test)
y_test_tensor = torch.from_numpy(y_test)
X_train_tensor = torch.from_numpy(X_train)
y_train_tensor = torch.from_numpy(y_train)
y_train_tensor = y_train_tensor.view(-1, 1)  # make labels 2-dimensional
y_train_tensor = y_train_tensor.type_as(X_train_tensor)
# test labels as a 2-dimensional float tensor, matching the network output
y_test_target = y_test_tensor.unsqueeze(1).float()
if verbose:
    logger.info(f"Input tensor sizes: {X_train_tensor.size()}, {y_train_tensor.size()}")
    logger.info(f"Validating model every {validation_interval} epochs.")

# _metrics[0] -> epoch, _metrics[1] -> loss, _metrics[2] -> accuracy
test_metrics = np.zeros((3, n_validations + 1))  # +1 column for the final post-training evaluation
train_metrics = np.zeros((3, n_epochs))
n_validations_done = 0  # number of test_metrics columns filled by in-training validation

for epoch in range(n_epochs):
    s = datetime.now()
    # validation switches the net to eval mode, so re-enable dropout every epoch
    net.train()

    # shuffle the training data
    # This will randomize our minibatches at each epoch
    epoch_order = torch.randperm(n_train)

    mb_metrics = []  # store the minibatch_metrics, then average after
    for minibatch in range(n_minibatches):
        minibatch_start = minibatch * minibatch_size
        minibatch_end = min(minibatch_start + minibatch_size, n_train)
        if verbose and epoch == 0:
            logger.info(f"    Minibatch for inds in {minibatch_start} - {minibatch_end}.")
        minibatch_inds = epoch_order[minibatch_start:minibatch_end]

        inputs = X_train_tensor[minibatch_inds]
        train_labels = y_train_tensor[minibatch_inds]

        # clear gradients before every step; doing this only once per epoch
        # would accumulate gradients across minibatches
        optimizer.zero_grad()
        train_outputs = net(inputs)
        train_loss = criterion(train_outputs, train_labels)
        train_loss.backward()
        optimizer.step()
        scheduler.step()

        # compute and log the loss
        y_train_pred = torch.sigmoid(train_outputs.detach()).view((-1,)).numpy()
        y_train_pred = (y_train_pred >= 0.5).astype(int)  # binarize predictions with a 0.5 decision boundary
        y_train_minibatch = y_train[minibatch_inds.numpy()]
        train_acc = np.sum(y_train_pred == y_train_minibatch) / len(y_train_minibatch)

        mb_metrics.append((train_loss.item(), train_acc))
    # epoch-level metrics are the unweighted mean over minibatches
    train_loss, train_acc = np.mean(np.array(mb_metrics), axis=0)
    train_metrics[0,epoch] = epoch
    train_metrics[1,epoch] = train_loss
    train_metrics[2,epoch] = train_acc

    should_stop_early = train_loss < 0.001
    if verbose and (epoch < 5 or epoch == n_epochs - 1 or epoch % 10 == 0 or should_stop_early):
        logger.info(f"{epoch:>3} ({datetime.now() - s}): train loss={train_loss:.4f} train accuracy={train_acc*100:.2f}% LR={optimizer.param_groups[0]['lr']:.2E}")
    if should_stop_early:
        break

    if epoch % validation_interval == 0:
        net.eval()
        with torch.no_grad():
            test_outputs = net(X_test_tensor)
            test_loss = criterion(test_outputs, y_test_target).item()
            y_test_pred = torch.sigmoid(test_outputs).view((-1,)).numpy()
            y_test_pred = (y_test_pred >= 0.5).astype(int)
            test_acc = np.sum(y_test_pred == y_test) / len(y_test)
        logger.info(f"    {epoch:>3}: test loss={test_loss:.4f} test accuracy={test_acc*100:.2f}%")
        metric_ind = epoch // validation_interval
        test_metrics[0,metric_ind] = epoch
        test_metrics[1,metric_ind] = test_loss
        test_metrics[2,metric_ind] = test_acc
        n_validations_done = metric_ind + 1

# summary of the run (epoch count differs from n_epochs only after an early stop)
final_train_loss = train_loss
final_epoch_count = epoch + 1
if verbose:
    logger.info(f"Completed {final_epoch_count} epochs with a final train loss of {final_train_loss:.4f}.")

# final evaluation of the trained network on the test set
net.eval()
with torch.no_grad():
    outputs = net(X_test_tensor)
    # the loss must come from these fresh outputs, not from the outputs of
    # the last in-training validation pass
    test_loss = criterion(outputs, y_test_target).item()
    y_test_pred = torch.sigmoid(outputs).view((-1,)).numpy()
    y_test_pred = (y_test_pred >= 0.5).astype(int)
    acc = np.sum(y_test_pred == y_test) / len(y_test)
    logger.info(f"Test acc: {acc*100:.2f}%")

# Drop columns that were never filled (only possible after an early stop) so
# the plots do not show spurious zero points, then record the final
# evaluation in the last column.
train_metrics = train_metrics[:, :final_epoch_count]
test_metrics = test_metrics[:, :n_validations_done + 1]
test_metrics[0, -1] = epoch
test_metrics[1, -1] = test_loss
test_metrics[2, -1] = acc


In [None]:
# Draw one figure per recorded metric. Row 0 of each metrics array is the
# epoch (x axis); rows 1 and 2 are the loss and the accuracy.
for metric_row, figure_title in ((1, 'Model Loss'), (2, 'Model Accuracy')):
    plt.plot(train_metrics[0], train_metrics[metric_row])
    plt.plot(test_metrics[0], test_metrics[metric_row])
    plt.legend(["Train", "Test"])
    plt.title(figure_title)
    plt.xlabel("Epoch")
    plt.show()


In [None]:
# Load both cached feature sets and pool them into a single (X, y_true) dataset.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# cache files this project wrote itself.
feature_cache_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'torch_experiments', 'feature_cache')
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]
Xs = []
ys = []
for x_filename, y_filename in filenames:
    with open(os.path.join(feature_cache_dir, x_filename), 'rb') as infile:
        X = pickle.load(infile)
        Xs.append(X)
    with open(os.path.join(feature_cache_dir, y_filename), 'rb') as infile:
        y = pickle.load(infile)
        ys.append(y)

# stack the splits row-wise
X = np.concatenate(Xs, axis=0)
y_true = np.concatenate(ys, axis=0)
# features and labels must stay row-aligned for the shuffle below to be valid
assert len(X) == len(y_true), f"row mismatch: {len(X)} feature rows vs {len(y_true)} labels"

# Shuffle the data. A seeded generator keeps the row order reproducible
# across kernel restarts.
inds = np.random.default_rng(0).permutation(len(X))
X = X[inds]
y_true = y_true[inds]

In [None]:
# Train the packaged TorchModel on the pooled (X, y_true) data and report its metrics.
config = cbrec.genconfig.Config()
torch_model = cbrec.torchmodel.TorchModel(config)
torch_model.set_training_data(X, y_true)
logger.info(f"Using training data in shape X={torch_model.X.shape}, y={torch_model.y_true.shape}.")

logger.info("Training model")
torch_model.train_model()

logger.info("Model performance metrics")
# NOTE(review): the positional True is presumably show_graph (a later cell
# passes show_graph=False by keyword) — confirm against TorchModel.
torch_model.save_model_metrics(True)

In [None]:
# Build a compact UTC timestamp string (YYYYmmddHHMMSS).
# Import the class rather than the module: other cells in this notebook call
# datetime.now(), which breaks if the name `datetime` is rebound to the module.
from datetime import datetime
datetime.utcnow().strftime("%Y%m%d%H%M%S")

In [None]:
# (Re)bind the notebook logger; getLogger returns the same logger object for the same name.
logger = logging.getLogger("notebook.ZentTorchExperiment")

In [None]:
# Re-run the metric saving step of the trained torch_model, this time with show_graph=False.
logger.info("Model performance metrics")
torch_model.save_model_metrics(show_graph=False)

In [None]:
# Stream the metadata file and keep only the entries whose 'type' is 'test'.
test_md_list = [md for md in cbrec.utils.stream_metadata_list(config.metadata_filepath) if md['type'] == 'test']
# Bare last expression: display how many test entries were found.
len(test_md_list)

In [None]:
# load rc_list from pickle
# NOTE: datetime.now() requires `datetime` to be the class imported via
# `from datetime import datetime`, not the datetime module.
s = datetime.now()
# NOTE(review): this rebinds feature_cache_dir to a path under
# config.torch_experiments_dir, whereas other cells build it from
# cbcore.data.paths.projects_data_dir — confirm both point to the same place.
feature_cache_dir = os.path.join(config.torch_experiments_dir, 'feature_cache')
with open(os.path.join(feature_cache_dir, 'rc_test_notext_2000.pkl'), 'rb') as infile:
    rc_list = pickle.load(infile)
# report the number of loaded objects and the elapsed wall-clock time
print(f"Loaded {len(rc_list)} RecContexts in {datetime.now() - s}.")

In [None]:

class LinearNet(nn.Module):
    """
    Two-hidden-layer feed-forward network that emits a single raw logit.

    Each hidden layer is Linear -> ReLU -> Dropout. No sigmoid is applied to
    the output; it is left to the loss (or to the caller at scoring time).
    """
    def __init__(self, n_input, n_hidden, dropout_p=0.2):
        """
        n_input: size of the last dimension of the input
        n_hidden: width of both hidden layers
        dropout_p: dropout probability applied after each hidden layer
        """
        super().__init__()
        # keep these attribute names: saved state_dicts are keyed by them
        self.fc1 = nn.Linear(n_input, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, 1, bias=False)  # bias-free output layer
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.dropout2 = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return the raw logits for x; the last dimension of the result has size 1."""
        hidden_stages = ((self.fc1, self.dropout1), (self.fc2, self.dropout2))
        for linear, dropout in hidden_stages:
            x = dropout(F.relu(linear(x)))
        return self.fc3(x)
    

In [None]:
# Create a fresh TorchModel wrapper and attach a LinearNet restored from a saved checkpoint.
torch_model = cbrec.torchmodel.TorchModel(config)

# n_hidden and n_input must match the layer shapes stored in the checkpoint
# for load_state_dict to succeed.
dropout_p = 0.1
n_hidden = 100
# 1563 = 12 + 12 + 3 + 768 + 768, i.e. the feature-array sizes listed in a later cell of this notebook
n_input = 1563

model_cache_dir = os.path.join(config.torch_experiments_dir, 'model_cache')
torch_model.net = LinearNet(n_input, n_hidden, dropout_p)
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
torch_model.net.load_state_dict(torch.load(os.path.join(model_cache_dir, 'LinearNet_20211007_e1400.pt')))

In [None]:
import cbrec.evaluation
class CustomModelScorer(cbrec.evaluation.Scorer):
    """Scorer that scores a single RecContext with a trained torch network."""

    def __init__(self, config, test_context: cbrec.reccontext.RecContext,
                net, # TODO pass in a model object here, if appropriate
                model_name="CustomModel"):
        """
        config, test_context: forwarded to the base Scorer (with
            coverage_tracker=None and save_scores=True)
        net: network called on test_context.X_test by score()
        model_name: key under which the metrics are computed and returned
        """
        super().__init__(config, test_context, coverage_tracker=None, save_scores=True)
        self.net = net
        self.model_name = model_name

    def score(self):
        """
        Score the RecContext held in self.test_context.

        Runs self.net over test_context.X_test, converts the logits to
        probabilities with a sigmoid, arranges them in the layout of the
        'full' score array, passes that through merge_multisource_rankings
        and reduce_usp_ranking_to_site, and computes the metrics.

        Returns self.metrics_dict[self.model_name].
        """
        features = torch.from_numpy(self.test_context.X_test)
        self.net.eval()
        with torch.no_grad():
            logits = self.net(features)
            y_score = torch.sigmoid(logits.detach()).view((-1,)).numpy()

        # The empty 'full' array is used only for its shape: the flat scores are
        # reshaped with the two dimensions swapped and then transposed back.
        full_shape = self.get_empty_score_arr('full').shape
        y_score_mat = y_score.reshape((full_shape[1], full_shape[0])).T

        merged_ranking = self.merge_multisource_rankings(y_score_mat)
        y_score_site = self.reduce_usp_ranking_to_site(merged_ranking)
        self.compute_metrics(y_score_site, model_name=self.model_name)

        return self.metrics_dict[self.model_name]

In [None]:
# Build a scorer for the first loaded RecContext, using the restored network.
scorer = CustomModelScorer(config, rc_list[0], torch_model.net)
# Bare last expression: display the scorer object.
scorer

In [None]:
# Score that RecContext; the returned metrics dictionary is displayed as the cell output.
scorer.score()

# Testing cbrec/modeling

In [None]:
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager

# Train through the cbrec.modeling API with a default ModelConfig.
config = cbrec.genconfig.Config()
model_config = cbrec.modeling.modelconfig.ModelConfig()
manager = cbrec.modeling.manager.ModelManager(model_config, config)
# NOTE(review): X_train / y_train are whatever earlier cells left in the kernel
# (possibly already shuffled and scaled) — confirm that is the intended input.
manager.train_model(X_train, y_train)
#model_config.as_dict()

In [None]:
# Dump the model configuration, then print the position within column_keys
# of every column listed in preprocess_drop_columns.
print(model_config.as_dict())
for col in model_config.preprocess_drop_columns:
    print(model_config.column_keys.index(col))

In [None]:
import cbrec.modeling.preprocess

# Feature preprocessor driven by the current model_config; used by the cells below.
preprocesser = cbrec.modeling.preprocess.FeaturePreprocessor(model_config)

In [None]:
# Show the shape and two sample columns before preprocessing...
print(X_train.shape)
print(X_train[:,4])
print(X_train[:,5])

# NOTE(review): X_train is overwritten, so re-running this cell feeds
# already-preprocessed data back into preprocess().
X_train = preprocesser.preprocess(X_train)



# ...and the same view afterwards, for comparison.
print(X_train.shape)
print(X_train[:,4])
print(X_train[:,5])

In [None]:
#model_config.as_dict()
# Print every entry of the configured preprocess_encode_columns list.
for hot_encoding in model_config.preprocess_encode_columns:
    print(hot_encoding)

In [None]:
# Before/after view of remove_feature_columns: shape, column 1 and the config.
print(X_train.shape)
print(X_train[:,1])

print(model_config.as_dict())

print("after:")
# NOTE(review): X_train is overwritten, so this cell is not idempotent; the
# config is printed again presumably because the call may modify it — confirm.
X_train = preprocesser.remove_feature_columns(X_train)
print(model_config.as_dict())

print(X_train.shape)
print(X_train[:,1])

In [None]:
# Reload the raw (unscaled, unshuffled) train and test features from the cache.
feature_cache_dir = os.path.join(
    cbcore.data.paths.projects_data_dir,
    'recsys-peer-match',
    'torch_experiments',
    'feature_cache',
)
# (features file, labels file) pairs: entry 0 is used as train, entry 1 as test
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]


def get_features(x_filename, y_filename):
    """
    Unpickle one features/labels pair from feature_cache_dir.

    Returns the tuple (X, y), in that order.
    """
    def _unpickle(cache_filename):
        with open(os.path.join(feature_cache_dir, cache_filename), 'rb') as infile:
            return pickle.load(infile)

    X = _unpickle(x_filename)
    y = _unpickle(y_filename)
    return X, y


X_train, y_train = get_features(*filenames[0])

x_filename, y_filename = filenames[1]
X_test, y_test = get_features(x_filename, y_filename)
print("train/test shape")
# last statement of the cell: displays the four shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# X = np.concatenate((X_train, X_test), axis=0)
# y_true = np.concatenate((y_train, y_test), axis=0)
# print("concat shape")
# X.shape, y_true.shape

In [None]:
import math
# Count how many values of each duration-like feature column fall into
# coarse buckets.
# NOTE(review): the edges (1/60, 1, 24, 24*7, 24*365) suggest the unit is hours,
# i.e. minute / hour / day / week / year — confirm.
# The last finite edge is one year (24*365), matching the bins used by the
# other histogram cells in this notebook; 24*7*365 would be seven years.
duration_bins = [0, 1/60, 1, 24, 24*7, 24*365, math.inf]
for col in (4, 6, 8, 10, 11):
    print(np.histogram(X_train[:, col], bins=duration_bins))

In [None]:
# NOTE(review): called without arguments here; whether ModelManager.set_training_data
# needs the data passed in cannot be told from this notebook — confirm.
manager.set_training_data()

In [None]:
# Train through the ModelManager on the current X_train / y_train arrays.
manager.train_model(X_train, y_train)

In [None]:
# Build a comma-terminated, single-quoted list of per-element feature names
# ('<array name>_<index>',) for every feature array, in dictionary order.
# The mapping is deliberately NOT named `list`: rebinding the builtin breaks
# every later `isinstance(x, list)` / `list(...)` call in the kernel.
feature_arr_sizes = {"source_feature_arr": 12, "candidate_feature_arr": 12, "source_candidate_feature_arr": 3, "source_text_arr": 768, "candidate_text_arr": 768}

out_str = "".join(
    "'" + key + "_" + str(i) + "',"
    for key, value in feature_arr_sizes.items()
    for i in range(value)
)

#print(out_str)


In [None]:
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager
import cbrec.modeling.preprocess

config = cbrec.genconfig.Config()
model_config = cbrec.modeling.modelconfig.ModelConfig()

# Exercise the FeatureManager lookups with a few example queries and print the results.
# NOTE(review): "*" looks like a wildcard and feature_descriptor_inverse like
# a negated match — confirm against FeatureManager.
feature_manager = cbrec.modeling.preprocess.FeatureManager(model_config)
print(feature_manager.get_feature_index("source-indegree"))
print(feature_manager.get_feature_indices("*", 'text', feature_descriptor_inverse = True))
print(feature_manager.get_feature_indices("source", 'indegree'))
print(feature_manager.get_feature_indices("source", '*'))

In [None]:
from collections import OrderedDict
# Scratch check: print the type of an empty list and test whether an
# OrderedDict counts as a list.
temp = OrderedDict()
temp2 = []
print(type(temp2))
# NOTE: this needs `list` to still be the builtin type; if the name has been
# rebound to something else in the kernel, isinstance raises TypeError.
isinstance(temp, list)

In [None]:
# Load both cached feature sets and pool them into a single (X, y_true) dataset.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# cache files this project wrote itself.
feature_cache_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'torch_experiments', 'feature_cache')
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]
Xs = []
ys = []
for x_filename, y_filename in filenames:
    with open(os.path.join(feature_cache_dir, x_filename), 'rb') as infile:
        X = pickle.load(infile)
        Xs.append(X)
    with open(os.path.join(feature_cache_dir, y_filename), 'rb') as infile:
        y = pickle.load(infile)
        ys.append(y)

# stack the splits row-wise
X = np.concatenate(Xs, axis=0)
y_true = np.concatenate(ys, axis=0)
# features and labels must stay row-aligned for the shuffle below to be valid
assert len(X) == len(y_true), f"row mismatch: {len(X)} feature rows vs {len(y_true)} labels"

# Shuffle the data. A seeded generator keeps the row order reproducible
# across kernel restarts.
inds = np.random.default_rng(0).permutation(len(X))
X = X[inds]
y_true = y_true[inds]

In [None]:
# Quartile-bin feature column 1. With labels=False qcut returns each value's
# integer bin index, and retbins=True additionally returns the bin edges.
new_values, bins = pd.qcut(X[:,1], 4, labels=False, retbins=True)
print(X[:,1])
print(new_values)
print(bins)
# scratch checks: how range objects and the type of an int print
print(range(5))
print(range(0, 5))
print(type(5))

In [None]:
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager
from cbrec.modeling.preprocess import OneHotEncoding

logger = logging.getLogger("notebook.ZentTorchExperiment")

# Experiment configuration: drop no columns, train for a single epoch.
model_config = cbrec.modeling.modelconfig.ModelConfig()
model_config.preprocess_drop_columns = []
model_config.experiment_name = '6_quant_duration_2'
model_config.train_n_epochs = 1

# Disabled: one encoding per (usp_type, interaction type) duration feature.
# for usp_type in ['source', 'candidate']:
#     for int_type in ['journal', 'amp', 'comment', 'guestbook']:
#         model_config.preprocess_encode_columns.append(OneHotEncoding(usp_type + "-" + int_type + "_time_to_most_recent", 6))
#     model_config.preprocess_encode_columns.append(OneHotEncoding(usp_type + "-" + "time_to_first_update",6))
# Only this single feature is encoded.
# NOTE(review): the second argument (6) is presumably the number of bins — confirm.
model_config.preprocess_encode_columns.append(OneHotEncoding("source-time_to_first_update",6))

# NOTE(review): an earlier cell constructs ModelManager(model_config, config);
# here only model_config is passed — confirm the second argument is optional.
model_manager = cbrec.modeling.manager.ModelManager(model_config)
logger.info(model_manager.model_config.output_basename)

# train on the pooled (X, y_true) data, then persist the model
model_manager.train_model(X, y_true)
model_manager.save_model()

logger.info("Finished training and saving model")

In [None]:
# Reload a stored experiment from its JSON file and plot its recorded training curves.
direct = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/LinearNet_wo_duration_1_20211118184004.json"
# with open(direct, 'rb') as infile:
#     val_metrics = pickle.load(infile)
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager

#mm = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', experiment_name = "wo_encode_dur_1")
mm = cbrec.modeling.manager.ModelManager.load_from_filepath(direct)
mm.load_model(load_model_state_dict=True, load_training_metrics=True)

train_metrics, test_metrics = mm.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics goes on x, column 1 on y
# (presumably epoch vs. loss — confirm). Test is drawn first, then train.
for curve_label, curve_metrics in (('Test', test_metrics), ('Train', train_metrics)):
    xs = curve_metrics.T[:,0]
    ys = curve_metrics.T[:,1]
    ax.plot(xs, ys, label=curve_label)

ax.legend()

plt.show()

In [None]:
# Reload a second stored experiment from its JSON file and plot its recorded training curves.
direct = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/LinearNet_encode_dur_1_20211104231737.json"
# with open(direct, 'rb') as infile:
#     val_metrics = pickle.load(infile)
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager

#mm = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', experiment_name = "wo_encode_dur_1")
mm = cbrec.modeling.manager.ModelManager.load_from_filepath(direct)
mm.load_model(load_model_state_dict=True, load_training_metrics=True)

train_metrics, test_metrics = mm.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics goes on x, column 1 on y
# (presumably epoch vs. loss — confirm). Test is drawn first, then train.
curves = (('Test', test_metrics), ('Train', train_metrics))
for curve_label, curve_metrics in curves:
    xs = curve_metrics.T[:,0]
    ys = curve_metrics.T[:,1]
    ax.plot(xs, ys, label=curve_label)

ax.legend()

plt.show()

In [None]:
import math
# Overlay histograms of the duration feature columns, restricted to values
# below one year.
# NOTE(review): 365*24 and the bin edges below suggest the unit is hours — confirm.
one_year = 365*24
for col in (4, 6, 8, 10):
    subset = X[:, col]
    subset = subset[subset < one_year]
    # label each histogram so the overlaid distributions can be told apart
    plt.hist(subset, bins=100, label=f"feature {col}")
    print(f'max value:{max(subset)} of {len(subset)}')
plt.title(f"Distribution of duration features less than 1 year")
plt.xlabel("")
plt.ylabel("Count")
plt.legend()
plt.show()

# The last finite edge is one year (24*365), consistent with the filter above;
# 24*7*365 would put the boundary at seven years.
print(np.histogram(X[:,4], bins=[0, 1/60, 1, 24, 24*7, 24*365, math.inf]))

In [None]:
import math
# Overlay histograms of the duration feature columns, restricted to values
# of at least 365*24, and print the smallest retained value per column.
one_year = 365*24
for col in (4, 6, 8, 10):
    subset = X[:, col]
    subset = subset[subset >= one_year]
    plt.hist(subset, bins=100)
    print(f'min value:{min(subset)} of {len(subset)}')
plt.title(f"Distribution of duration features over 1 year")
plt.xlabel("")
plt.ylabel("Count")
plt.show()

# bucket counts for column 4 over the full value range
duration_bins = [0, 1/60, 1, 24, 24*7, 24*365, math.inf]
print(np.histogram(X[:,4], bins=duration_bins))

In [None]:
# Histogram of feature column 8 over the pooled dataset. The title is built
# from the same column index that is plotted, so the two cannot disagree.
col = 8
plt.hist(X[:, col], bins=100)
plt.title(f"Distribution of {len(X)} feature {col}")
plt.xlabel("")
plt.ylabel("Count")
plt.show()

In [None]:
# Histogram of feature column 10 over the pooled dataset.
col = 10
plt.hist(X[:, col], bins=100)
plt.title(f"Distribution of {len(X)} feature {col}")
plt.xlabel("")
plt.ylabel("Count")
plt.show()

In [None]:
# Histogram of feature column 11 over the pooled dataset.
col = 11
plt.hist(X[:, col], bins=100)
plt.title(f"Distribution of {len(X)} feature {col}")
plt.xlabel("")
plt.ylabel("Count")
plt.show()

In [None]:
import math
# Print bucket counts and edges for each duration-like feature column of the
# pooled dataset, using one shared set of bin edges.
duration_bins = [0, 1/60, 1, 24, 24*7, 24*365, math.inf]
for col in (4, 6, 8, 10, 11):
    print(np.histogram(X[:, col], bins=duration_bins))