In [1]:
import pandas as pd
import numpy as np
import os

from gene_variation_effects import config
from gene_variation_effects.modeling import NNPipeLine, MLP, data_shuffle_split, run_training_loop
import torch

# One seed drives both the data split (passed explicitly further down) and the
# global framework RNGs seeded here.
seed = 42
# Seed the global NumPy and PyTorch generators so that any later step drawing
# from them is repeatable under Restart Kernel -> Run All.
# NOTE(review): presumably MLP weight initialisation uses the torch global
# generator -- confirm in gene_variation_effects.modeling.
np.random.seed(seed)
torch.manual_seed(seed)

DATASET_NAME = 'dataset.csv'
TARGET_FEATURE = 'ClinSigSimple'

# Read the raw table once, then derive the target column and the feature frame
# from it without mutating the loaded frame.
raw_df = pd.read_csv(os.path.join(config.DATA_DIR, DATASET_NAME))
target_array = raw_df[TARGET_FEATURE]
df = raw_df.drop(columns=[TARGET_FEATURE])

# Column groups handed to NNPipeLine, one list per kind of feature handling.
ONEHOT_FEATURES = ['Type']
EMBEDDED_FEATURES = ['GeneSymbol']
NUMERIC_FEATURES = ['VariantLength', 'VariantLengthDifference']
# The pipeline is given the feature columns only (the target has been removed).
pipeline = NNPipeLine(df.columns, ONEHOT_FEATURES, EMBEDDED_FEATURES, NUMERIC_FEATURES)

[32m2025-11-01 15:42:07.776[0m | [1mINFO    [0m | [36mgene_variation_effects.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/jeffbrin/Desktop/gene-variation-effects-ml[0m


In [2]:
# Partition the features and target into training / validation / test sets.
# TEST_SIZE and VALIDATION_SIZE are forwarded to data_shuffle_split together
# with the shared seed.
TEST_SIZE = 0.05
VALIDATION_SIZE = 0.05
(
    training_data,
    validation_data,
    test_data,
    training_target_array,
    validation_target_array,
    test_target_array,
) = data_shuffle_split(df.to_numpy(), target_array, TEST_SIZE, VALIDATION_SIZE, seed)

In [3]:
# Sanity check: NaN values in the input previously broke the network, so
# display whether any entry of the feature matrix is missing.
np.any(pd.isna(df.to_numpy()))

np.False_

In [4]:
def _as_float_tensor(values):
    """Convert an array-like (ndarray, pandas Series or CPU tensor) to float32.

    Going through ``np.asarray`` means the helper accepts both the pandas /
    NumPy objects produced by the split and the tensors this cell itself
    produces, so the cell can be re-run without errors.
    """
    return torch.as_tensor(np.asarray(values), dtype=torch.float32)


# Fit the feature transformations on the training split only, then apply the
# fitted processor to the validation and test splits.
transformed_X_training, feature_processor = pipeline.fit_feature_transformations(training_data)
transformed_X_training = _as_float_tensor(transformed_X_training)
transformed_X_validation = _as_float_tensor(feature_processor.transform(validation_data))
transformed_X_test = _as_float_tensor(feature_processor.transform(test_data))

# Targets become (n, 1) float32 column tensors. reshape(-1, 1) yields the same
# shape whether the input is still the 1-D Series from the split or already the
# (n, 1) tensor left by a previous run of this cell.
training_target_array = _as_float_tensor(training_target_array).reshape(-1, 1)
validation_target_array = _as_float_tensor(validation_target_array).reshape(-1, 1)
test_target_array = _as_float_tensor(test_target_array).reshape(-1, 1)

In [5]:
# TODO: Double-check this logic; the author was unsure it is correct.
# TODO: Move to pipeline
# Derive the input size of each embedding layer from the fitted label encoder.
embedding_processor = feature_processor.named_transformers_['high_cardinality']
label_encoder = embedding_processor.named_steps['label_encode']

# Reserve one additional category when the encoder is configured with
# handle_unknown == 'use_encoded_value' (a slot for potential unknown values).
extra_cat_for_potential_unknown = int(label_encoder.handle_unknown == 'use_encoded_value')
embedding_input_sizes = [
    known_categories.size + extra_cat_for_potential_unknown
    for known_categories in label_encoder.categories_
]

In [6]:
# Only one embedded feature exists at the moment, so its embedding size is set
# by hand here; further embedded features could be hard-coded the same way
# unless this gets automated.
HIDDEN_SIZES = [100]
gene_vocab_size = embedding_input_sizes[0]
# Embedding width: fourth root of the number of categories, capped at 50.
GENE_LABELS_EMBEDDING_SIZE = min(50, int(gene_vocab_size ** 0.25))
# One row per embedded feature: [input size, embedding size].
embedding_dimension_mapping = np.array([[gene_vocab_size, GENE_LABELS_EMBEDDING_SIZE]])
model = MLP(embedding_dimension_mapping, HIDDEN_SIZES, transformed_X_training.shape[1])

In [7]:
EPOCHS = 100

# Map every transformed feature name to its column position, then keep the
# positions of the columns produced by the 'high_cardinality' transformer;
# those are the columns routed through the embedding.
feature_names = feature_processor.get_feature_names_out()
features_indices = dict(zip(feature_names, range(len(feature_names))))
embedding_features_indices = [
    position
    for name, position in features_indices.items()
    if name.startswith('high_cardinality')
]

best_trained_model = run_training_loop(
    model,
    EPOCHS,
    transformed_X_training,
    transformed_X_validation,
    training_target_array,
    validation_target_array,
    embedding_features_indices,
)

[32m2025-11-01 15:42:18.231[0m | [1mINFO    [0m | [36mgene_variation_effects.modeling.train[0m:[36mrun_training_loop[0m:[36m50[0m - [1mEpoch 0: loss=0.6644[0m
[32m2025-11-01 15:42:18.242[0m | [1mINFO    [0m | [36mgene_variation_effects.modeling.train[0m:[36mrun_training_loop[0m:[36m51[0m - [1mEpoch 0: validation loss=0.6644[0m
[32m2025-11-01 15:42:18.273[0m | [1mINFO    [0m | [36mgene_variation_effects.modeling.train[0m:[36mrun_training_loop[0m:[36m50[0m - [1mEpoch 1: loss=0.6517[0m
[32m2025-11-01 15:42:18.273[0m | [1mINFO    [0m | [36mgene_variation_effects.modeling.train[0m:[36mrun_training_loop[0m:[36m51[0m - [1mEpoch 1: validation loss=0.6517[0m
[32m2025-11-01 15:42:18.297[0m | [1mINFO    [0m | [36mgene_variation_effects.modeling.train[0m:[36mrun_training_loop[0m:[36m50[0m - [1mEpoch 2: loss=0.6400[0m
[32m2025-11-01 15:42:18.297[0m | [1mINFO    [0m | [36mgene_variation_effects.modeling.train[0m:[36mrun_training_loop