<a href="https://colab.research.google.com/github/jananibabu17112004-ctrl/variant-pathogenicity-streamlit17/blob/main/geneprj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!ls



drive  sample_data


In [None]:
import streamlit as st
import numpy as np
import pickle
import matplotlib.pyplot as plt
import tensorflow as tf

# ----------------------------------
# PAGE CONFIG
# ----------------------------------
st.set_page_config(page_title="Variant Classification Demo", layout="centered")
# The title and subtitle had been stored as mojibake (UTF-8 bytes decoded with
# the wrong codec); the intended DNA emoji and en dash are restored here.
st.title("🧬 Genetic Variant Classification")
st.write("Transformer vs BiLSTM – Deep Learning Comparison")

BASE_PATH = "."

# ----------------------------------
# LOAD ENCODERS
# ----------------------------------
@st.cache_resource
def load_encoders():
    """Unpickle and return the object stored at ``{BASE_PATH}/encoders.pkl``.

    The function is wrapped in ``st.cache_resource``. The module indexes the
    returned object by column name (``"GeneSymbol"``, ``"Type"``, ...) right
    after calling this.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only ship an
    ``encoders.pkl`` that this project produced itself.
    """
    encoder_path = f"{BASE_PATH}/encoders.pkl"
    with open(encoder_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

encoders = load_encoders()

gene_le = encoders["GeneSymbol"]
type_le = encoders["Type"]
review_le = encoders["ReviewStatus"]
assembly_le = encoders["Assembly"]

# ----------------------------------
# SAFE ENCODING FUNCTION
# ----------------------------------
def safe_encode(le, value):
    """Encode ``value`` with ``le``, falling back to ``0`` for unseen values.

    Args:
        le: Encoder exposing ``classes_`` and ``transform``.
        value: Raw category to encode.

    Returns:
        The first element of ``le.transform([value])`` when ``value`` is in
        ``le.classes_``; otherwise ``0``. Note that ``0`` is also a valid
        code, so callers cannot tell the fallback from the first class.
    """
    if value not in le.classes_:
        return 0
    encoded = le.transform([value])
    return encoded[0]

# ----------------------------------
# BUILD DEMO MODELS (LIGHTWEIGHT)
# ----------------------------------
@st.cache_resource
def build_transformer():
    """Build and compile the demo network offered as the "Transformer" option.

    The graph is Embedding -> Dense -> Flatten -> Dense -> softmax over a
    4-element integer input. Despite the name it contains no attention layer,
    and nothing here loads or fits weights, so the returned model carries
    whatever Keras initialised.

    Returns:
        A compiled ``tf.keras.Model`` producing 3 class probabilities.
    """
    layers = tf.keras.layers

    feature_codes = tf.keras.Input(shape=(4,))
    hidden = layers.Embedding(100000, 64)(feature_codes)
    hidden = layers.Dense(256, activation="relu")(hidden)
    hidden = layers.Flatten()(hidden)
    hidden = layers.Dense(128, activation="relu")(hidden)
    class_probs = layers.Dense(3, activation="softmax")(hidden)

    network = tf.keras.Model(feature_codes, class_probs)
    network.compile(loss="sparse_categorical_crossentropy")
    return network

@st.cache_resource
def build_bilstm():
    """Build and compile the demo BiLSTM comparison network.

    The graph is Embedding -> Bidirectional(LSTM(64)) -> Dense -> softmax
    over a 4-element integer input. Nothing here loads or fits weights, so
    the returned model carries whatever Keras initialised.

    Returns:
        A compiled ``tf.keras.Model`` producing 3 class probabilities.
    """
    layers = tf.keras.layers

    feature_codes = tf.keras.Input(shape=(4,))
    embedded = layers.Embedding(100000, 64)(feature_codes)
    recurrent = layers.Bidirectional(layers.LSTM(64))(embedded)
    hidden = layers.Dense(128, activation="relu")(recurrent)
    class_probs = layers.Dense(3, activation="softmax")(hidden)

    network = tf.keras.Model(feature_codes, class_probs)
    network.compile(loss="sparse_categorical_crossentropy")
    return network

transformer_model = build_transformer()
bilstm_model = build_bilstm()

# ----------------------------------
# USER INPUTS
# ----------------------------------
model_choice = st.selectbox(
    "Select Model",
    ["Transformer (Base Model)", "BiLSTM (Comparison Model)"]
)

gene = st.selectbox("Gene Symbol", sorted(gene_le.classes_))
variant_type = st.selectbox("Variant Type", sorted(type_le.classes_))
review_status = st.selectbox("Review Status", sorted(review_le.classes_))
assembly = st.selectbox("Genome Assembly", sorted(assembly_le.classes_))

# ----------------------------------
# PREDICTION
# ----------------------------------
if st.button("🔍 Predict"):
    # The button label above had been stored as mojibake; the intended
    # magnifying-glass emoji is restored.

    # One row holding the four encoded categorical features, in the order the
    # demo models were built for: gene, type, review status, assembly.
    input_seq = np.array([[
        safe_encode(gene_le, gene),
        safe_encode(type_le, variant_type),
        safe_encode(review_le, review_status),
        safe_encode(assembly_le, assembly)
    ]])

    # Pick the network matching the selectbox entry; [0] takes the single
    # row out of the batch of predictions.
    if model_choice.startswith("Transformer"):
        probs = transformer_model.predict(input_seq, verbose=0)[0]
    else:
        probs = bilstm_model.predict(input_seq, verbose=0)[0]

    labels = ["BENIGN", "VUS", "PATHOGENIC"]
    predicted_label = labels[int(np.argmax(probs))]
    confidence = float(np.max(probs))

    st.success(f"Prediction: **{predicted_label}**")
    st.info(f"Confidence: **{confidence:.2f}**")

    # ----------------------------------
    # PROBABILITY BAR CHART
    # ----------------------------------
    fig, ax = plt.subplots()
    ax.bar(labels, probs)
    ax.set_ylim(0, 1)
    ax.set_ylabel("Probability")
    ax.set_title("Prediction Probabilities")

    # Print each probability just above its bar.
    for i, v in enumerate(probs):
        ax.text(i, v + 0.02, f"{v:.2f}", ha="center")

    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; without
    # this, each click of the button would leave another figure in memory.
    plt.close(fig)


In [None]:
!pip install streamlit


In [None]:
Runtime → Restart runtime


In [None]:
import streamlit as st
print("Streamlit installed!")


In [None]:
def demo_predict(gene, variant_type, review_status, assembly):
    """Print per-class probabilities and the top class for one variant.

    Each argument is encoded with ``safe_encode`` and its matching global
    encoder (``gene_le``, ``type_le``, ``review_le``, ``assembly_le``). The
    four codes form a single row that is passed to the global ``model``.

    Args:
        gene: Gene symbol.
        variant_type: Variant type string.
        review_status: Review status string.
        assembly: Genome assembly name.

    Returns:
        None; the results are printed.
    """
    encoder_value_pairs = (
        (gene_le, gene),
        (type_le, variant_type),
        (review_le, review_status),
        (assembly_le, assembly),
    )
    encoded_row = [safe_encode(enc, raw) for enc, raw in encoder_value_pairs]
    input_seq = np.array([encoded_row])  # a batch containing one row

    probs = model.predict(input_seq, verbose=0)[0]
    labels = ["BENIGN", "VUS", "PATHOGENIC"]

    for name, prob in zip(labels, probs):
        print(f"{name}: {prob:.2f}")

    winner = labels[np.argmax(probs)]
    print("\nPrediction:", winner)


In [None]:
demo_predict(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


In [None]:
import numpy as np


In [None]:
!ls


In [None]:
demo_predict(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


In [None]:
import numpy as np

# Safe encoding function
def safe_encode(le, value):
    """Return the code ``le`` assigns to ``value``, or ``0`` if it is unseen.

    ``le`` must expose ``classes_`` and ``transform``. The fallback ``0`` is
    also a valid code, so an unseen value is indistinguishable from the
    first class.
    """
    is_known = value in le.classes_
    return le.transform([value])[0] if is_known else 0


In [None]:
gene_le = encoders["GeneSymbol"]
type_le = encoders["Type"]
review_le = encoders["ReviewStatus"]
assembly_le = encoders["Assembly"]


In [None]:
import pickle

with open("encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

print("Encoders loaded successfully!")


In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!ls /content/drive/MyDrive/UG_Project


In [None]:
import pickle

ENCODER_PATH = "/content/drive/MyDrive/UG_Project/encoders.pkl"

with open(ENCODER_PATH, "rb") as f:
    encoders = pickle.load(f)

print("Encoders loaded successfully")
print(encoders.keys())


In [None]:
gene_le = encoders["GeneSymbol"]
type_le = encoders["Type"]
review_le = encoders["ReviewStatus"]
assembly_le = encoders["Assembly"]

print("All encoders ready")


In [None]:
import tensorflow as tf

# Location of the saved Keras archive on the mounted Google Drive.
MODEL_PATH = "/content/drive/MyDrive/UG_Project/transssv_style_model.keras"

# compile=False: the model is loaded without compiling it.
# NOTE(review): safe_mode=False presumably lifts Keras's guard against unsafe
# lambda deserialization, which would let the archive run arbitrary code on
# load. Confirm against the Keras docs and only use this with an archive the
# project produced itself.
model = tf.keras.models.load_model(
    MODEL_PATH,
    compile=False,
    safe_mode=False
)

print("Model loaded successfully")


In [None]:
import tensorflow as tf
import numpy as np

# Simple demo inference model
inp = tf.keras.Input(shape=(4,))
x = tf.keras.layers.Embedding(100000, 64)(inp)
x = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64)
)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
out = tf.keras.layers.Dense(3, activation="softmax")(x)

model = tf.keras.Model(inp, out)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy"
)

print("Demo inference model ready")


In [None]:
def demo_predict(gene, variant_type, review_status, assembly):
    """Encode one variant, run the global ``model`` and print the outcome.

    The four inputs are encoded with ``safe_encode`` and the global encoders
    ``gene_le``, ``type_le``, ``review_le`` and ``assembly_le``, then passed
    to ``model.predict`` as a single row.

    Args:
        gene: Gene symbol.
        variant_type: Variant type string.
        review_status: Review status string.
        assembly: Genome assembly name.

    Returns:
        None; probabilities and the winning label are printed.
    """
    gene_code = safe_encode(gene_le, gene)
    type_code = safe_encode(type_le, variant_type)
    review_code = safe_encode(review_le, review_status)
    assembly_code = safe_encode(assembly_le, assembly)
    input_data = np.array([[gene_code, type_code, review_code, assembly_code]])

    class_names = ["BENIGN", "VUS", "PATHOGENIC"]
    probs = model.predict(input_data, verbose=0)[0]

    print("Prediction probabilities:")
    for name, prob in zip(class_names, probs):
        print(f"{name}: {prob:.2f}")

    winner = class_names[np.argmax(probs)]
    print("\nFinal Prediction:", winner)


In [None]:
demo_predict(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


In [None]:
import pandas as pd

# Define the path to the gzipped text file
file_path = '/variant_summary.txt.gz'

# Read the gzipped text file into a pandas DataFrame.
# Assuming it's a tab-separated file. If it's comma-separated or another delimiter,
# you might need to adjust the 'delimiter' or 'sep' parameter.
try:
    df = pd.read_csv(file_path, compression='gzip', sep='\t')
    print(f"Successfully loaded '{file_path}' into a DataFrame.")
    print("Displaying the first 5 rows:")
    display(df.head())
    print("\nDataFrame Info:")
    df.info()
except Exception as e:
    print(f"Error loading the file: {e}")
    print("Please check if the file format (e.g., delimiter) is correct, or if the file is truly a text file.")

In [None]:
!ls



In [None]:
!ls


In [None]:
import numpy as np

def predict_variant(gene, variant_type, review_status, assembly):
    """Predict the clinical-significance label for a single variant.

    Inputs are encoded with the global encoders (``gene_le``, ``type_le``,
    ``review_le``, ``assembly_le``) and passed to the global ``model`` as a
    dict keyed by its named inputs.

    Args:
        gene: Gene symbol.
        variant_type: Variant type string.
        review_status: Review status string.
        assembly: Genome assembly name.

    Returns:
        ``(label, confidence)`` where ``label`` is one of ``"BENIGN"``,
        ``"VUS"``, ``"PATHOGENIC"`` and ``confidence`` is the largest
        predicted value rounded to 3 decimals.
    """
    def _code_for(encoder, raw):
        # Values the encoder has never seen fall back to code 0 rather than
        # raising.
        if raw not in encoder.classes_:
            return 0
        return encoder.transform([raw])[0]

    gene_enc = _code_for(gene_le, gene)
    type_enc = _code_for(type_le, variant_type)
    review_enc = _code_for(review_le, review_status)
    assembly_enc = _code_for(assembly_le, assembly)

    input_data = {
        "gene_input": np.array([gene_enc]),
        "type_input": np.array([type_enc]),
        "review_input": np.array([review_enc]),
        "assembly_input": np.array([assembly_enc]),
    }

    pred = model.predict(input_data, verbose=0)
    # argmax/max run over the whole prediction array, which holds one row.
    best_index = int(np.argmax(pred))
    best_score = float(np.max(pred))

    index_to_label = dict(enumerate(("BENIGN", "VUS", "PATHOGENIC")))
    return index_to_label[best_index], round(best_score, 3)


In [None]:
result, conf = predict_variant(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)

print("Prediction:", result)
print("Confidence:", conf)


In [None]:
import pickle

BASE_PATH = "/content/drive/MyDrive/UG_Project"

with open(f"{BASE_PATH}/encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

gene_le = encoders["GeneSymbol"]
type_le = encoders["Type"]
review_le = encoders["ReviewStatus"]
assembly_le = encoders["Assembly"]

print("Encoders loaded successfully!")


In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

BASE_PATH = "/content/drive/MyDrive/UG_Project"

# Load cleaned dataset
df = pd.read_csv(f"{BASE_PATH}/variant_summary_cleaned1.csv")

encoders = {}

for col in ["GeneSymbol", "Type", "ReviewStatus", "Assembly"]:
    le = LabelEncoder()
    le.fit(df[col])
    encoders[col] = le

print("Encoders rebuilt successfully!")


In [None]:
with open(f"{BASE_PATH}/encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

print("Encoders saved successfully!")


In [None]:
with open(f"{BASE_PATH}/encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

gene_le = encoders["GeneSymbol"]
type_le = encoders["Type"]
review_le = encoders["ReviewStatus"]
assembly_le = encoders["Assembly"]

print("Encoders loaded:", len(gene_le.classes_))


In [5]:
result, conf = predict_variant(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)

print(result, conf)


NameError: name 'predict_variant' is not defined

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Concatenate, Dropout,
    LayerNormalization, Flatten, MultiHeadAttention, Reshape
)
from tensorflow.keras.models import Model

# Vocabulary sizes from encoders: one embedding row per class each encoder
# knows about.
num_genes = len(gene_le.classes_)
num_types = len(type_le.classes_)
num_reviews = len(review_le.classes_)
num_assembly = len(assembly_le.classes_)

# Inputs: four named single-value inputs. The names are the dict keys the
# prediction helpers and model.fit calls in this notebook use.
gene_input = Input(shape=(1,), name="gene_input")
type_input = Input(shape=(1,), name="type_input")
review_input = Input(shape=(1,), name="review_input")
assembly_input = Input(shape=(1,), name="assembly_input")

# Embeddings: output widths are 64 / 8 / 16 / 4 for gene / type / review /
# assembly.
gene_emb = Embedding(num_genes, 64)(gene_input)
type_emb = Embedding(num_types, 8)(type_input)
review_emb = Embedding(num_reviews, 16)(review_input)
assembly_emb = Embedding(num_assembly, 4)(assembly_input)

# Concatenate the flattened embeddings into one 64+8+16+4 = 92-wide vector.
x = Concatenate()([
    Flatten()(gene_emb),
    Flatten()(type_emb),
    Flatten()(review_emb),
    Flatten()(assembly_emb)
])

# Projection to the 256-wide space used by the attention block.
x_proj = Dense(256, activation="relu")(x)

# Attention block (NO Lambda): Reshape layers add and remove a length-1
# sequence axis around MultiHeadAttention, so query and key/value are the
# same single position.
x_seq = Reshape((1, 256))(x_proj)
attn = MultiHeadAttention(num_heads=4, key_dim=64)(x_seq, x_seq)
attn = Reshape((256,))(attn)

# Residual connection around the attention output, then layer norm.
x_attn = LayerNormalization()(x_proj + attn)

# Feed-forward sub-block with its own residual connection and layer norm.
ffn = Dense(256, activation="relu")(x_attn)
x_attn = LayerNormalization()(x_attn + ffn)

# Classifier head ending in a 3-way softmax.
x = Dense(128, activation="relu")(x_attn)
x = Dropout(0.3)(x)
x = Dense(64, activation="relu")(x)
output = Dense(3, activation="softmax")(x)

# Build model. It is not compiled in this cell.
model = Model(
    inputs=[gene_input, type_input, review_input, assembly_input],
    outputs=output
)

print("Model rebuilt successfully!")


In [None]:
BASE_PATH = "/content/drive/MyDrive/UG_Project"

model.load_weights(f"{BASE_PATH}/transssv_style_model.keras")
print("Weights loaded successfully!")


In [6]:
# Use a SMALL subset just for demo
X_demo = X_train.sample(50000, random_state=42)
y_demo = y_train.loc[X_demo.index]

history = model.fit(
    {
        "gene_input": X_demo.iloc[:,0],
        "type_input": X_demo.iloc[:,1],
        "review_input": X_demo.iloc[:,2],
        "assembly_input": X_demo.iloc[:,3],
    },
    y_demo,
    epochs=3,
    batch_size=256
)


NameError: name 'X_train' is not defined

In [None]:
import pandas as pd

BASE_PATH = "/content/drive/MyDrive/UG_Project"

X_train = pd.read_csv(f"{BASE_PATH}/X_train.csv")
y_train = pd.read_csv(f"{BASE_PATH}/y_train.csv")

print(X_train.shape, y_train.shape)


In [None]:
# Small subset ONLY for demo
X_demo = X_train.sample(n=50000, random_state=42)
# Select the matching target rows by index label. Plain `y_train[...]` is a
# column lookup when y_train is a DataFrame (as returned by read_csv) and
# raises KeyError; `.loc` selects rows for both DataFrame and Series.
y_demo = y_train.loc[X_demo.index]


In [None]:
# Convert y_train to 1D Series
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.iloc[:, 0]


In [None]:
X_demo = X_train.sample(n=50000, random_state=42)
y_demo = y_train.loc[X_demo.index]


In [None]:
X_demo = X_train.sample(
    n=50000,
    random_state=42
)
# Take the targets for exactly the rows sampled above. Sampling X and y
# independently only lines up while both share the same seed and length;
# selecting by index keeps features and labels paired regardless.
y_demo = y_train.loc[X_demo.index]


In [None]:
history = model.fit(
    {
        "gene_input": X_demo.iloc[:, 0].values,
        "type_input": X_demo.iloc[:, 1].values,
        "review_input": X_demo.iloc[:, 2].values,
        "assembly_input": X_demo.iloc[:, 3].values,
    },
    y_demo.values,
    epochs=3,
    batch_size=256,
    verbose=1
)


In [None]:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

print("Model compiled successfully!")


In [7]:
history = model.fit(
    {
        "gene_input": X_demo.iloc[:, 0].values,
        "type_input": X_demo.iloc[:, 1].values,
        "review_input": X_demo.iloc[:, 2].values,
        "assembly_input": X_demo.iloc[:, 3].values,
    },
    y_demo.values,
    epochs=3,
    batch_size=256,
    verbose=1
)


NameError: name 'model' is not defined

In [None]:
result, conf = predict_variant(
    gene="HFE",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh38"
)

print("Prediction:", result)
print("Confidence:", conf)


In [None]:
predict_variant(...)
# or
predict_variant_with_probs(...)


In [None]:
# BRCA2
predict_variant(
    gene="BRCA2",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)

# TP53
predict_variant(
    gene="TP53",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)

# MLH1
predict_variant(
    gene="MLH1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh37"
)


In [None]:
# ACTB
predict_variant(
    gene="ACTB",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh37"
)

# GAPDH
predict_variant(
    gene="GAPDH",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh38"
)

# RPLP0
predict_variant(
    gene="RPLP0",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh37"
)


In [8]:
# Deletion
predict_variant(
    gene="AP5Z1",
    variant_type="Deletion",
    review_status="no assertion criteria provided",
    assembly="GRCh37"
)

# Duplication
predict_variant(
    gene="DMD",
    variant_type="Duplication",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


NameError: name 'predict_variant' is not defined

In [None]:
predict_variant(
    gene="ZNF592",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh38"
)


In [None]:
predict_variant_with_probs(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


In [9]:
import numpy as np
import matplotlib.pyplot as plt

def predict_and_plot(
    gene, variant_type, review_status, assembly
):
    """Predict class probabilities for one variant and show them as a bar chart.

    Inputs are encoded with the global encoders (``gene_le``, ``type_le``,
    ``review_le``, ``assembly_le``) and passed to the global ``model`` as a
    dict keyed by its named inputs.

    Args:
        gene: Gene symbol.
        variant_type: Variant type string.
        review_status: Review status string.
        assembly: Genome assembly name.

    Returns:
        ``(pred_label, probs)``: the label with the highest probability and
        the first row of ``model.predict``'s output.
    """
    def _encode(encoder, raw):
        # Values the encoder has never seen fall back to code 0.
        if raw in encoder.classes_:
            return encoder.transform([raw])[0]
        return 0

    feature_sources = {
        "gene_input": (gene_le, gene),
        "type_input": (type_le, variant_type),
        "review_input": (review_le, review_status),
        "assembly_input": (assembly_le, assembly),
    }
    input_data = {
        input_name: np.array([_encode(encoder, raw)])
        for input_name, (encoder, raw) in feature_sources.items()
    }

    probs = model.predict(input_data, verbose=0)[0]
    labels = ["BENIGN", "VUS", "PATHOGENIC"]

    # ---- Plot ----
    plt.figure()
    plt.bar(labels, probs)
    plt.ylim(0, 1)
    plt.ylabel("Probability")
    plt.title("Clinical Significance Prediction")

    # Write each probability just above its bar.
    for bar_index, height in enumerate(probs):
        plt.text(bar_index, height + 0.02, f"{height:.2f}", ha="center")

    plt.show()

    top_index = int(np.argmax(probs))
    return labels[top_index], probs


In [10]:
label, probs = predict_and_plot(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)

print("Predicted class:", label)


NameError: name 'gene_le' is not defined

In [None]:
predict_and_plot(
    gene="TP53",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


In [None]:
predict_and_plot(
    gene="ZNF592",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh38"
)


In [None]:
result, conf = predict_variant(
    gene="ACTB",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh37"
)

print(result, conf)


In [None]:
result, conf = predict_variant(
    gene="ACTB",
    variant_type="single nucleotide variant",
    review_status="no assertion criteria provided",
    assembly="GRCh37"
)

print(result, conf)


In [None]:
def predict_variant_with_probs(gene, variant_type, review_status, assembly):
    """Return the three class probabilities for one variant as a dict.

    Inputs are encoded with the global encoders (``gene_le``, ``type_le``,
    ``review_le``, ``assembly_le``) and passed to the global ``model`` as a
    dict keyed by its named inputs.

    Args:
        gene: Gene symbol.
        variant_type: Variant type string.
        review_status: Review status string.
        assembly: Genome assembly name.

    Returns:
        ``{"BENIGN": p0, "VUS": p1, "PATHOGENIC": p2}`` with each value
        rounded to 3 decimals.
    """
    def _encode(encoder, raw):
        # Values the encoder has never seen fall back to code 0.
        if raw not in encoder.classes_:
            return 0
        return encoder.transform([raw])[0]

    gene_code = _encode(gene_le, gene)
    type_code = _encode(type_le, variant_type)
    review_code = _encode(review_le, review_status)
    assembly_code = _encode(assembly_le, assembly)

    input_data = {
        "gene_input": np.array([gene_code]),
        "type_input": np.array([type_code]),
        "review_input": np.array([review_code]),
        "assembly_input": np.array([assembly_code]),
    }

    probs = model.predict(input_data, verbose=0)[0]

    class_names = ("BENIGN", "VUS", "PATHOGENIC")
    return {
        name: round(float(probs[position]), 3)
        for position, name in enumerate(class_names)
    }


In [None]:
predict_variant_with_probs(
    gene="BRCA1",
    variant_type="single nucleotide variant",
    review_status="criteria provided, multiple submitters, no conflicts",
    assembly="GRCh38"
)


In [None]:
!gunzip variant_summary.txt.gz


In [None]:
!gunzip variant_summary.txt.gz


In [None]:
!rm variant_summary.txt.gz


In [11]:
!ls -lh


total 8.0K
drwx------ 5 root root 4.0K Feb  3 17:27 drive
drwxr-xr-x 1 root root 4.0K Dec  9 14:42 sample_data


In [12]:
!rm variant_summary.txt.gz


rm: cannot remove 'variant_summary.txt.gz': No such file or directory


In [13]:
!ls


drive  sample_data


This code attempts to load your gzipped text file into a pandas DataFrame, assuming it's a tab-separated file. If the file uses a different delimiter (like commas), you'll need to modify `sep='\t'` to `sep=','` or another appropriate delimiter.

Once loaded, you can perform various operations like filtering, analysis, or visualization on this DataFrame for your project.

Let me know if this is what you were expecting or if you need help with further analysis!

In [14]:
!wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz



--2026-02-03 17:28:04--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.31, 130.14.250.7, 130.14.250.10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.31|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 422086052 (403M) [application/x-gzip]
Saving to: ‘variant_summary.txt.gz’


2026-02-03 17:28:14 (39.9 MB/s) - ‘variant_summary.txt.gz’ saved [422086052/422086052]



In [15]:
!gunzip variant_summary.txt.gz


In [16]:
rm variant_summary.txt.gz


rm: cannot remove 'variant_summary.txt.gz': No such file or directory


In [17]:
mv variant_summary.txt.gz.1 variant_summary.txt.gz


mv: cannot stat 'variant_summary.txt.gz.1': No such file or directory


In [18]:
!ls -lh

total 3.6G
drwx------ 5 root root 4.0K Feb  3 17:27 drive
drwxr-xr-x 1 root root 4.0K Dec  9 14:42 sample_data
-rw-r--r-- 1 root root 3.6G Feb  1 22:42 variant_summary.txt


In [19]:
# -------------------------------------------------------------
# ClinVar Variant Preprocessing Script
# Purpose: Clean and preprocess variant_summary.txt.gz for ML/DL analysis
# Author: janjas
# -------------------------------------------------------------

import pandas as pd

# ------------------ Step 1: Load Data -----------------------
file_path = "variant_summary.txt.gz"  # path to your ClinVar file

# Columns necessary for variant impact analysis. Declared before loading so
# that only these are parsed and held in memory.
columns_to_keep = [
    'VariationID',         # Unique variant ID
    'GeneSymbol',          # Gene name
    'ClinicalSignificance',# Pathogenicity info
    'ReviewStatus',        # Review/curation status
    'Type',                # Variant type (SNV, deletion, insertion, etc.)
    'Assembly'             # Genome assembly (GRCh38/GRCh37)
]

print("Loading gzipped file, this may take a few minutes...")
# Read just the header row so the full column list can still be reported
# without loading any data.
available_columns = pd.read_csv(
    file_path, compression='gzip', sep='\t', nrows=0
).columns.tolist()

# usecols restricts parsing to the columns kept below; loading every column
# of the multi-GB table only to drop most of them wastes memory.
df = pd.read_csv(
    file_path,
    compression='gzip',
    sep='\t',
    usecols=columns_to_keep,
    low_memory=False
)

# The column count in this shape now reflects the selected columns only.
print(f"Original data shape: {df.shape}")
print("Columns available:")
print(available_columns)

# ------------------ Step 2: Select Relevant Columns ------------------
# usecols keeps the file's column order; re-index to the declared order.
df = df[columns_to_keep]
print(f"Data shape after column selection: {df.shape}")

# ------------------ Step 3: Handle Missing Data ------------------
# Drop rows where essential info is missing
df = df.dropna(subset=['GeneSymbol', 'ClinicalSignificance'])
print(f"Data shape after dropping missing values: {df.shape}")

# ------------------ Step 4: Remove Duplicates ------------------
df = df.drop_duplicates()
print(f"Data shape after removing duplicates: {df.shape}")

# ------------------ Step 5: Standardize Clinical Significance ------------------
# Convert all to uppercase and remove leading/trailing spaces
df['ClinicalSignificance'] = df['ClinicalSignificance'].str.upper().str.strip()

# Fold the "likely" labels into their main class and shorten the VUS label.
# Only exact matches are replaced; any other value is left as-is.
df['ClinicalSignificance'] = df['ClinicalSignificance'].replace({
    'LIKELY PATHOGENIC': 'PATHOGENIC',
    'UNCERTAIN SIGNIFICANCE': 'VUS',  # Variant of uncertain significance
    'LIKELY BENIGN': 'BENIGN'
})

# ------------------ Step 6: Filtering ------------------
# Keep only the three target classes. The steps below read df_filtered, so to
# keep all variants replace the right-hand side with `df` rather than
# commenting this line out.
df_filtered = df[df['ClinicalSignificance'].isin(['PATHOGENIC', 'VUS', 'BENIGN'])]
print(f"Data shape after filtering clinical significance: {df_filtered.shape}")

# ------------------ Step 7: Save Cleaned Data ------------------
output_file = "variant_summary_cleaned.csv"
df_filtered.to_csv(output_file, index=False)
print(f"Cleaned data saved to {output_file}")

# ------------------ Step 8: Quick Summary ------------------
print("Summary of Clinical Significance counts:")
print(df_filtered['ClinicalSignificance'].value_counts())


Loading gzipped file, this may take a few minutes...


FileNotFoundError: [Errno 2] No such file or directory: 'variant_summary.txt.gz'

In [None]:
import pandas as pd

file_path = "variant_summary.txt.gz"
output_file = "variant_summary_cleaned1.csv"

# Define the columns to keep
columns_to_keep = [
    'VariationID', 'GeneSymbol', 'ClinicalSignificance',
    'ReviewStatus', 'Type', 'Assembly'
]

# Tracks whether the first cleaned chunk (and with it the CSV header) has
# been written during this run.
header_written = False

# Process file in chunks
chunk_size = 500000  # number of rows per chunk (adjust if needed)
for chunk in pd.read_csv(file_path, compression='gzip', sep='\t', usecols=columns_to_keep, chunksize=chunk_size):

    # Drop rows with missing essential info
    chunk = chunk.dropna(subset=['GeneSymbol', 'ClinicalSignificance'])

    # Remove duplicates in this chunk (duplicates that straddle two chunks
    # are not detected)
    chunk = chunk.drop_duplicates()

    # Standardize clinical significance
    chunk['ClinicalSignificance'] = chunk['ClinicalSignificance'].str.upper().str.strip()
    chunk['ClinicalSignificance'] = chunk['ClinicalSignificance'].replace({
        'LIKELY PATHOGENIC': 'PATHOGENIC',
        'UNCERTAIN SIGNIFICANCE': 'VUS',
        'LIKELY BENIGN': 'BENIGN'
    })

    # Keep only PATHOGENIC, VUS, BENIGN
    chunk = chunk[chunk['ClinicalSignificance'].isin(['PATHOGENIC', 'VUS', 'BENIGN'])]

    # The first chunk overwrites any file left by an earlier run; later chunks
    # append. Appending every chunk would stack a second copy of the data
    # (with a second header row) onto an existing CSV whenever the cell is
    # re-run.
    chunk.to_csv(
        output_file,
        mode='a' if header_written else 'w',
        index=False,
        header=not header_written
    )
    header_written = True

print(f"Memory-efficient preprocessing done. Cleaned data saved to {output_file}")


In [None]:
import pandas as pd
df = pd.read_csv("variant_summary_cleaned1.csv")
print(df.shape)
print(df['ClinicalSignificance'].value_counts())
print(df.head())


In [None]:
# -------------------------------------------------------------
# ClinVar Preprocessing for Deep Learning
# Author: janjas
# Purpose: Encode categorical features, target, and split train/test
# -------------------------------------------------------------

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ------------------ Step 1: Load Cleaned Data ------------------
df = pd.read_csv("variant_summary_cleaned1.csv")
print(f"Loaded data shape: {df.shape}")

# ------------------ Step 2: Encode Target ---------------------
# Map ClinicalSignificance to integers
target_mapping = {'BENIGN': 0, 'VUS': 1, 'PATHOGENIC': 2}
df['ClinicalSignificance_encoded'] = df['ClinicalSignificance'].map(target_mapping)

# ------------------ Step 3: Encode Categorical Features -------
# Every fitted encoder is kept in `encoders`, keyed by column name. Without
# this the loop below overwrites `le` on each pass and only the last encoder
# survives, so Type and ReviewStatus could not be encoded again at
# inference time.
encoders = {}

# High-cardinality feature: GeneSymbol
gene_le = LabelEncoder()
df['GeneSymbol_encoded'] = gene_le.fit_transform(df['GeneSymbol'])
encoders['GeneSymbol'] = gene_le

# Smaller categorical features
small_categorical = ['Type', 'ReviewStatus', 'Assembly']
for col in small_categorical:
    # `le` stays a module-level name for cells that still read it; after the
    # loop it refers to the encoder of the LAST column ('Assembly') only.
    le = LabelEncoder()
    df[col + "_encoded"] = le.fit_transform(df[col])
    encoders[col] = le

# ------------------ Step 4: Prepare Features and Target -------
feature_cols = ['GeneSymbol_encoded', 'Type_encoded', 'ReviewStatus_encoded', 'Assembly_encoded']
X = df[feature_cols]
y = df['ClinicalSignificance_encoded']

print(f"Features shape: {X.shape}, Target shape: {y.shape}")

# ------------------ Step 5: Train/Test Split ------------------
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ------------------ Step 6: Save Preprocessed Data -------------
# index=False: the row index is not written to the files.
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("Preprocessing complete. Train/test datasets saved!")



In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Dropout, LayerNormalization
from tensorflow.keras.models import Model

# Vocabulary size for each embedding.
# The gene count comes from its dedicated encoder. The other three are taken
# from the encoded columns themselves (largest code + 1). They must NOT be
# read from `le`: that name is whatever encoder the preprocessing loop fitted
# last (Assembly), so using it for all three gave the Type and ReviewStatus
# embeddings too few rows for their codes.
num_genes = len(gene_le.classes_)
num_types = int(df['Type_encoded'].max()) + 1 if 'Type_encoded' in df.columns else len(df['Type'].unique())
num_reviews = int(df['ReviewStatus_encoded'].max()) + 1 if 'ReviewStatus_encoded' in df.columns else len(df['ReviewStatus'].unique())
num_assembly = int(df['Assembly_encoded'].max()) + 1 if 'Assembly_encoded' in df.columns else len(df['Assembly'].unique())

# Define input shapes: four named single-value inputs
gene_input = Input(shape=(1,), name="gene_input")
type_input = Input(shape=(1,), name="type_input")
review_input = Input(shape=(1,), name="review_input")
assembly_input = Input(shape=(1,), name="assembly_input")

# Embeddings
gene_emb = Embedding(input_dim=num_genes, output_dim=64)(gene_input)
type_emb = Embedding(input_dim=num_types, output_dim=8)(type_input)
review_emb = Embedding(input_dim=num_reviews, output_dim=16)(review_input)
assembly_emb = Embedding(input_dim=num_assembly, output_dim=4)(assembly_input)

# Flatten embedding outputs
gene_emb_flat = tf.keras.layers.Flatten()(gene_emb)
type_emb_flat = tf.keras.layers.Flatten()(type_emb)
review_emb_flat = tf.keras.layers.Flatten()(review_emb)
assembly_emb_flat = tf.keras.layers.Flatten()(assembly_emb)

# Concatenate all features
x = Concatenate()([
    gene_emb_flat,
    type_emb_flat,
    review_emb_flat,
    assembly_emb_flat
])

# Optional Transformer-style block (self-attention)
# Project to higher dim
x_proj = Dense(256, activation="relu")(x)
# Self-attention over a length-1 sequence: `[:, None, :]` inserts the
# sequence axis MultiHeadAttention expects.
attn_output = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=64)(x_proj[:, None, :], x_proj[:, None, :])
# Flatten back
attn_flat = tf.keras.layers.Flatten()(attn_output)

# Combine the raw concatenated embeddings with the attention output and
# normalize
x_combined = Concatenate()([x, attn_flat])
x_norm = LayerNormalization()(x_combined)

# Dense layers
x_dense = Dense(128, activation="relu")(x_norm)
x_drop = Dropout(0.3)(x_dense)
x_dense2 = Dense(64, activation="relu")(x_drop)

# Output: 3 classes (benign, VUS, pathogenic)
output = Dense(3, activation="softmax")(x_dense2)

# Build and compile
model = Model(
    inputs=[gene_input, type_input, review_input, assembly_input],
    outputs=output
)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Dropout, LayerNormalization
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report, confusion_matrix

# ------------------ Load Train/Test Data ------------------
# Feature and label splits are read from CSV files in the working directory.
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

# ------------------ Define Input Sizes --------------------
# An Embedding table must have input_dim strictly greater than the largest id
# it will be asked to look up, so each table is sized as max id + 1 over the
# full dataset (covers ids that occur only in the test split).
# `df` is NOT loaded in this cell: it has to be in the kernel already, from the
# cell that read variant_summary_cleaned.csv.
# int(...) turns the NumPy scalar into a plain Python int for the layer config.
num_genes = int(df['GeneSymbol_encoded'].max()) + 1
num_types = int(df['Type_encoded'].max()) + 1
num_reviews = int(df['ReviewStatus_encoded'].max()) + 1
num_assembly = int(df['Assembly_encoded'].max()) + 1

# ------------------ Build Transformer-style Model ---------
# One scalar integer input per encoded categorical feature. The input names
# are the keys of the train_inputs / test_inputs dicts built below.
gene_input = Input(shape=(1,), name="gene_input")
type_input = Input(shape=(1,), name="type_input")
review_input = Input(shape=(1,), name="review_input")
assembly_input = Input(shape=(1,), name="assembly_input")

gene_emb = Embedding(input_dim=num_genes, output_dim=64)(gene_input)
type_emb = Embedding(input_dim=num_types, output_dim=8)(type_input)
review_emb = Embedding(input_dim=num_reviews, output_dim=16)(review_input)
assembly_emb = Embedding(input_dim=num_assembly, output_dim=4)(assembly_input)

# (batch, 1, d) -> (batch, d) for every embedding
gene_flat = tf.keras.layers.Flatten()(gene_emb)
type_flat = tf.keras.layers.Flatten()(type_emb)
review_flat = tf.keras.layers.Flatten()(review_emb)
assembly_flat = tf.keras.layers.Flatten()(assembly_emb)

# Single feature vector of width 64 + 8 + 16 + 4 = 92
x = Concatenate()([gene_flat, type_flat, review_flat, assembly_flat])

# Transformer-style attention block
x_proj = Dense(256, activation='relu')(x)

# Give the projection a sequence axis of length 1 for MultiHeadAttention.
# A Reshape layer is used instead of Lambda(lambda ...): Lambda layers that
# wrap Python lambdas are saved as serialized code, which is why loading this
# model later in the notebook needed safe_mode=False /
# enable_unsafe_deserialization(). Reshape is a built-in layer and
# round-trips through model.save() / load_model() without that.
# The layer names are kept so anything that looks layers up by name still works.
x_seq = tf.keras.layers.Reshape((1, 256), name="expand_dims_for_mha")(x_proj)

# Self-attention over the one-step sequence (query == value)
attn_output = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=64)(x_seq, x_seq)

# Remove the sequence axis again: (batch, 1, 256) -> (batch, 256)
attn_output = tf.keras.layers.Reshape((256,), name="squeeze_after_mha")(attn_output)

# Residual connection + Layer Normalization
x_norm = LayerNormalization()(x_proj + attn_output)

# Dense layers after attention
x_dense = Dense(128, activation='relu')(x_norm)
x_drop = Dropout(0.3)(x_dense)
x_dense2 = Dense(64, activation='relu')(x_drop)
output = Dense(3, activation='softmax')(x_dense2)

model = Model(inputs=[gene_input, type_input, review_input, assembly_input], outputs=output)

# Integer class ids are used as targets, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ------------------ Prepare Inputs ------------------------
# Keys must match the Input layer names declared above.
train_inputs = {
    "gene_input": X_train['GeneSymbol_encoded'].values,
    "type_input": X_train['Type_encoded'].values,
    "review_input": X_train['ReviewStatus_encoded'].values,
    "assembly_input": X_train['Assembly_encoded'].values
}

test_inputs = {
    "gene_input": X_test['GeneSymbol_encoded'].values,
    "type_input": X_test['Type_encoded'].values,
    "review_input": X_test['ReviewStatus_encoded'].values,
    "assembly_input": X_test['Assembly_encoded'].values
}

# ------------------ Train Model ---------------------------
# 10% of the training data is held out by Keras for validation.
history = model.fit(
    train_inputs,
    y_train.values,
    validation_split=0.1,
    epochs=15,          # start small; increase after testing
    batch_size=512
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out set: the model returns one probability per class and the
# predicted label is the index of the largest one.
y_pred_probs = model.predict(test_inputs, batch_size=256)
y_pred = y_pred_probs.argmax(axis=1)

# Per-class precision / recall / F1.
report_text = classification_report(
    y_test, y_pred, target_names=["BENIGN", "VUS", "PATHOGENIC"]
)
print("Classification Report:")
print(report_text)

# Raw counts of true class (rows) versus predicted class (columns).
counts_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(counts_matrix)


In [None]:
# Write the current model to baseline_dl_model.h5 in the working directory
# (the .h5 suffix selects the HDF5 file format).
model.save("baseline_dl_model.h5")


In [None]:
# Write the same model again in the native .keras format; this is the file
# the next cell reloads.
model.save("baseline_dl_model.keras")


In [None]:
# Reload the model written by the previous cell and rebind `model` to the
# loaded copy.
# NOTE(review): if the saved model contains Lambda layers built from Python
# lambdas, this call may be rejected unless unsafe deserialization is enabled
# -- confirm against the architecture that was saved.
from tensorflow.keras.models import load_model
model = load_model("baseline_dl_model.keras")


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization, MultiHeadAttention

# Fix: Define the 'mha' object using parameters from the previous model definition.
# This is a stand-alone experiment: it creates new layers on top of whatever
# tensor is currently bound to `x` in the kernel and does not alter the
# already-constructed `model`.
mha = MultiHeadAttention(num_heads=4, key_dim=64)

# Self-attention with x as both query and value.
# NOTE(review): Keras documents MultiHeadAttention inputs as
# (batch, seq_len, dim); if `x` is the 2-D concatenated feature vector from the
# model cell, it needs a sequence axis first -- confirm the rank of `x`.
attn_out = mha(x, x)
# Residual + LayerNorm. This REBINDS the notebook-level name `x`, so re-running
# the cell stacks another attention block on the previous result.
x = LayerNormalization()(x + attn_out)


In [None]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dense


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Concatenate, Dropout,
    LayerNormalization, Flatten, MultiHeadAttention
)
from tensorflow.keras.models import Model
import pandas as pd

# ------------------ Load SMALL DATA ------------------
# Reduced train/test splits, read from the working directory.
X_train = pd.read_csv("X_train_small.csv")
y_train = pd.read_csv("y_train_small.csv")
X_test  = pd.read_csv("X_test_small.csv")
y_test  = pd.read_csv("y_test_small.csv")


# ------------------ Vocabulary sizes -----------------
def _vocab_size(column):
    """Return the Embedding input_dim required for an encoded column.

    The ids in a sub-sample keep the values they were given when the full
    dataset was encoded, so the number of *distinct* ids in the sample
    (nunique) can be far smaller than the largest id present. An Embedding
    needs input_dim > largest id, hence max + 1, taken over both splits so
    that prediction on X_test cannot index past the table either.
    """
    largest_id = max(X_train[column].max(), X_test[column].max())
    return int(largest_id) + 1


def _as_model_inputs(frame):
    """Map a feature frame to the dict of arrays keyed by the Input layer names."""
    return {
        "gene_input": frame['GeneSymbol_encoded'].values,
        "type_input": frame['Type_encoded'].values,
        "review_input": frame['ReviewStatus_encoded'].values,
        "assembly_input": frame['Assembly_encoded'].values,
    }


num_genes    = _vocab_size('GeneSymbol_encoded')
num_types    = _vocab_size('Type_encoded')
num_reviews  = _vocab_size('ReviewStatus_encoded')
num_assembly = _vocab_size('Assembly_encoded')

# ------------------ Inputs ---------------------------
# One scalar integer id per feature and sample.
gene_input     = Input(shape=(1,), name="gene_input")
type_input     = Input(shape=(1,), name="type_input")
review_input   = Input(shape=(1,), name="review_input")
assembly_input = Input(shape=(1,), name="assembly_input")

# ------------------ Embeddings -----------------------
gene_emb     = Embedding(num_genes, 64)(gene_input)
type_emb     = Embedding(num_types, 8)(type_input)
review_emb   = Embedding(num_reviews, 16)(review_input)
assembly_emb = Embedding(num_assembly, 4)(assembly_input)

# (batch, 1, d) -> (batch, d)
gene_flat     = Flatten()(gene_emb)
type_flat     = Flatten()(type_emb)
review_flat   = Flatten()(review_emb)
assembly_flat = Flatten()(assembly_emb)

# ------------------ Concatenate ----------------------
# Combined feature vector of width 64 + 8 + 16 + 4 = 92
x = Concatenate()([gene_flat, type_flat, review_flat, assembly_flat])

# ------------------ Projection -----------------------
x_proj = Dense(256, activation="relu")(x)

# ------------------ Transformer Attention Block ------
# Define MHA
mha = MultiHeadAttention(num_heads=4, key_dim=64)

# (batch, 256) -> (batch, seq_len=1, 256).
# Built-in Reshape layers are used rather than raw tf.expand_dims/tf.squeeze:
# applying TensorFlow functions directly to symbolic Keras tensors is not
# supported by the Keras version this notebook targets (the other model cells
# had to fall back to Lambda or Reshape for the same reason), and Reshape
# also keeps the saved model free of Lambda code.
x_seq = tf.keras.layers.Reshape((1, 256))(x_proj)

# Self-attention (query == value)
attn_out = mha(x_seq, x_seq)

# Remove seq dimension: (batch, 1, 256) -> (batch, 256)
attn_out = tf.keras.layers.Reshape((256,))(attn_out)

# Residual + LayerNorm
x_attn = LayerNormalization()(x_proj + attn_out)

# ------------------ Feed Forward Network -------------
ffn = Dense(256, activation="relu")(x_attn)
x_attn = LayerNormalization()(x_attn + ffn)

# ------------------ Classifier -----------------------
x = Dense(128, activation="relu")(x_attn)
x = Dropout(0.3)(x)
x = Dense(64, activation="relu")(x)

output = Dense(3, activation="softmax")(x)

# ------------------ Build Model ----------------------
model = Model(
    inputs=[gene_input, type_input, review_input, assembly_input],
    outputs=output
)

# Targets are integer class ids, hence the sparse loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# ------------------ Prepare Inputs -------------------
# Rebuild the input dicts from the frames loaded above. Without this, later
# fit / predict cells would keep using dicts built from the previous (full)
# X_train / X_test, whose row counts no longer match y_train / y_test.
train_inputs = _as_model_inputs(X_train)
test_inputs = _as_model_inputs(X_test)


In [None]:
# Rebind `mha` to a fresh attention layer (4 heads, 64-dim keys) with an
# explicit layer name. This creates a new, untrained layer; it does not touch
# the attention layer already wired into `model`.
mha = MultiHeadAttention(
    num_heads=4,
    key_dim=64,
    name="self_attention"
)


In [None]:
# Stand-alone attention fragment: it builds new tensors on top of `x_proj`
# using the `mha` layer currently bound in the kernel, and does not modify the
# already-constructed `model`.
# Built-in Reshape layers replace raw tf.expand_dims / tf.squeeze, which are
# TensorFlow functions and cannot be applied directly to symbolic Keras
# tensors in the Keras version this notebook targets.
feature_dim = x_proj.shape[-1]

# Add a sequence axis of length 1
x_seq = tf.keras.layers.Reshape((1, feature_dim))(x_proj)   # (None, 1, 256)

# Self-attention (query == value)
attn_out = mha(x_seq, x_seq)                                # (None, 1, 256)

# Remove sequence dimension
attn_out = tf.keras.layers.Reshape((feature_dim,))(attn_out)  # (None, 256)

# Residual connection + normalization
x_attn = LayerNormalization()(x_proj + attn_out)


In [None]:
from tensorflow.keras.layers import Reshape

# ------------------ Transformer Attention Block ------
# Works on the projected feature vector x_proj, shape (None, 256). Only
# built-in Keras layers are used, so nothing here relies on raw TensorFlow ops.

# Vector -> one-step sequence: (None, 256) becomes (None, 1, 256)
to_sequence = Reshape((1, 256))
x_seq = to_sequence(x_proj)

# Self-attention with the sequence as both query and value
mha = MultiHeadAttention(num_heads=4, key_dim=64)
attn_out = mha(x_seq, x_seq)

# Sequence -> vector: (None, 1, 256) becomes (None, 256)
to_vector = Reshape((256,))
attn_out = to_vector(attn_out)

# First sub-layer: residual sum followed by layer normalization
post_attention_norm = LayerNormalization()
x_attn = post_attention_norm(x_proj + attn_out)

# Second sub-layer: single-layer feed-forward with its own residual + norm
feed_forward = Dense(256, activation="relu")
ffn = feed_forward(x_attn)
post_ffn_norm = LayerNormalization()
x_attn = post_ffn_norm(x_attn + ffn)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class probabilities for every test row, reduced to the most likely class id.
class_probabilities = model.predict(test_inputs, batch_size=512)
y_pred = np.argmax(class_probabilities, axis=1)

# Per-class precision / recall / F1
class_names = ["BENIGN", "VUS", "PATHOGENIC"]
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# True class (rows) versus predicted class (columns)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Save the current model to the working directory in .keras format. This file
# name is the one the later loading cells read from BASE_PATH.
model.save("transssv_style_model.keras")


In [None]:
# Train 3 more epochs on top of the current weights, with validation.
import numpy as np

# No visible cell defines `val_inputs` / `y_val`, so passing them as
# validation_data raises NameError. Hold out a fraction of the training data
# instead, the same way the first training run does (validation_split=0.1).
# The labels are passed as a plain array, matching `y_train.values` there.
history_extra = model.fit(
    train_inputs,
    np.asarray(y_train),
    validation_split=0.1,
    epochs=3,              # ONLY 3
    batch_size=512,        # keep it stable
    verbose=1
)


In [None]:
# Continue training the current model for three more passes over the training
# data; no validation data is supplied for this run.
extra_fit_options = {"epochs": 3, "batch_size": 512, "verbose": 1}
history_extra = model.fit(train_inputs, y_train, **extra_fit_options)


In [None]:
# Save the model into the project folder on Google Drive (the path only exists
# once Drive has been mounted at /content/drive).
model.save("/content/drive/MyDrive/UG_Project/transssv_final.keras")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predicted class id per test row: arg-max over the model's class probabilities.
y_pred = np.argmax(model.predict(test_inputs, batch_size=512), axis=1)

# Per-class precision / recall / F1 after the extra training epochs
final_report = classification_report(
    y_test,
    y_pred,
    target_names=["BENIGN", "VUS", "PATHOGENIC"],
)
print("Final Classification Report:")
print(final_report)

# True class (rows) versus predicted class (columns)
print("Final Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Save a second copy of the final model in the current working directory.
model.save("transssv_final.keras")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Display names for the three classes, in class-id order.
# Assumes the encoded class ids are 0, 1, 2 in this order, which is the
# convention the classification reports in this notebook rely on.
labels = ["BENIGN", "VUS", "PATHOGENIC"]

# Pin the label set explicitly. Without `labels=`, sklearn builds the matrix
# only from the classes that actually occur in y_test / y_pred, so a class the
# model never predicts (and that is absent from y_test) would shrink the matrix
# and make the 3x3 annotation loop below index out of bounds.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(labels))))

plt.figure()
plt.imshow(cm)
plt.xticks(range(len(labels)), labels)
plt.yticks(range(len(labels)), labels)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

# Write the count into every cell; imshow puts column j on the x axis and
# row i on the y axis, hence text(j, i, ...).
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j], ha="center", va="center")

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# y_train is read with pd.read_csv elsewhere in this notebook, i.e. it is a
# one-column DataFrame, while pd.Series() needs 1-D data. Normalise through a
# DataFrame and flatten, which works for a DataFrame, a Series or an array.
class_counts = pd.Series(pd.DataFrame(y_train).to_numpy().ravel()).value_counts()

# Bar chart of how many training rows fall into each class id.
plt.figure()
class_counts.plot(kind="bar")
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("Class Distribution in Training Data")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Flatten y_train to a 1-D array of class ids.
# A pandas DataFrame (what pd.read_csv returns) has no .reshape method, so go
# through to_numpy(); wrapping in pd.DataFrame first keeps this working when
# y_train is already a Series or a NumPy array.
y_train_flat = pd.DataFrame(y_train).to_numpy().ravel()

# Rows per class id
class_counts = pd.Series(y_train_flat).value_counts()

plt.figure()
class_counts.plot(kind="bar")
plt.xlabel("Class Label")
plt.ylabel("Count")
plt.title("Class Distribution in Training Data")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Collapse the label frame to a flat 1-D array of class ids.
y_train_flat = y_train.to_numpy().ravel()

# Number of training rows per class id.
class_counts = pd.Series(y_train_flat).value_counts()

# Draw the counts as a bar chart on a fresh figure and label it.
plt.figure()
ax = class_counts.plot(kind="bar")
ax.set_xlabel("Class Label")
ax.set_ylabel("Count")
ax.set_title("Class Distribution in Training Data")
plt.show()


In [None]:
# Give the class ids readable names.
# value_counts() returns rows ordered by frequency, not by class id, so
# assigning a list of names positionally would attach "BENIGN" to whichever
# class happens to be the most common. Map by id instead, and sort by id first
# so the bars appear in class order. Assumes ids 0 / 1 / 2 mean
# BENIGN / VUS / PATHOGENIC, as in the classification reports of this notebook.
class_counts = class_counts.sort_index().rename(
    index={0: "BENIGN", 1: "VUS", 2: "PATHOGENIC"}
)


In [None]:
# Redraw the chart and save it from the same cell.
# The figure from the plotting cell has already been shown, so there is no
# live figure left to save here; calling plt.savefig() on its own would write
# an empty image. The file is written before plt.show().
fig, ax = plt.subplots()
class_counts.plot(kind="bar", ax=ax)
ax.set_xlabel("Class Label")
ax.set_ylabel("Count")
ax.set_title("Class Distribution in Training Data")
fig.savefig("class_distribution.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Download ClinVar data
!wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz


In [None]:
import pickle

# Persist the `encoders` dict currently held in the kernel to encoders.pkl in
# the working directory.
with open("encoders.pkl", "wb") as f:
    f.write(pickle.dumps(encoders))


In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Project folder on the mounted Google Drive.
BASE_PATH = "/content/drive/MyDrive/UG_Project"

# Cleaned variant table; the encoders are fitted on its raw categorical columns.
cleaned_csv_path = f"{BASE_PATH}/variant_summary_cleaned1.csv"
df = pd.read_csv(cleaned_csv_path)

# Fit one LabelEncoder per categorical column and keep them keyed by column
# name, so the same string -> integer mapping can be reapplied later.
encoders = {}
for col in ("GeneSymbol", "Type", "ReviewStatus", "Assembly"):
    le = LabelEncoder().fit(df[col])
    encoders[col] = le

# Store the fitted encoders next to the dataset.
encoders_path = f"{BASE_PATH}/encoders.pkl"
with open(encoders_path, "wb") as f:
    pickle.dump(encoders, f)

print("Encoders generated and saved successfully!")

In [None]:
!ls


In [None]:
# Mount Google Drive at /content/drive (Colab only). Every path under
# /content/drive/MyDrive used in this notebook depends on this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Same Drive mount as the previous cell.
# NOTE(review): this cell duplicates the one above; one of them can probably
# be removed.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!mkdir -p /content/drive/MyDrive/UG_Project


In [None]:
!wget -P /content/drive/MyDrive/UG_Project \
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz


In [None]:
!ls


In [None]:
!mv \
transssv_style_model.keras \
encoders.pkl \
variant_summary.txt.gz \
variant_summary_cleaned1.csv \
X_train.csv \
X_test.csv \
y_train.csv \
y_test.csv \
/content/drive/MyDrive/UG_Project/

In [None]:
!ls /content/drive/MyDrive/UG_Project


In [None]:
!zip -r /content/drive/MyDrive/UG_Project_base_comparison_model.zip /content/drive/MyDrive/UG_Project


In [None]:
# Root folder for the saved model and encoders.pkl on the mounted Drive; the
# loading cells below build their file paths from it.
BASE_PATH = "/content/drive/MyDrive/UG_Project"


In [None]:
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Lambda, MultiHeadAttention, LayerNormalization # Import necessary layers if they were custom or used in Lambda

# Define the lambda functions explicitly as they were used in the model definition
def expand_dims_func(t):
    """Insert a new axis at position 1 of tensor `t`."""
    return tf.expand_dims(t, axis=1)

def squeeze_func(t):
    """Drop axis 1 of tensor `t`."""
    return tf.squeeze(t, axis=1)

# Create a custom_objects dictionary to handle Lambda layers with specified output shapes
# These output shapes are derived from the model's architecture in cell cBgFk8qxxkCt.
# NOTE(review): the first two keys are the *layer names* given to the Lambda
# layers at training time. Confirm that Keras resolves custom_objects by layer
# name; if it only matches class / function names, these two entries have no
# effect and it is safe_mode=False below that lets the Lambda layers load.
custom_objects = {
    'expand_dims_for_mha': Lambda(expand_dims_func, output_shape=(1, 256)),
    'squeeze_after_mha': Lambda(squeeze_func, output_shape=(256,)),
    # Include other custom layers if they were part of the saved model and not standard Keras layers
    'MultiHeadAttention': MultiHeadAttention, # This is a standard layer, but sometimes needed in custom_objects
    'LayerNormalization': LayerNormalization # Same for LayerNormalization
}

# Load trained model, including custom objects and safe_mode=False
# SECURITY: safe_mode=False allows code stored inside the model file to be
# deserialized and run. Only use it on model files produced by this project.
model = load_model(
    f"{BASE_PATH}/transssv_style_model.keras",
    custom_objects=custom_objects,
    safe_mode=False
)

# Load encoders
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# an encoders.pkl that this notebook wrote itself.
with open(f"{BASE_PATH}/encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

# One fitted encoder per categorical column, keyed by the raw column name.
gene_le = encoders["GeneSymbol"]
type_le = encoders["Type"]
review_le = encoders["ReviewStatus"]
assembly_le = encoders["Assembly"]

print("Model and encoders loaded successfully!")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Lambda


In [None]:
# Alternative loading attempt: expose the `tf` module under the name "tf" via
# custom_objects and skip restoring the optimizer / loss (compile=False).
# NOTE(review): presumably meant to let serialized Lambda code that refers to
# `tf` resolve that name -- confirm it is still needed.
model = load_model(
    f"{BASE_PATH}/transssv_style_model.keras",
    custom_objects={
        "tf": tf
    },
    compile=False
)


In [None]:
# Switch off Keras' safe-mode check for the rest of the session so that models
# containing serialized Python lambdas can be loaded.
# SECURITY: this applies to every subsequent load in this kernel and lets
# model files run embedded code -- only load files produced by this project.
import keras
keras.config.enable_unsafe_deserialization()


In [None]:
from tensorflow.keras.models import load_model

# Load the saved architecture and weights for inference only; compile=False
# skips restoring the optimizer and loss.
model_path = f"{BASE_PATH}/transssv_style_model.keras"
model = load_model(model_path, compile=False)

print("Model loaded successfully!")


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Concatenate, Dropout,
    LayerNormalization, Flatten, MultiHeadAttention, Reshape
)
from tensorflow.keras.models import Model


In [None]:
# Embedding table sizes: one row per class known to each fitted label encoder.
# The encoders (gene_le, type_le, review_le, assembly_le) must already be
# loaded in the kernel.
num_genes, num_types, num_reviews, num_assembly = (
    len(encoder.classes_)
    for encoder in (gene_le, type_le, review_le, assembly_le)
)

# One scalar integer id per feature and sample.
gene_input = Input(shape=(1,), name="gene_input")
type_input = Input(shape=(1,), name="type_input")
review_input = Input(shape=(1,), name="review_input")
assembly_input = Input(shape=(1,), name="assembly_input")

# Per-feature lookup tables with widths 64 / 8 / 16 / 4.
gene_emb = Embedding(input_dim=num_genes, output_dim=64)(gene_input)
type_emb = Embedding(input_dim=num_types, output_dim=8)(type_input)
review_emb = Embedding(input_dim=num_reviews, output_dim=16)(review_input)
assembly_emb = Embedding(input_dim=num_assembly, output_dim=4)(assembly_input)

# Drop the length-1 axis of every embedding and join them into one vector.
flattened_embeddings = [
    Flatten()(emb)
    for emb in (gene_emb, type_emb, review_emb, assembly_emb)
]
x = Concatenate()(flattened_embeddings)

# Project the combined features to the 256-wide working dimension.
x_proj = Dense(units=256, activation="relu")(x)

# Self-attention over a one-step sequence, using Reshape layers only (no
# Lambda layers), then back to a flat vector.
x_seq = Reshape(target_shape=(1, 256))(x_proj)
self_attention = MultiHeadAttention(num_heads=4, key_dim=64)
attn = self_attention(x_seq, x_seq)
attn = Reshape(target_shape=(256,))(attn)

# Residual sum + layer normalization around the attention output.
x_attn = LayerNormalization()(x_proj + attn)

# Feed-forward sub-layer with its own residual + normalization.
ffn = Dense(units=256, activation="relu")(x_attn)
x_attn = LayerNormalization()(x_attn + ffn)

# Classification head ending in a 3-way softmax.
x = Dense(units=128, activation="relu")(x_attn)
x = Dropout(rate=0.3)(x)
x = Dense(units=64, activation="relu")(x)
output = Dense(units=3, activation="softmax")(x)

# Assemble the functional model. It is neither compiled nor given weights here.
model_inputs = [gene_input, type_input, review_input, assembly_input]
model = Model(inputs=model_inputs, outputs=output)


In [None]:
import pickle

# Project folder on the mounted Google Drive.
BASE_PATH = "/content/drive/MyDrive/UG_Project"

# Read back the dict of fitted label encoders written by the encoder cell.
with open(f"{BASE_PATH}/encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

# Unpack one encoder per categorical column.
gene_le, type_le, review_le, assembly_le = (
    encoders[column]
    for column in ("GeneSymbol", "Type", "ReviewStatus", "Assembly")
)

# Report how many classes each encoder knows (gene, type, review, assembly).
print("Encoders loaded:")
print(*(len(fitted.classes_) for fitted in (gene_le, type_le, review_le, assembly_le)))
