In [1]:
import pandas as pd
import random

# Define alleles
alleles = ["A", "T", "G", "C"]

# Define gene sets for each disease
disease_genes = {
    "Cardiovascular phenotype": ["DSP", "SCN10A", "TTN", "MYH6", "LMNA"],
    "Hereditary cancer-predisposing syndrome": ["BRCA1", "BRCA2", "TP53", "PTEN", "PRKAR1A"],
    "Inborn genetic diseases": ["KRT14", "ERCC2", "COL7A1", "ABCA1", "CFTR"],
    "Fanconi anemia": ["FANCA", "FANCB", "FANCC", "FANCD2", "FANCE"],
    "Developmental and epileptic encephalopathy": ["SCN1A", "SCN2A", "KCNQ2", "STXBP1", "CDKL5"]
}

def generate_variant(gene, disease):
    reference_allele = random.choice(alleles)

    alternate_alleles = [a for a in alleles if a != reference_allele]
    alternate_allele = random.choice(alternate_alleles)

    return {
        "Type": random.choice(["single nucleotide variant", "Deletion"]),
        "GeneSymbol": gene,
        "HGNC_ID": f"HGNC:{random.randint(100, 99999)}",
        "PhenotypeList": disease,
        "Assembly": random.choice(["GRCh37", "GRCh38"]),
        "ChromosomeAccession": f"NC_{random.randint(1, 22):06d}.{random.randint(1,20)}",
        "Chromosome": random.choice([str(i) for i in range(1,23)] + ["X","Y"]),
        "Cytogenetic": f"{random.randint(1,22)}q{random.randint(1,36)}.{random.randint(1,3)}",
        "PositionVCF": random.randint(10000, 200000000),
        "ReferenceAlleleVCF": reference_allele,
        "AlternateAlleleVCF": alternate_allele
    }

# Build dataset
rows = []
for disease, genes in disease_genes.items():
    for _ in range(800):
        gene = random.choice(genes)
        rows.append(generate_variant(gene, disease))

# Create DataFrame
balanced_df = pd.DataFrame(rows)

# Shuffle
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save dataset
balanced_df.to_csv("balanced_genetics_dataset.csv", index=False)

print(balanced_df["PhenotypeList"].value_counts())

PhenotypeList
Cardiovascular phenotype                      800
Developmental and epileptic encephalopathy    800
Fanconi anemia                                800
Inborn genetic diseases                       800
Hereditary cancer-predisposing syndrome       800
Name: count, dtype: int64


In [2]:
from google.colab import files
files.download("balanced_genetics_dataset.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pickle

In [4]:
# Load dataset (from previous step)
df = pd.read_csv("balanced_genetics_dataset.csv")

# Features and Target
X = df.drop(columns=["PhenotypeList"])  # input features
y = df["PhenotypeList"]  # target column


In [5]:
# Categorical and numerical features
categorical_features = ["Type", "GeneSymbol", "Assembly", "ChromosomeAccession",
                        "Chromosome", "Cytogenetic", "ReferenceAlleleVCF", "AlternateAlleleVCF"]
numeric_features = ["PositionVCF"]

# Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)
    ]
)


In [6]:
xgb = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42
)

# Build pipeline
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", xgb)
])


In [7]:
# Load dataset (from previous step)
df = pd.read_csv("balanced_genetics_dataset.csv")

# Features and Target
X = df.drop(columns=["PhenotypeList"])  # input features
y = df["PhenotypeList"]  # target column

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Encode target labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Grid of hyperparameters
param_grid = {
    "classifier__n_estimators": [200, 400],
    "classifier__max_depth": [4, 6, 8],
    "classifier__learning_rate": [0.05, 0.1],
    "classifier__subsample": [0.8, 1.0],
    "classifier__colsample_bytree": [0.8, 1.0]
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

# Fit the model
grid_search.fit(X_train, y_train_encoded)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 4, 'classifier__n_estimators': 200, 'classifier__subsample': 0.8}
Best CV Accuracy: 1.0


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on test set
y_pred = grid_search.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test_encoded, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_encoded, y_pred))

# Save the trained model as pickle
with open("xgboost_genetics_model.pkl", "wb") as f:
    pickle.dump(grid_search.best_estimator_, f)

print("✅ Model saved as xgboost_genetics_model.pkl")


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       160
           1       1.00      1.00      1.00       160
           2       1.00      1.00      1.00       160
           3       1.00      1.00      1.00       160
           4       1.00      1.00      1.00       160

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800


Confusion Matrix:
 [[160   0   0   0   0]
 [  0 160   0   0   0]
 [  0   0 160   0   0]
 [  0   0   0 160   0]
 [  0   0   0   0 160]]
✅ Model saved as xgboost_genetics_model.pkl


In [9]:
with open("xgboost_genetics_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# Example test
example = pd.DataFrame([{
    "Type": "single nucleotide variant",
    "GeneSymbol": "SCN10A",
    "HGNC_ID": "HGNC:10582",
    "Assembly": "GRCh37",
    "ChromosomeAccession": "NC_000003.11",
    "Chromosome": "3",
    "Cytogenetic": "3p22.2",
    "PositionVCF": 38835431,
    "ReferenceAlleleVCF": "A",
    "AlternateAlleleVCF": "G"
}])

print("Prediction:", loaded_model.predict(example))


Prediction: [0]


In [10]:
import pandas as pd

test_examples = pd.DataFrame([
    # Cardiovascular phenotype
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "DSP",
        "HGNC_ID": "HGNC:3052",
        "Assembly": "GRCh37",
        "ChromosomeAccession": "NC_000006.11",
        "Chromosome": "6",
        "Cytogenetic": "6p24.3",
        "PositionVCF": 7584161,
        "ReferenceAlleleVCF": "A",
        "AlternateAlleleVCF": "G"
    },
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "SCN10A",
        "HGNC_ID": "HGNC:10582",
        "Assembly": "GRCh37",
        "ChromosomeAccession": "NC_000003.11",
        "Chromosome": "3",
        "Cytogenetic": "3p22.2",
        "PositionVCF": 38835431,
        "ReferenceAlleleVCF": "A",
        "AlternateAlleleVCF": "G"
    },

    # Hereditary cancer-predisposing syndrome
    {
        "Type": "Deletion",
        "GeneSymbol": "BRCA1",
        "HGNC_ID": "HGNC:1100",
        "Assembly": "GRCh38",
        "ChromosomeAccession": "NC_000017.11",
        "Chromosome": "17",
        "Cytogenetic": "17q21.31",
        "PositionVCF": 43091818,
        "ReferenceAlleleVCF": "GGT",
        "AlternateAlleleVCF": "G"
    },
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "PRKAR1A",
        "HGNC_ID": "HGNC:9388",
        "Assembly": "GRCh38",
        "ChromosomeAccession": "NC_000017.11",
        "Chromosome": "17",
        "Cytogenetic": "17q24.2",
        "PositionVCF": 68525750,
        "ReferenceAlleleVCF": "T",
        "AlternateAlleleVCF": "G"
    },

    # Inborn genetic diseases
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "KRT14",
        "HGNC_ID": "HGNC:6416",
        "Assembly": "GRCh38",
        "ChromosomeAccession": "NC_000017.11",
        "Chromosome": "17",
        "Cytogenetic": "17q21.2",
        "PositionVCF": 41584300,
        "ReferenceAlleleVCF": "C",
        "AlternateAlleleVCF": "T"
    },
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "ERCC2",
        "HGNC_ID": "HGNC:3434",
        "Assembly": "GRCh38",
        "ChromosomeAccession": "NC_000019.10",
        "Chromosome": "19",
        "Cytogenetic": "19q13.32",
        "PositionVCF": 45364260,
        "ReferenceAlleleVCF": "C",
        "AlternateAlleleVCF": "G"
    },

    # Developmental and epileptic encephalopathy
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "SCN1A",
        "HGNC_ID": "HGNC:10585",
        "Assembly": "GRCh37",
        "ChromosomeAccession": "NC_000002.11",
        "Chromosome": "2",
        "Cytogenetic": "2q24.3",
        "PositionVCF": 166850411,
        "ReferenceAlleleVCF": "A",
        "AlternateAlleleVCF": "G"
    },
    {
        "Type": "single nucleotide variant",
        "GeneSymbol": "KCNQ2",
        "HGNC_ID": "HGNC:6295",
        "Assembly": "GRCh37",
        "ChromosomeAccession": "NC_000020.11",
        "Chromosome": "20",
        "Cytogenetic": "20q13.33",
        "PositionVCF": 63445631,
        "ReferenceAlleleVCF": "C",
        "AlternateAlleleVCF": "T"
    }
])

print(test_examples)


                        Type GeneSymbol     HGNC_ID Assembly  \
0  single nucleotide variant        DSP   HGNC:3052   GRCh37   
1  single nucleotide variant     SCN10A  HGNC:10582   GRCh37   
2                   Deletion      BRCA1   HGNC:1100   GRCh38   
3  single nucleotide variant    PRKAR1A   HGNC:9388   GRCh38   
4  single nucleotide variant      KRT14   HGNC:6416   GRCh38   
5  single nucleotide variant      ERCC2   HGNC:3434   GRCh38   
6  single nucleotide variant      SCN1A  HGNC:10585   GRCh37   
7  single nucleotide variant      KCNQ2   HGNC:6295   GRCh37   

  ChromosomeAccession Chromosome Cytogenetic  PositionVCF ReferenceAlleleVCF  \
0        NC_000006.11          6      6p24.3      7584161                  A   
1        NC_000003.11          3      3p22.2     38835431                  A   
2        NC_000017.11         17    17q21.31     43091818                GGT   
3        NC_000017.11         17     17q24.2     68525750                  T   
4        NC_000017.11  

In [11]:
import pickle

# Load model
with open("xgboost_genetics_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# Predict
preds = loaded_model.predict(test_examples)
results = pd.DataFrame({"GeneSymbol": test_examples["GeneSymbol"], "Predicted Phenotype": preds})

print(results)


  GeneSymbol  Predicted Phenotype
0        DSP                    0
1     SCN10A                    0
2      BRCA1                    3
3    PRKAR1A                    3
4      KRT14                    4
5      ERCC2                    4
6      SCN1A                    1
7      KCNQ2                    1


In [12]:
# Get back the original class labels
class_labels = loaded_model.named_steps["classifier"].classes_

# Convert numeric predictions to phenotype names
decoded_preds = [class_labels[p] for p in preds]

results = pd.DataFrame({
    "GeneSymbol": test_examples["GeneSymbol"],
    "Predicted Phenotype": decoded_preds
})

print(results)


  GeneSymbol  Predicted Phenotype
0        DSP                    0
1     SCN10A                    0
2      BRCA1                    3
3    PRKAR1A                    3
4      KRT14                    4
5      ERCC2                    4
6      SCN1A                    1
7      KCNQ2                    1
