In [None]:
!pip install -q xgboost scikit-learn pandas streamlit pyngrok


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# 1) Load a FASTA (.fna) file into one string
def load_fna(path):
    """Read a FASTA (.fna) file and return its sequence as one string.

    Lines that start with '>' (FASTA headers) are skipped. Every other line
    is stripped of surrounding whitespace and concatenated in file order.
    """
    pieces = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('>'):
                continue
            pieces.append(raw_line.strip())
    return ''.join(pieces)

# 2) Sliding‑window augmentation
def augment_and_vectorize(seqs, k=6, window=300, step=150, stride=1):
    """Cut each sequence into sliding windows and count k-mers per window.

    Parameters
    ----------
    seqs : dict mapping a gene label to its sequence string.
    k : k-mer length.
    window : window length in characters.
    step : offset between consecutive window starts.
    stride : offset between consecutive k-mer starts inside a window.

    Returns
    -------
    (features, labels) : a list of {kmer: count} dicts, one per window, and a
    parallel list of single-element label lists ``[gene]``.
    Sequences shorter than ``window`` contribute no samples.
    """
    features, labels = [], []
    for gene, seq in seqs.items():
        last_start = len(seq) - window
        for start in range(0, last_start + 1, step):
            chunk = seq[start:start + window]
            counts = Counter(
                chunk[pos:pos + k]
                for pos in range(0, len(chunk) - k + 1, stride)
            )
            features.append(dict(counts))
            labels.append([gene])
    return features, labels

# 3) Full‑sequence (“new logic”) vectorization
def fullseq_vectorize(seqs, k=6):
    """Count k-mers over each whole sequence (one sample per gene).

    Returns a list of {kmer: count} dicts and a parallel list of
    single-element label lists ``[gene]``, in the iteration order of ``seqs``.
    """
    features = []
    labels = []
    for gene, seq in seqs.items():
        n_positions = len(seq) - k + 1
        counts = Counter(seq[start:start + k] for start in range(n_positions))
        features.append(dict(counts))
        labels.append([gene])
    return features, labels

# 4) Your gene files (in the notebook’s working directory)
gene_files = dict(
    TP53='tp53.fna',
    BRCA1='brca1.fna',
    BRCA2='brca2.fna',
    STK11='stk11.fna',
    CDH1='cdh1.fna',
)

# Read sequences: one concatenated sequence string per gene label,
# in the same order as gene_files.
gene_seqs = {label: load_fna(fna_path) for label, fna_path in gene_files.items()}


In [None]:
# 1) Augment + vectorize
Xb_dicts, yb_raw = augment_and_vectorize(gene_seqs,
                                         k=6, window=300, step=150, stride=1)

# 2) Vectorizer & MultiLabelBinarizer
# sparse=False yields a dense array, which GaussianNB and MLPClassifier need.
vec_base = DictVectorizer(sparse=False)
Xb = vec_base.fit_transform(Xb_dicts)

mlb = MultiLabelBinarizer()
Yb = mlb.fit_transform(yb_raw)

print("Baseline shapes:", Xb.shape, Yb.shape)

# 3) Split
# NOTE(review): step (150) is smaller than window (300), so neighbouring
# windows share half their bases. A purely random split can therefore put
# overlapping windows on both sides, which makes the scores below optimistic.
Xb_tr, Xb_te, Yb_tr, Yb_te = train_test_split(
    Xb, Yb, test_size=0.25, random_state=42
)

# 4) Models
# One binary classifier per gene column via MultiOutputClassifier.
# `use_label_encoder` is no longer passed to XGBClassifier: current XGBoost
# ignores it and prints "Parameters: { "use_label_encoder" } are not used."
models = {
    'LogisticRegression': MultiOutputClassifier(LogisticRegression(max_iter=200, random_state=42)),
    'NaiveBayes':         MultiOutputClassifier(GaussianNB()),
    'RandomForest':       MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42)),
    'MLP':                MultiOutputClassifier(MLPClassifier(hidden_layer_sizes=(128, 64),
                                                               max_iter=300, random_state=42)),
    'XGBoost':            MultiOutputClassifier(XGBClassifier(eval_metric='logloss',
                                                               random_state=42))
}

# 5) Train & evaluate
print("=== Baseline Results ===")
for name, mdl in models.items():
    mdl.fit(Xb_tr, Yb_tr)
    pred = mdl.predict(Xb_te)
    # Some test rows get no predicted label at all, which makes per-sample
    # precision undefined. zero_division=0 keeps the value sklearn already
    # reports (0) but stops the UndefinedMetricWarning noise.
    acc = accuracy_score(Yb_te, pred)
    f1_micro = f1_score(Yb_te, pred, average='micro', zero_division=0)
    print(f"\n{name} Accuracy: {acc:.4f}, F1: {f1_micro:.4f}")
    print(classification_report(Yb_te, pred, target_names=mlb.classes_, zero_division=0))


Baseline shapes: (4677, 4093) (4677, 5)
=== Baseline Results ===

LogisticRegression Accuracy: 0.9154, F1: 0.9502
              precision    recall  f1-score   support

       BRCA1       0.96      0.93      0.95       427
       BRCA2       0.94      0.95      0.95       274
        CDH1       0.97      0.94      0.95       313
       STK11       1.00      0.95      0.98        84
        TP53       1.00      0.92      0.96        72

   micro avg       0.96      0.94      0.95      1170
   macro avg       0.97      0.94      0.96      1170
weighted avg       0.96      0.94      0.95      1170
 samples avg       0.93      0.94      0.93      1170



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



NaiveBayes Accuracy: 0.2752, F1: 0.6303
              precision    recall  f1-score   support

       BRCA1       0.50      0.93      0.65       427
       BRCA2       0.42      0.98      0.59       274
        CDH1       0.39      0.96      0.55       313
       STK11       1.00      0.88      0.94        84
        TP53       0.91      1.00      0.95        72

   micro avg       0.47      0.95      0.63      1170
   macro avg       0.64      0.95      0.74      1170
weighted avg       0.51      0.95      0.65      1170
 samples avg       0.55      0.95      0.67      1170



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



RandomForest Accuracy: 0.8060, F1: 0.8917
              precision    recall  f1-score   support

       BRCA1       0.99      0.84      0.91       427
       BRCA2       1.00      0.83      0.91       274
        CDH1       1.00      0.80      0.89       313
       STK11       1.00      0.80      0.89        84
        TP53       1.00      0.53      0.69        72

   micro avg       1.00      0.81      0.89      1170
   macro avg       1.00      0.76      0.86      1170
weighted avg       1.00      0.81      0.89      1170
 samples avg       0.81      0.81      0.81      1170



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



MLP Accuracy: 0.9427, F1: 0.9672
              precision    recall  f1-score   support

       BRCA1       0.98      0.96      0.97       427
       BRCA2       0.96      0.96      0.96       274
        CDH1       0.98      0.94      0.96       313
       STK11       1.00      0.95      0.98        84
        TP53       1.00      0.97      0.99        72

   micro avg       0.98      0.96      0.97      1170
   macro avg       0.98      0.96      0.97      1170
weighted avg       0.98      0.96      0.97      1170
 samples avg       0.95      0.96      0.95      1170



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




XGBoost Accuracy: 0.8880, F1: 0.9357
              precision    recall  f1-score   support

       BRCA1       0.97      0.91      0.94       427
       BRCA2       0.99      0.92      0.95       274
        CDH1       0.97      0.89      0.93       313
       STK11       1.00      0.95      0.98        84
        TP53       1.00      0.65      0.79        72

   micro avg       0.98      0.90      0.94      1170
   macro avg       0.99      0.87      0.92      1170
weighted avg       0.98      0.90      0.93      1170
 samples avg       0.89      0.90      0.89      1170



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Better training logic


In [None]:
import numpy as np
import time
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

# --- 1) k-mer feature extractor ---
def load_data(gene_sequences, k=6, window=300, step=150):
    """Build k-mer count samples from fixed-size sliding windows.

    Each gene sequence is cut into windows of ``window`` characters whose
    starts are ``step`` apart. Every window becomes one {kmer: count} dict
    paired with the single-element label list ``[gene]``.

    Returns
    -------
    (X_dict, y_raw) : parallel lists of feature dicts and label lists.
    """
    kmers_per_window = window - k + 1
    samples = [
        (gene, seq[start:start + window])
        for gene, seq in gene_sequences.items()
        for start in range(0, len(seq) - window + 1, step)
    ]
    X_dict = [
        dict(Counter(chunk[pos:pos + k] for pos in range(kmers_per_window)))
        for _, chunk in samples
    ]
    y_raw = [[gene] for gene, _ in samples]
    return X_dict, y_raw

# --- 2) Data prep ---
# Windowed k-mer counts for every gene, using load_data's defaults
# (k=6, window=300, step=150).
X_dict, y_raw = load_data(gene_seqs)
# sparse=False makes the vectorizer return a dense array.
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(X_dict)

# Rebinds `mlb` (also assigned in the baseline cell) to a binarizer fitted on
# this cell's labels; each row of Y is a 0/1 indicator vector over the genes.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_raw)

# Hold out 20% of the windows for the final evaluation; fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

def make_pipeline(estimator):
    """Wrap ``estimator`` in a scale-then-classify pipeline.

    The pipeline has two named steps: 'scale' (StandardScaler) and 'clf'
    (a MultiOutputClassifier around ``estimator``). Those names are what the
    ``clf__estimator__*`` keys in the parameter grids refer to.
    """
    steps = [
        ('scale', StandardScaler()),
        ('clf', MultiOutputClassifier(estimator)),
    ]
    return Pipeline(steps)

# --- 3) Models and param grids ---
# NOTE: this rebinds the name `models`; from here on it refers to these
# scaled pipelines, not to the dict built in the baseline cell.
# LogisticRegression now gets random_state=42 as well: the liblinear solver
# shuffles the data, so without a seed its fit is not repeatable, unlike the
# other seeded estimators in this dict.
models = {
    'LogisticRegression': make_pipeline(LogisticRegression(max_iter=300, solver='liblinear', random_state=42)),
    'GaussianNB':         make_pipeline(GaussianNB()),
    'RandomForest':       make_pipeline(RandomForestClassifier(n_jobs=-1, random_state=42)),
    'XGBoost':            make_pipeline(XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=42)),
    'MLP':                make_pipeline(MLPClassifier(hidden_layer_sizes=(128,), max_iter=300, early_stopping=True, random_state=42))
}

# Grid keys follow the pipeline step names: clf -> MultiOutputClassifier,
# estimator -> the wrapped model. Models without an entry here (GaussianNB)
# are fitted directly, without a grid search.
# Each grid currently holds a single candidate, so the search only
# cross-validates that one setting; add values to the lists to actually tune.
param_grids = {
    'LogisticRegression': {'clf__estimator__C': [1]},
    'RandomForest':       {'clf__estimator__n_estimators': [100]},
    'XGBoost':            {'clf__estimator__n_estimators': [100], 'clf__estimator__max_depth': [5]},
    'MLP':                {'clf__estimator__alpha': [1e-4]}
}

# Shuffled 2-fold CV shared by every grid search; seeded for repeatability.
kf = KFold(n_splits=2, shuffle=True, random_state=42)
# Fitted estimator per model name, filled in by the training loop.
trained_models = {}

# --- 4) Training ---
for name, pipeline in models.items():
    print(f"\nTraining {name}...")
    start = time.time()

    if name not in param_grids:
        # No grid configured for this model: fit the pipeline as-is.
        pipeline.fit(X_train, Y_train)
        trained_models[name] = pipeline
    else:
        # Cross-validated search over the configured grid; keep the refit winner.
        grid = GridSearchCV(
            pipeline,
            param_grids[name],
            cv=kf,
            scoring='f1_micro',
            n_jobs=-1,
            verbose=1,
        )
        grid.fit(X_train, Y_train)
        trained_models[name] = grid.best_estimator_
        print(f"{name} best params →", grid.best_params_)

    elapsed = time.time() - start
    print(f"{name} training time: {elapsed:.2f} seconds")

# --- 5) Evaluation ---
def evaluate_model(name, model, X_test, Y_test):
    """Print micro-F1, subset accuracy and a per-gene report for one model.

    Parameters
    ----------
    name : label printed in the section header.
    model : fitted estimator exposing ``predict``.
    X_test, Y_test : held-out features and multilabel indicator targets.

    ``zero_division=0`` is passed to the metric functions: rows for which the
    model predicts no gene make precision undefined, and sklearn would
    otherwise emit an UndefinedMetricWarning for each report while still
    scoring those cases as 0. The printed numbers are unchanged.
    """
    print(f"\n{name} Evaluation:")
    Y_pred = model.predict(X_test)
    f1_micro = f1_score(Y_test, Y_pred, average='micro', zero_division=0)
    subset_accuracy = accuracy_score(Y_test, Y_pred)
    report = classification_report(
        Y_test, Y_pred, target_names=mlb.classes_, zero_division=0
    )
    print("F1 Score (micro):", f1_micro)
    print("Accuracy:", subset_accuracy)
    print("Classification Report:\n", report)

for name, model in trained_models.items():
    evaluate_model(name, model, X_test, Y_test)


Training LogisticRegression...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
LogisticRegression best params → {'clf__estimator__C': 1}
LogisticRegression training time: 70.41 seconds

Training GaussianNB...
GaussianNB training time: 1.25 seconds

Training RandomForest...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
RandomForest best params → {'clf__estimator__n_estimators': 100}
RandomForest training time: 33.01 seconds

Training XGBoost...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
XGBoost best params → {'clf__estimator__max_depth': 5, 'clf__estimator__n_estimators': 100}
XGBoost training time: 37.56 seconds

Training MLP...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
MLP best params → {'clf__estimator__alpha': 0.0001}
MLP training time: 81.98 seconds

LogisticRegression Evaluation:
F1 Score (micro): 0.9687002652519894
Accuracy: 0.9444444444444444
Classification Report:
               precision    recall  f1-score   support

 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1 Score (micro): 0.6091915464609191
Accuracy: 0.22008547008547008
Classification Report:
               precision    recall  f1-score   support

       BRCA1       0.46      0.95      0.62       342
       BRCA2       0.40      0.99      0.57       210
        CDH1       0.36      1.00      0.53       253
       STK11       1.00      0.90      0.95        68
        TP53       0.93      1.00      0.96        63

   micro avg       0.44      0.97      0.61       936
   macro avg       0.63      0.97      0.73       936
weighted avg       0.49      0.97      0.63       936
 samples avg       0.52      0.97      0.65       936


RandomForest Evaluation:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1 Score (micro): 0.9181084198385236
Accuracy: 0.8504273504273504
Classification Report:
               precision    recall  f1-score   support

       BRCA1       0.99      0.91      0.95       342
       BRCA2       1.00      0.88      0.94       210
        CDH1       1.00      0.82      0.90       253
       STK11       1.00      0.88      0.94        68
        TP53       1.00      0.54      0.70        63

   micro avg       1.00      0.85      0.92       936
   macro avg       1.00      0.81      0.88       936
weighted avg       1.00      0.85      0.92       936
 samples avg       0.85      0.85      0.85       936


XGBoost Evaluation:
F1 Score (micro): 0.9466666666666667
Accuracy: 0.9038461538461539
Classification Report:
               precision    recall  f1-score   support

       BRCA1       0.98      0.92      0.95       342
       BRCA2       0.98      0.95      0.97       210
        CDH1       0.99      0.88      0.93       253
       STK11       1.00      1.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1 Score (micro): 0.9655172413793104
Accuracy: 0.9401709401709402
Classification Report:
               precision    recall  f1-score   support

       BRCA1       0.98      0.96      0.97       342
       BRCA2       0.96      0.96      0.96       210
        CDH1       0.97      0.96      0.97       253
       STK11       1.00      1.00      1.00        68
        TP53       1.00      0.89      0.94        63

   micro avg       0.97      0.96      0.97       936
   macro avg       0.98      0.95      0.97       936
weighted avg       0.97      0.96      0.97       936
 samples avg       0.95      0.96      0.95       936



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import os
import pickle

# Make directories
os.makedirs('base_line_model', exist_ok=True)
os.makedirs('new_logic', exist_ok=True)


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle right away."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


# --- Save vectorizers and binarizer ---
_save_pickle(vec_base, 'base_line_model/vec_base.pkl')
_save_pickle(vectorizer, 'new_logic/vec_new.pkl')
_save_pickle(mlb, 'base_line_model/mlb.pkl')  # shared mlb

# --- Retrain and save Baseline Models ---
# By the time this cell runs, `models` has been rebound to the scaled
# "new logic" pipelines, so looping over it here would store those pipelines
# under base_line_model/ instead of the baseline estimators. The baseline
# estimators are therefore rebuilt explicitly, with the same settings as the
# baseline cell. The keys are the file names app.py loads (note 'GaussianNB',
# not 'NaiveBayes').
baseline_models = {
    'LogisticRegression': MultiOutputClassifier(LogisticRegression(max_iter=200, random_state=42)),
    'GaussianNB':         MultiOutputClassifier(GaussianNB()),
    'RandomForest':       MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42)),
    'XGBoost':            MultiOutputClassifier(XGBClassifier(eval_metric='logloss', random_state=42)),
    'MLP':                MultiOutputClassifier(MLPClassifier(hidden_layer_sizes=(128, 64),
                                                               max_iter=300, random_state=42)),
}
for name, model in baseline_models.items():
    print(f"Saving Baseline model: {name}")
    model.fit(Xb, Yb)  # Full baseline data
    _save_pickle(model, f'base_line_model/{name}.pkl')

# --- Retrain and save New Logic Models ---
for name, model in trained_models.items():
    print(f"Saving New Logic model: {name}")
    model.fit(X, Y)  # Full new logic data
    _save_pickle(model, f'new_logic/{name}.pkl')

print("✅ Saved vectorizers, mlb, and all models.")


Saving Baseline model: LogisticRegression
Saving Baseline model: GaussianNB
Saving Baseline model: RandomForest
Saving Baseline model: XGBoost
Saving Baseline model: MLP
Saving New Logic model: LogisticRegression
Saving New Logic model: GaussianNB
Saving New Logic model: RandomForest
Saving New Logic model: XGBoost
Saving New Logic model: MLP
✅ Saved vectorizers, mlb, and all models.


In [None]:
%%writefile app.py
import streamlit as st
import pickle
import numpy as np
import pandas as pd
from collections import Counter

# Page config goes first: Streamlit requires it to be the first Streamlit
# command executed in the script.
st.set_page_config(page_title="HGSC Disease predictor", layout="wide")

MODEL_NAMES = ['LogisticRegression', 'GaussianNB', 'RandomForest', 'MLP', 'XGBoost']


def _load_pickle(path):
    """Unpickle one artifact, closing the file handle as soon as it is read.

    NOTE: unpickling can execute arbitrary code. Only point this at files
    written by the training notebook, never at user-supplied uploads.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


@st.cache_resource
def _load_artifacts():
    """Load vectorizers, the label binarizer and every model once per server process.

    Streamlit re-runs this script on each widget interaction; caching keeps
    the ten model files from being re-read on every rerun. Restart the app
    (or clear the cache) after re-exporting models.
    """
    vec_base = _load_pickle('base_line_model/vec_base.pkl')
    vec_new = _load_pickle('new_logic/vec_new.pkl')
    mlb = _load_pickle('base_line_model/mlb.pkl')
    models_base = {name: _load_pickle(f'base_line_model/{name}.pkl') for name in MODEL_NAMES}
    models_new = {name: _load_pickle(f'new_logic/{name}.pkl') for name in MODEL_NAMES}
    return vec_base, vec_new, mlb, models_base, models_new


# Load vectorizers and models
vec_base, vec_new, mlb, models_base, models_new = _load_artifacts()

st.title("🔬 Gene Multi‑Label Classifier using k-mer + ML Models")

st.markdown("Enter a DNA sequence (A/C/G/T) to predict gene mutations using different models.")

seq = st.text_area("🧬 Paste DNA Sequence Below", height=150)

def kmer_freqs(seq, k=6):
    """Return a {kmer: count} dict over every length-``k`` substring of ``seq``.

    Keys appear in order of first occurrence. A sequence shorter than ``k``
    yields an empty dict.
    """
    counts = {}
    for start in range(len(seq) - k + 1):
        kmer = seq[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts

def predict_all_models(seq, vec, models, label, threshold=2):
    """Run every model in ``models`` on ``seq`` and return one result row per model.

    The sequence is turned into 6-mer counts and vectorized with ``vec``.
    Each row holds the 0/1 prediction per gene (names from ``mlb.classes_``)
    plus:
      - 'HGSC': 1 when at least ``threshold`` genes are predicted, else 0
      - 'Confidence': fraction of genes predicted positive, rounded to 3 places
      - 'Model': the model's name
      - 'Type': the ``label`` passed in
    """
    X_vec = vec.transform([kmer_freqs(seq, k=6)])
    genes = mlb.classes_

    results = []
    for name, model in models.items():
        y_pred = model.predict(X_vec)[0]
        n_positive = y_pred.sum()

        row = {gene: flag for gene, flag in zip(genes, y_pred.astype(int))}
        row['HGSC'] = int(n_positive >= threshold)
        row['Confidence'] = round(n_positive / len(genes), 3)
        row['Model'] = name
        row['Type'] = label
        results.append(row)
    return results

if st.button("🔍 Classify Sequence"):
    # Clean the pasted text: drop FASTA header lines ('>...'), remove every
    # kind of whitespace (spaces, tabs, \r, \n) and upper-case the bases.
    sequence_lines = [ln for ln in seq.splitlines() if not ln.lstrip().startswith(">")]
    s = "".join("".join(sequence_lines).split()).upper()
    if len(s) < 6:
        st.error("Sequence too short. Please input at least 6 base pairs.")
    else:
        # k-mers the vectorizers have never seen are silently dropped at
        # transform time, so tell the user when unexpected symbols are present.
        unexpected = sorted(set(s) - set("ACGT"))
        if unexpected:
            st.warning(
                "Sequence contains characters other than A/C/G/T "
                f"({', '.join(unexpected)}); k-mers containing them may be ignored."
            )
        with st.spinner("Running predictions..."):
            base_results = predict_all_models(s, vec_base, models_base, "Baseline")
            new_results  = predict_all_models(s, vec_new, models_new, "New Logic")

            df_all = pd.DataFrame(base_results + new_results)

            st.subheader("📊 Classification Results Table")
            st.dataframe(df_all.set_index(['Type', 'Model']))


Overwriting app.py


In [None]:
from pyngrok import ngrok
import subprocess
import time

# Kill any existing process
!fuser -k 8501/tcp

# Set your new working ngrok token
ngrok.set_auth_token  ("2xHG91dZ6Zfk4XVbkg12NjsNzuw_5nn4kfz7PznQYtnTf1KKy")

# Start Streamlit in background
process = subprocess.Popen(["streamlit", "run", "app.py"])

# Wait for the server to boot up
time.sleep(10)

# Create public URL
public_url = ngrok.connect(8501)
print("🔗 Public URL:", public_url)


🔗 Public URL: NgrokTunnel: "https://e395-34-19-31-243.ngrok-free.app" -> "http://localhost:8501"


download the models


In [None]:
import shutil
from google.colab import files

# Folders to export. These are the same relative directory names the save
# cell creates, so the archives are built from wherever the models were
# actually written rather than from a hard-coded absolute path.
model_dirs = ['base_line_model', 'new_logic']

for model_dir in model_dirs:
    # make_archive appends '.zip' to the base name and returns the archive path.
    archive_path = shutil.make_archive(model_dir, 'zip', model_dir)
    files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>