In [None]:
# Quick ETL demo analytics: per-table row counts, the genes with the most
# pathogenic variants, and a toy gene-level feature table with a dummy target.
#
# This cell previously also contained pasted export residue (bare `code` /
# `python` tokens, `#VSC-clean-N` markers and a truncated f-string) that made
# it unparseable; only the logic that produces the output shown below is kept.
import sqlite3
from contextlib import closing

import pandas as pd

DB_PATH = 'data/epic_synth.db'
# Tables whose row counts are reported. The names are fixed literals, so
# interpolating them into the SQL text below is safe (identifiers cannot be
# bound as query parameters).
TABLES = ['person', 'observation', 'condition', 'visit_occurrence',
          'measurement', 'variant_pathogenic', 'variant_vrs']
# Dummy-target cutoff: genes with MORE pathogenic variants than this get target = 1.
PATHOGENIC_COUNT_THRESHOLD = 50

# closing() releases the connection even when a query raises; a bare
# `with sqlite3.connect(...)` would only manage the transaction, not close it.
with closing(sqlite3.connect(DB_PATH)) as conn:
    # Row count per table.
    counts = {t: conn.execute(f"select count(*) from {t}").fetchone()[0] for t in TABLES}

    # Ten genes with the most pathogenic variants (count column is named `c`).
    q = "select GeneSymbol, count(*) as c from variant_pathogenic group by GeneSymbol order by c desc limit 10"
    top_genes = pd.read_sql(q, conn)

    # Tiny ML demo skeleton: one row per GeneSymbol with its pathogenic variant count.
    # NOTE: GeneSymbol is used verbatim, so placeholder values such as '-' and
    # ';'-joined multi-gene symbols form their own groups.
    vg = pd.read_sql('select GeneSymbol, count(*) as pathogenic_count from variant_pathogenic group by GeneSymbol', conn)

# Simple dummy target derived directly from the count feature.
vg['target'] = (vg['pathogenic_count'] > PATHOGENIC_COUNT_THRESHOLD).astype(int)

print(counts)
print('\nTop genes by pathogenic variants:')
print(top_genes)
print('\nML sample head:')
print(vg.head())

# TODO: variant_index / patient_variant join demo (was only a placeholder here).
# Model training and saving live in the "ML train/validate demo" cell.

{'person': 116, 'observation': 64701, 'condition': 4507, 'visit_occurrence': 6277, 'measurement': 50621, 'variant_pathogenic': 457731, 'variant_vrs': 421145}

Top genes by pathogenic variants:
  GeneSymbol      c
0      BRCA2  10505
1        NF1   9234
2      BRCA1   8171
3        ATM   6230
4       FBN1   4794
5        APC   4656
6        DMD   4556
7       MSH6   3990
8       MSH2   3924
9       MLH1   3284



ML sample head:
                                          GeneSymbol  pathogenic_count  target
0                                                  -               706       1
1  A-GAMMA3'E;BGLT3;HBE1;HBG1;HBG2;HS-E1;LOC10609...                 1       0
2                                             A4GALT                 3       0
3                                     A4GALT;ARFGAP3                 1       0
4                                               AAAS               146       1


In [None]:
# ML train/validate demo
# Builds a one-feature gene-level table from SQLite, fits a small scikit-learn
# pipeline (standardize -> logistic regression), prints hold-out metrics, and
# saves the fitted pipeline to disk.
import sqlite3  # imported here so the cell also runs on a fresh kernel
from contextlib import closing

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

DB_PATH = 'data/epic_synth.db'
MODEL_PATH = 'data/gene_level_model.joblib'
# Genes with MORE pathogenic variants than this are labelled 1.
PATHOGENIC_COUNT_THRESHOLD = 50
GENE_COUNTS_SQL = 'select GeneSymbol, pathogenic_count as pathogenic_count from (select GeneSymbol, count(*) as pathogenic_count from variant_pathogenic group by GeneSymbol)'

# Load the gene-level counts; the connection is only needed for this read, so
# it is closed immediately (and also on error) instead of at the end of the cell.
with closing(sqlite3.connect(DB_PATH)) as conn:
    vg = pd.read_sql(GENE_COUNTS_SQL, conn)

# Single numeric feature and the dummy binary target.
# NOTE: the target is a deterministic threshold on the only feature, so the
# near-perfect scores printed below demonstrate the plumbing, not real
# predictive power.
X = vg[['pathogenic_count']].fillna(0)
y = (vg['pathogenic_count'] > PATHOGENIC_COUNT_THRESHOLD).astype(int)

# Stratified 80/20 split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]  # probability of the positive class
print(classification_report(y_test, y_pred))
try:
    print('ROC AUC:', roc_auc_score(y_test, y_proba))
except ValueError as exc:
    # Best-effort metric: report why it is unavailable instead of hiding the
    # failure (e.g. only one class present in y_test).
    print('ROC AUC unavailable:', exc)

# Save the fitted pipeline to disk.
joblib.dump(pipe, MODEL_PATH)
print(f'model saved to {MODEL_PATH}')

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2017
           1       1.00      0.95      0.98       294

    accuracy                           0.99      2311
   macro avg       1.00      0.98      0.99      2311
weighted avg       0.99      0.99      0.99      2311

ROC AUC: 1.0
model saved to data/gene_level_model.joblib


# ETL Demo
Run the ETL to load FHIR JSON into SQLite and preview tables.

In [None]:
# Example ETL run: data/fhir -> data/epic_synth.db.
# process_fhir_dir receives the two paths positionally, in that order.
from epic_etl import run_etl

fhir_dir = 'data/fhir'
db_path = 'data/epic_synth.db'
run_etl.process_fhir_dir(fhir_dir, db_path)
print('ETL completed')

ETL completed


In [None]:
# Preview the first five rows of the person table.
import sqlite3
import pandas as pd

# NOTE(review): `conn` is left open, as in the original cell; close it once
# no later cell needs it.
conn = sqlite3.connect('data/epic_synth.db')
person_preview = pd.read_sql('SELECT * FROM person LIMIT 5', conn)
print(person_preview)

                              person_id              given_name  family_name  \
0  09670eb9-5b42-a3ea-19d1-8b9cbe1b7643                 Alma679     Kunde533   
1  2087502f-d58c-670a-0d38-f319d948f707  Marketta481 Nereida276    Renner328   
2  0fc6af72-bf86-5d2e-6591-875fb9f3861f              Julissa825  Hermiston71   
3  cb864ad2-cc4c-638f-41aa-143abf3b40d4                 Thad495     Borer986   
4  2e584b4a-b12f-83cb-ca94-aac121f53d30   Franklin857 Garret233    Cronin387   

   gender  birth_date  
0  female  2016-03-28  
1  female  1961-05-31  
2  female  1999-07-21  
3    male  1942-09-30  
4    male  1942-09-16  
