In [None]:
# quick ETL demo analytics
# Prints row counts for the core tables, the ten genes with the most rows in
# variant_pathogenic, and the head of a small gene-level feature table.
import sqlite3
from contextlib import closing

import pandas as pd

# closing() guarantees the connection is released even if a query raises
# (sqlite3's own context manager only manages transactions, it does not close).
with closing(sqlite3.connect('data/epic_synth.db')) as conn:
    # counts: table name -> number of rows
    counts = {
        t: conn.execute(f"select count(*) from {t}").fetchone()[0]
        for t in ['person','observation','condition','visit_occurrence','measurement','variant_pathogenic','variant_vrs']
    }
    print(counts)

    # top genes by pathogenic variant count
    q = "select GeneSymbol, count(*) as c from variant_pathogenic group by GeneSymbol order by c desc limit 10"
    top_genes = pd.read_sql(q, conn)
    print('\nTop genes by pathogenic variants:')
    print(top_genes)

    # tiny ML demo skeleton: build a table of gene-level features (pathogenic variant counts) and a dummy target
    vg = pd.read_sql('select GeneSymbol, count(*) as pathogenic_count from variant_pathogenic group by GeneSymbol', conn)

# simple target: genes with > 50 pathogenic variants flagged as 1
vg['target'] = (vg['pathogenic_count'] > 50).astype(int)
print('\nML sample head:')
print(vg.head())

In [None]:
# Variant index and patient-join demo
# Create a compact variant_index table from variant_vrs and a small synthetic
# patient_variant mapping for demo joins.
import sqlite3
import random  # not used in this cell; kept in case later cells rely on it being imported
from contextlib import closing

import pandas as pd

# variant_index: one row per (vrs_id, GeneSymbol) with its row count in variant_vrs.
# IF NOT EXISTS means an already-existing variant_index is left untouched on re-run.
create_variant_index_sql = '''
CREATE TABLE IF NOT EXISTS variant_index AS
SELECT vrs_id, GeneSymbol, count(*) as n_occurrences
FROM variant_vrs
GROUP BY vrs_id, GeneSymbol
'''

# example join: list persons with their mapped variants' n_occurrences via variant_index
q = '''
select p.rowid as person_id, p.* , vi.n_occurrences
from person p
join patient_variant pv on pv.person_id = p.rowid
join variant_index vi on vi.vrs_id = pv.vrs_id
limit 20
'''

# closing() releases the connection even if one of the statements below raises.
with closing(sqlite3.connect('data/epic_synth.db')) as conn:
    cur = conn.cursor()
    cur.execute(create_variant_index_sql)
    conn.commit()

    # Rebuild the demo mapping from scratch so re-running the cell does not
    # accumulate duplicate rows.
    cur.execute('CREATE TABLE IF NOT EXISTS patient_variant (person_id INTEGER, vrs_id TEXT)')
    cur.execute('DELETE FROM patient_variant')

    # pick up to 100 vrs_ids
    v = cur.execute('select vrs_id from variant_vrs limit 100').fetchall()
    persons = [r[0] for r in cur.execute('select rowid from person').fetchall()]

    if persons:
        # Assign variants to persons round-robin (deterministic, not random):
        # variant i goes to person i modulo the number of persons.
        mapping_rows = [(persons[i % len(persons)], vrs) for i, (vrs,) in enumerate(v)]
        cur.executemany('insert into patient_variant (person_id,vrs_id) values (?,?)', mapping_rows)
    else:
        # Without this guard the modulo above would raise ZeroDivisionError.
        print('person table is empty; patient_variant mapping left empty')
    conn.commit()

    print(pd.read_sql(q, conn))

In [None]:
# ML train/validate demo
# This cell prepares training data from the gene-level table and runs a small sklearn pipeline.
import sqlite3  # imported here so the cell does not depend on an earlier cell having run
from contextlib import closing

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# load gene-level features (per-gene row counts from variant_pathogenic);
# the connection is only needed for this read, so it is closed right after it.
with closing(sqlite3.connect('data/epic_synth.db')) as conn:
    vg = pd.read_sql('select GeneSymbol, pathogenic_count as pathogenic_count from (select GeneSymbol, count(*) as pathogenic_count from variant_pathogenic group by GeneSymbol)', conn)

# create numeric feature
X = vg[['pathogenic_count']].fillna(0)
# NOTE: the target is a threshold on the only feature, so the scores below
# exercise the pipeline end to end but say nothing about real predictive power.
y = (vg['pathogenic_count'] > 50).astype(int)

# fixed random_state keeps the split reproducible; stratify keeps the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:,1]  # probability of the positive class
print(classification_report(y_test, y_pred))
try:
    print('ROC AUC:', roc_auc_score(y_test, y_proba))
except ValueError as exc:
    # e.g. only one class present in y_test; report why instead of hiding it
    print('ROC AUC unavailable:', exc)

# save model to disk
joblib.dump(pipe, 'data/gene_level_model.joblib')
print('model saved to data/gene_level_model.joblib')

# ETL Demo
Run the ETL to load the FHIR JSON files under `data/fhir` into the SQLite database `data/epic_synth.db`, then preview the loaded tables.

In [4]:
# Example: run the ETL process on data/fhir -> data/epic_synth.db
from epic_etl import run_etl

# First argument is the FHIR directory, second is the SQLite database file.
fhir_dir, db_path = 'data/fhir', 'data/epic_synth.db'
run_etl.process_fhir_dir(fhir_dir, db_path)
print('ETL completed')

ModuleNotFoundError: No module named 'epic_etl'

In [None]:
# Preview the first five rows of the person table.
import sqlite3
from contextlib import closing

import pandas as pd

# closing() ensures the connection is released once the preview is printed;
# the original cell opened it and never closed it.
with closing(sqlite3.connect('data/epic_synth.db')) as conn:
    print(pd.read_sql('SELECT * FROM person LIMIT 5', conn))