In [None]:
import pyspark
import pandas as pd
import dxpy
import dxdata
import numpy as np
import matplotlib.pyplot as plt
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
import seaborn as sns
import random
# Configure Bokeh (imported above from bokeh.io) to render its output inline
# in this notebook.
output_notebook()

In [None]:
# Obtain the SparkContext for this kernel. getOrCreate() returns the already
# running context when there is one, so re-running this cell no longer fails
# with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

# SQL/DataFrame entry point bound to the same context.
spark = pyspark.sql.SparkSession(sc)

In [None]:
import hail as hl

# Start Hail on top of the SparkContext created above, with GRCh38 as the
# default reference genome. idempotent=True turns a repeated call into a no-op,
# so re-running this cell does not raise because Hail is already initialised.
hl.init(sc=sc, default_reference='GRCh38', idempotent=True)

In [None]:
# Look up the DNAnexus database object by name, then read the Hail table
# `all_presc.ht` stored inside it.
db_name = "mdd_db"
db_record = dxpy.find_one_data_object(name=db_name, classname="database")
db_uri = db_record["id"]  # object id of the database, used to build the dnax:// URL

url = f"dnax://{db_uri}/all_presc.ht"
full = hl.read_table(url)

In [None]:
# Show the schema of the loaded table as a quick check of which fields are
# available (eid, term and system are used in the cells below).
full.describe()

In [None]:
# Count how many rows each `eid` has in the table, then keep only the counts
# (one entry per distinct eid) for the summary statistics below.
rows_by_eid = dict(full.aggregate(hl.agg.counter(full.eid)))
records_per_person = list(rows_by_eid.values())

In [None]:
# Mean number of prescription records per person.
np.mean(records_per_person)

In [None]:
# Median number of prescription records per person.
np.median(records_per_person)

In [None]:
# Histogram of the per-person record counts in bins of width 10. The x-axis is
# limited to 0-250, so any larger counts fall outside the visible range.
hist_ax = sns.histplot(data=records_per_person, binwidth=10)
hist_ax.set_xlabel('Number of prescriptions')
hist_ax.set_xlim(0, 250)
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Histogram of number of prescriptions per person')
plt.show()

In [None]:
# Number of rows for each distinct value of the `system` field.
full.aggregate(hl.agg.counter(full.system))

In [None]:
# Lower-case the drug term so that case variants of the same name are counted
# together. Lower-casing is idempotent, so re-running this cell is harmless.
full = full.annotate(term=full.term.lower())

# Number of rows per drug term, most frequent first.
term_counts = full.aggregate(hl.agg.counter(full.term))
counts_per_drug = pd.DataFrame(
    list(term_counts.items()), columns=['term', 'count']
).sort_values(by='count', ascending=False)

# Assign one fixed colour per term. `palette_terms` is reused by the
# persons-per-drug plot so that each drug keeps the same colour in both charts.
color_palette = sns.color_palette("husl", len(counts_per_drug))
palette_terms = dict(zip(counts_per_drug['term'], color_palette))
colors = [palette_terms[term] for term in counts_per_drug['term']]

plt.figure(figsize=(12, 8))
bar_plot = sns.barplot(data=counts_per_drug, x='term', y='count', palette=colors)

# Label every bar. Depending on the seaborn version the bars may be split
# across several containers, so iterate over all of them (as the
# persons-per-drug plot does) instead of labelling only containers[0].
for container in bar_plot.containers:
    bar_plot.bar_label(container)

plt.xlabel('Drugs')
plt.ylabel('Number of prescriptions')
plt.title('Number of prescriptions per drugs')
plt.xticks(rotation=90)
plt.show()

In [None]:
# For every drug term, collect the set of distinct eids that have at least one
# row with that term, then store the size of that set as the number of persons.
grouped_by_drug = full.group_by(full.term).aggregate(
    unique_eids=hl.agg.collect_as_set(full.eid)
)
counts_person_per_drug = grouped_by_drug.annotate(
    num_unique_eids=hl.len(grouped_by_drug.unique_eids)
)

# NOTE(review): to_pandas() also transfers the full `unique_eids` sets; if only
# the counts are needed downstream, dropping that field first would be cheaper.
counts_person_per_drug_pd = counts_person_per_drug.to_pandas().sort_values(
    by='num_unique_eids', ascending=False
)

# Reuse the per-term colours defined for the prescriptions-per-drug plot so a
# given drug has the same colour in both charts.
colors_person = [palette_terms[drug] for drug in counts_person_per_drug_pd['term']]

plt.figure(figsize=(12, 8))
bar_plot = sns.barplot(
    data=counts_person_per_drug_pd,
    x='term',
    y='num_unique_eids',
    palette=colors_person,
)
bar_plot.set(
    xlabel='Drugs',
    ylabel='Number of persons',
    title='Number of persons per drug',
)
bar_plot.set_xticklabels(bar_plot.get_xticklabels(), rotation=90, ha='right')

# Write the value on top of each bar.
for bars in bar_plot.containers:
    bar_plot.bar_label(bars)

plt.tight_layout()
plt.show()