In [None]:
import pyspark
import pandas as pd
import dxpy
import dxdata
import numpy as np
import matplotlib.pyplot as plt
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
import seaborn as sns
import random
import re
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [None]:
# Reuse the kernel's active SparkContext if there is one. Constructing a second
# SparkContext in the same process raises, which made this cell fail on re-run.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [None]:
import hail as hl
# Attach Hail to the existing SparkContext. idempotent=True turns a repeated
# call (cell re-run in the same kernel) into a no-op instead of an error.
hl.init(sc=sc, default_reference='GRCh38', idempotent=True)

In [None]:
# Look up the database object named `db_name`, build a dnax:// URI to the
# `all_presc.ht` table stored under it, and open that table with Hail.
db_name = "mdd_db"
db_uri = dxpy.find_one_data_object(name=f"{db_name}", classname="database")['id']
url = f"dnax://{db_uri}/all_presc.ht"
full = hl.read_table(url)

In [None]:
# Print the schema of the full table.
full.describe()

In [None]:
# Subset of rows whose `system` field equals 'bnf', followed by its row count.
bnf = full.filter(full.system == 'bnf')
bnf.count()

In [None]:
def extract_dose(trade_name):
    """Extract a dose, as a whole number of milligrams, from a free-text description.

    Patterns are tried in this order (case-insensitively); the first hit wins:

    1. ``<x> mg/<y> ml`` -> ``x / y`` (mg per ml)
    2. ``<x> mg/mL``     -> ``x``
    3. ``<x> mg``        -> ``x`` (followed by a word boundary or ``_``)
    4. ``<x> MGS``       -> ``x`` (followed by a word boundary or ``_``)

    Decimal values are accepted everywhere and the result is truncated towards
    zero with ``int()`` so that every returned dose has the same type.

    Args:
        trade_name: Text to scan. Non-string input (e.g. ``None``) yields ``None``.

    Returns:
        The dose as an ``int``, or ``None`` when no pattern matches or the
        ``mg/<y> ml`` form has a zero volume.
    """
    if not isinstance(trade_name, str):
        return None

    flags = re.IGNORECASE

    # "x mg / y ml": the dose is the concentration x / y.
    ratio = re.search(r'(\d+\.?\d*)\s?mg/(\d+\.?\d*)\s?ml\b', trade_name, flags)
    if ratio:
        mg_value = float(ratio.group(1))
        ml_value = float(ratio.group(2))
        if ml_value == 0:
            # A zero volume has no defined concentration; treat it as unparsable.
            return None
        return int(mg_value / ml_value)

    # Remaining forms carry the dose directly in the single capture group.
    single_value_patterns = (
        r'(\d+\.?\d*)\s?mg/mL\b',
        r'(\d+\.?\d*)\s?[mM][gG](?:\b|_)',
        r'(\d+\.?\d*)\s?MGS(?:\b|_)',
    )
    for pattern in single_value_patterns:
        found = re.search(pattern, trade_name, flags)
        if found:
            # Go through float first: the capture may be "2.5", which int() rejects.
            return int(float(found.group(1)))

    return None

In [None]:
# Distinct `info` strings present in the BNF rows, materialised as a Python list.
unique_infos = list(bnf.aggregate(hl.agg.collect_as_set(bnf.info)))

In [None]:
# Map every distinct info string to the dose parsed from it (None when unparsable).
bnf_dose_dict = dict(zip(unique_infos, map(extract_dose, unique_infos)))

In [None]:
# Add a `dose` field by looking each row's `info` up in the precomputed mapping.
bnf = bnf.annotate(dose=hl.literal(bnf_dose_dict).get(bnf.info))

In [None]:
# Number of BNF rows whose `dose` is missing.
bnf.aggregate(hl.agg.count_where(hl.is_missing(bnf.dose)))

In [None]:
# Print the BNF table schema after adding `dose`.
bnf.describe()

In [None]:
# Ordered extraction rules as (compiled regex, capture-group indices).
# The first regex that matches decides the result: the integer values of the
# listed groups are multiplied together (one group -> that number, two groups
# -> e.g. packs * pack size). All regexes are case-insensitive, so variants
# differing only in letter case (" X ", "Packs of", ...) need no separate entry.
_TABLET_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), groups)
    for pattern, groups in (
        # Multiplicative forms: "<a> packs of <b>", "<a> x <b>", "<a>*<b>", ...
        (r'(\d+) packs of (\d+)', (1, 2)),
        (r'(\d+) x (\d+)', (1, 2)),
        (r'(\d+)x(\d+)', (1, 2)),
        (r'(\d+) pack of (\d+)', (1, 2)),
        (r'(\d+)\*(\d+)', (1, 2)),
        # Number directly before a tablet/capsule keyword.
        (r'(\d+)\s?(?:tabs?|tablets?|tab?| - tablet?|cap)\b', (1,)),
        (r'capsules_\[(\d+)\]', (1,)),
        # Keyword directly before the number.
        (r'(oral solution sugar free_?|cap_?|caps s/r_?|tablet_?|capsules_?|days=?|days = ?|CAPS_?|tablets_mitte x ?|Tabs_?|oral solution_)(\d+)', (2,)),
        (r'MG_(-?\d+)', (1,)),
        (r'tab_(-?\d+)', (1,)),
        (r'MGS_(-?\d+)', (1,)),
        (r'ml_(\d+)', (1,)),
        # "(<manufacturer>)_<n>".
        (r'\((Teva UK Ltd|Dexcel-Pharma Ltd|Bluefish Pharmaceuticals AB|Morningside Healthcare Ltd|Aspire Pharma Ltd|Ethypharm UK Ltd|Dallas Burston Ashbourne Ltd|Mercury Pharma Group Ltd|Wyeth Pharmaceuticals|Actavis UK Ltd|Merck Sharp & Dohme Ltd|Organon Laboratories Ltd|AMCo|Wockhardt UK Ltd|Pfizer Ltd|Novartis Pharmaceuticals UK Ltd|Lundbeck Ltd|Zentiva|Eli Lilly and Company Ltd|GlaxoSmithKline UK Ltd|DB Ashbourne Ltd|Sanofi)\)_(\d+).*', (2,)),
        # "tablets_..." / "capsules_..." variants.
        (r'tablets_x (\d+).*', (1,)),
        (r'tablets_x(\d+).*', (1,)),
        (r'tablets_(\d+).*', (1,)),
        (r'tablets_mitte (\d+).*', (1,)),
        (r'tablets_\[(\d+)\]', (1,)),
        (r'(tablets_|capsules_)\((\d+)', (2,)),
        # Last resorts: "<n> days", "<n> capsules", "<n>ml", "<n> ml".
        (r'(\d+) (days?|tablet)', (1,)),
        (r'(\d+) (capsules|capsule)', (1,)),
        (r'(\d+)(millilitres?|ml|mls)', (1,)),
        (r'(\d+) (millilitres?|ml|mls)', (1,)),
    )
)


def extract_tablets(trade_name):
    """Extract a prescribed quantity from a free-text description.

    The rules in ``_TABLET_RULES`` are tried in order and the leftmost match of
    the first matching rule is used.

    Args:
        trade_name: Text to scan (must be a ``str``).

    Returns:
        A Python ``int`` when a rule matches (it can be negative for the
        ``MG_``/``tab_``/``MGS_`` rules, which accept a leading ``-``).
        Otherwise the Hail expression ``hl.missing(hl.tstr)``; callers that run
        the result through ``hl.eval`` get ``None`` for it.
    """
    for regex, groups in _TABLET_RULES:
        found = regex.search(trade_name)
        if found:
            quantity = 1
            for index in groups:
                quantity *= int(found.group(index))
            return quantity
    # Kept as a Hail expression because existing callers wrap the return value
    # in hl.eval(), which cannot evaluate a bare Python None.
    return hl.missing(hl.tstr)

In [None]:
# Map every distinct BNF info string to its extracted quantity (None if none found).
bnf_tabs_dict = {}
for info in unique_infos:
    quantity = extract_tablets(info)
    # extract_tablets returns a plain int on a match; only a Hail expression
    # (its "no match" result) needs hl.eval. Skipping hl.eval for ints avoids
    # one backend execution per distinct string.
    if isinstance(quantity, hl.expr.Expression):
        quantity = hl.eval(quantity)
    bnf_tabs_dict[info] = quantity

In [None]:
# Attach the extracted quantity as `tablets`, then tabulate (as a pandas frame,
# most frequent first) the info strings for which no quantity was found.
bnf = bnf.annotate(tablets=hl.literal(bnf_tabs_dict).get(bnf.info))
missing_tablets = bnf.filter(hl.is_missing(bnf.tablets))
count_info = (
    missing_tablets
    .group_by(missing_tablets.info)
    .aggregate(count=hl.agg.count())
    .to_pandas()
    .sort_values(by='count', ascending=False)
)

In [None]:
# Number of BNF rows whose `tablets` (quantity) is missing.
bnf.aggregate(hl.agg.count_where(hl.is_missing(bnf.tablets)))

In [None]:
# Subset of rows whose `system` field equals 'read_2', followed by its row count.
read_2 = full.filter(full.system == 'read_2')
read_2.count()

In [None]:
# Build a Hail dictionary literal read_code -> parsed dose from the lookup CSV.
# Codes whose `dose` text is NaN are left out before parsing.
read_2_df = pd.read_csv('../data/read_name_drug.csv', dtype=str)
read_2_dose_dict = hl.literal({
    code: extract_dose(dose_text)
    for code, dose_text in read_2_df.set_index('read_code')['dose'].to_dict().items()
    if not pd.isna(dose_text)
})

In [None]:
# Add a `dose` field by looking each row's `code` up in the code -> dose literal.
read_2 = read_2.annotate(dose=read_2_dose_dict.get(read_2.code))

In [None]:
# Fallback for rows that got no dose from the code lookup: parse the dose out of
# the row's free-text `info` instead.
missing_dose_infos = read_2.filter(hl.is_missing(read_2.dose))
unique_infos = missing_dose_infos.aggregate(hl.agg.collect_as_set(missing_dose_infos.info))
unique_infos = list(unique_infos)
# Stored under its own name: this mapping is keyed by info text, whereas
# read_2_dose_dict is keyed by code. Overwriting read_2_dose_dict here would make
# a re-run of the code-lookup cell silently use the wrong dictionary.
read_2_info_dose_dict = hl.literal({info: extract_dose(info) for info in unique_infos})
read_2 = read_2.annotate(dose=hl.or_else(read_2.dose, read_2_info_dose_dict.get(read_2.info)))

In [None]:
# Number of read_2 rows whose `dose` is still missing after both lookups.
read_2.aggregate(hl.agg.count_where(hl.is_missing(read_2.dose)))

In [None]:
# Map every distinct read_2 info string to its extracted quantity (None if none found).
unique_infos = read_2.aggregate(hl.agg.collect_as_set(read_2.info))
read_2_tabs_dict = {}
for info in unique_infos:
    quantity = extract_tablets(info)
    # extract_tablets returns a plain int on a match; only a Hail expression
    # (its "no match" result) needs hl.eval. Skipping hl.eval for ints avoids
    # one backend execution per distinct string.
    if isinstance(quantity, hl.expr.Expression):
        quantity = hl.eval(quantity)
    read_2_tabs_dict[info] = quantity

In [None]:
# Attach the extracted quantity as `tablets`, then tabulate (as a pandas frame,
# most frequent first) the info strings for which no quantity was found.
read_2 = read_2.annotate(tablets=hl.literal(read_2_tabs_dict).get(read_2.info))
missing_tablets = read_2.filter(hl.is_missing(read_2.tablets))
count_info = (
    missing_tablets
    .group_by(missing_tablets.info)
    .aggregate(count=hl.agg.count())
    .to_pandas()
    .sort_values(by='count', ascending=False)
)

In [None]:
# Number of read_2 rows whose `tablets` (quantity) is missing.
read_2.aggregate(hl.agg.count_where(hl.is_missing(read_2.tablets)))

In [None]:
# Subset of rows whose `system` field equals 'read_3' (held in the variable
# `ctv3` from here on), followed by its row count.
ctv3 = full.filter(full.system == 'read_3')
ctv3.count()

In [None]:
# Build a Hail dictionary literal read_code -> parsed dose from the lookup CSV.
# Codes whose `dose` text is NaN are left out before parsing.
ctv3_df = pd.read_csv('../data/ctv3_drug.csv', dtype=str)
ctv3_dose_dict = hl.literal({
    code: extract_dose(dose_text)
    for code, dose_text in ctv3_df.set_index('read_code')['dose'].to_dict().items()
    if not pd.isna(dose_text)
})

In [None]:
# Add a `dose` field by looking each row's `code` up in the code -> dose literal.
# Unlike read_2 and dmd, no info-text fallback or `tablets` field is added for ctv3.
ctv3 = ctv3.annotate(dose=ctv3_dose_dict.get(ctv3.code))

In [None]:
# Number of ctv3 rows whose `dose` is missing.
ctv3.aggregate(hl.agg.count_where(hl.is_missing(ctv3.dose)))

In [None]:
# Subset of rows whose `system` field equals 'dmd', followed by its row count.
dmd = full.filter(full.system == 'dmd')
dmd.count()

In [None]:
# Build a Hail dictionary literal dmd_code -> parsed dose from the lookup CSV.
# Codes whose `dose` text is NaN are left out before parsing.
dmd_df = pd.read_csv('../data/dmd_name.csv', dtype=str)
dmd_dose_dict = hl.literal({
    code: extract_dose(dose_text)
    for code, dose_text in dmd_df.set_index('dmd_code')['dose'].to_dict().items()
    if not pd.isna(dose_text)
})

In [None]:
# Add a `dose` field by looking each row's `code` up in the code -> dose literal.
dmd = dmd.annotate(dose=dmd_dose_dict.get(dmd.code))

In [None]:
# Fallback for rows that got no dose from the code lookup: parse the dose out of
# the row's free-text `info` instead.
missing_dose_infos = dmd.filter(hl.is_missing(dmd.dose))
unique_infos = missing_dose_infos.aggregate(hl.agg.collect_as_set(missing_dose_infos.info))
unique_infos = list(unique_infos)
# Stored under its own name: this mapping is keyed by info text, whereas
# dmd_dose_dict is keyed by code. Overwriting dmd_dose_dict here would make a
# re-run of the code-lookup cell silently use the wrong dictionary.
dmd_info_dose_dict = hl.literal({info: extract_dose(info) for info in unique_infos})
dmd = dmd.annotate(dose=hl.or_else(dmd.dose, dmd_info_dose_dict.get(dmd.info)))

In [None]:
# Number of dmd rows whose `dose` is still missing after both lookups.
dmd.aggregate(hl.agg.count_where(hl.is_missing(dmd.dose)))

In [None]:
# Map every distinct dmd info string to its extracted quantity (None if none found).
unique_infos = dmd.aggregate(hl.agg.collect_as_set(dmd.info))
dmd_tabs_dict = {}
for info in unique_infos:
    quantity = extract_tablets(info)
    # extract_tablets returns a plain int on a match; only a Hail expression
    # (its "no match" result) needs hl.eval. Skipping hl.eval for ints avoids
    # one backend execution per distinct string.
    if isinstance(quantity, hl.expr.Expression):
        quantity = hl.eval(quantity)
    dmd_tabs_dict[info] = quantity

In [None]:
# Attach the extracted quantity as `tablets` and keep the rows where it is missing.
# NOTE(review): `missing_tablets` is not used again in the cells visible here.
dmd = dmd.annotate(tablets=hl.literal(dmd_tabs_dict).get(dmd.info))
missing_tablets = dmd.filter(hl.is_missing(dmd.tablets))


In [None]:
# Number of dmd rows whose `tablets` (quantity) is missing.
dmd.aggregate(hl.agg.count_where(hl.is_missing(dmd.tablets)))

In [None]:
# Print the final BNF table schema before the union below.
bnf.describe()

In [None]:
# Print the final read_2 table schema before the union below.
read_2.describe()

In [None]:
# Print the final ctv3 table schema before the union below (no `tablets` field was added).
ctv3.describe()

In [None]:
# Print the final dmd table schema before the union below.
dmd.describe()

In [None]:
# Recombine the four per-system tables. unify=True is passed on every union;
# ctv3 was only annotated with `dose`, so its fields differ from the others.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the session. Renaming it needs a matching change in the cell that writes it.
all = bnf.union(read_2, unify=True).union(ctv3, unify=True).union(dmd, unify=True)

In [None]:
# Output location: database name and the table file name to write inside it.
db_name = "mdd_db"
full_tb_name = "all_presc_v2.ht"

# Create the database if it does not exist yet; the statement is printed first
# so the exact SQL that was run is visible in the cell output.
stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'"
print(stmt)

spark.sql(stmt).show()

In [None]:
# Look up the database object's id and build the dnax:// URI of the output table.
db_uri = dxpy.find_one_data_object(name=f"{db_name}", classname="database")['id']
url = f"dnax://{db_uri}/{full_tb_name}"

In [None]:
# Write the combined table to `url`; overwrite=True replaces any existing table there.
# Here `all` is the Hail table built by the union cell, not the Python builtin.
all.write(url, overwrite=True)