# Setup

In [5]:
import pandas as pd
import numpy as np
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [6]:
data_dir = '../core/dataset'

# Import and Load Datasets

In [7]:
drug_ratings_train_df = pd.read_csv(f'{data_dir}/input/drugsComTrain_raw.tsv', sep='\t')
drug_ratings_test_df = pd.read_csv(f'{data_dir}/input/drugsComTest_raw.tsv', sep='\t')

drug_ratings_train_df = drug_ratings_train_df.drop('Unnamed: 0', axis=1)
drug_ratings_test_df = drug_ratings_test_df.drop('Unnamed: 0', axis=1)

drug_ratings_df = pd.concat((drug_ratings_train_df, drug_ratings_test_df))

In [8]:
drug_info_df = pd.read_csv(f'{data_dir}/output/drug.tsv', sep='\t')
drug_all_names_df = pd.read_csv(f'{data_dir}/output/drug-name.tsv', sep='\t')

In [9]:
provider_drug_names_df = pd.read_csv(f'{data_dir}/input/Medicare_Part_D_Prescribers_by_Provider_and_Drug_2019.csv')

In [10]:
all_provider_drug_names = pd.concat((provider_drug_names_df['Brnd_Name'], provider_drug_names_df['Gnrc_Name'])).drop_duplicates()

# Calculate Drug Name Overlap

Here we calculate the pairwise overlap in lowercase drug names between the Prescriber-Drug, drug rating, and drugbank datasets, as well as the overall overlap.

In [41]:
print('Num unique in drug rating train file', len(set(drug_ratings_train_df['drugName'])))
print('Num unique in drug rating test file', len(set(drug_ratings_test_df['drugName'])))
print('Num unique in drug rating file', len(set(drug_ratings_df['drugName'])))
print('Num unique in drug info file (including synonyms)', len(set(drug_all_names_df['Name'])))
print()

drug_info_name_set = set(drug_all_names_df['Name'].str.lower())
ratings_name_set = set(drug_ratings_df['drugName'].str.lower())
provider_drug_name_set = set(all_provider_drug_names.str.lower())
provider_drug_generic_name_set = set(provider_drug_names_df['Gnrc_Name'].str.lower())
provider_drug_brand_name_set = set(provider_drug_names_df['Brnd_Name'].str.lower())

provider_drug_info_overlap = provider_drug_name_set.intersection(drug_info_name_set)
info_ratings_overlap = drug_info_name_set.intersection(ratings_name_set)
provider_drug_ratings_overlap = provider_drug_name_set.intersection(ratings_name_set)
provider_drug_ratings_overlap_generic = provider_drug_generic_name_set.intersection(ratings_name_set)
provider_drug_ratings_overlap_brand = provider_drug_brand_name_set.intersection(ratings_name_set)

all_overlap = provider_drug_name_set.intersection(ratings_name_set).intersection(drug_info_name_set)

print('Overlap ratings v. drug info all names (lowercase):', len(info_ratings_overlap))
print('Overlap provider drug v. drug info all names (lowercase):', len(provider_drug_info_overlap))
print('Overlap provider drug v. ratings generic:', len(provider_drug_ratings_overlap_generic))
print('Overlap provider drug v. ratings brand:', len(provider_drug_ratings_overlap_brand))

print('Overlap provider drug v. ratings all names (lowercase):', len(provider_drug_ratings_overlap))

print('Overlap all:', len(all_overlap))

Num unique in drug rating train file 3436
Num unique in drug rating test file 2637
Num unique in drug rating file 3671
Num unique in drug info file (including synonyms) 42016

Overlap ratings v. drug info all names (lowercase): 1033
Overlap provider drug v. drug info all names (lowercase): 699
Overlap provider drug v. ratings generic: 448
Overlap provider drug v. ratings brand: 1322
Overlap provider drug v. ratings all names (lowercase): 1490
Overlap all: 465


### Drugs by Number of high & low rated reviews

In [60]:
drug_ratings_df[drug_ratings_df['rating'] >= 8][['drugName', 'rating']].groupby(by=['drugName']).count().sort_values(by='rating', ascending=False)[:20]
drug_ratings_df[drug_ratings_df['rating'] <= 3][['drugName', 'rating']].groupby(by=['drugName']).count().sort_values(by='rating', ascending=False)[:20]

# global ratings count by drug name (8-10)

Unnamed: 0_level_0,rating
drugName,Unnamed: 1_level_1
Levonorgestrel,3181
Etonogestrel,1834
Phentermine,1775
Ethinyl estradiol / norethindrone,1523
Escitalopram,1252
Sertraline,1242
Ethinyl estradiol / norgestimate,1171
Nexplanon,1125
Ethinyl estradiol / levonorgestrel,1025
Gabapentin,950


### Drug rating histogram

In [11]:
drug_rating_hist_df = pysqldf("select drugName, rating, count(*) as cnt from drug_ratings_df group by drugName, rating")

Unnamed: 0,drugName,rating,cnt
0,A + D Cracked Skin Relief,10.0,1
1,A / B Otic,10.0,2
2,Abacavir / dolutegravir / lamivudine,1.0,2
3,Abacavir / dolutegravir / lamivudine,3.0,2
4,Abacavir / dolutegravir / lamivudine,4.0,3
...,...,...,...
16646,femhrt,1.0,2
16647,femhrt,8.0,1
16648,femhrt,9.0,1
16649,femhrt,10.0,2


### Drug quantile and count

In [28]:
rating_quantile_df = drug_ratings_df[['drugName', 'rating']].groupby(by='drugName').quantile(0.9)
rating_count_df = drug_ratings_df[['drugName', 'rating']].groupby(by='drugName').count()

rating_quantile_df.merge(rating_count_df, on='drugName')

Unnamed: 0_level_0,rating_x,rating_y
drugName,Unnamed: 1_level_1,Unnamed: 2_level_1
A + D Cracked Skin Relief,10.0,1
A / B Otic,10.0,2
Abacavir / dolutegravir / lamivudine,10.0,70
Abacavir / lamivudine,10.0,3
Abacavir / lamivudine / zidovudine,9.0,1
...,...,...
ZzzQuil,3.4,3
depo-subQ provera 104,9.1,2
ella,10.0,71
femhrt,10.0,6


In [11]:
total_avg_rating = drug_ratings_df['rating'].mean()
drug_rating_avg_df = drug_ratings_df[['drugName', 'rating']].groupby(by='drugName').mean() / total_avg_rating

print(total_avg_rating)

6.990007579174474


In [75]:
condition_count_high_df = pysqldf("select condition, count(*) as cnt from drug_ratings_df where rating >= 8.0 group by condition order by cnt desc")
condition_count_low_df = pysqldf("select condition, count(*) as cnt from drug_ratings_df where rating <= 3.0 group by condition order by cnt desc")

In [None]:
prescriber_ratings_df = pysqldf("select b.*, a.rating, a.condition from drug_ratings_df as a inner join provider_drug_names_df as b on lower(a.drugName) == lower(b.Brnd_Name) or lower(a.drugName) == lower(b.Gnrc_Name)")