In [1]:
import pandas as pd
from wordcloud import WordCloud
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
filename = "data/cleaned.csv"

## Check columns names

In [3]:
med = pd.read_csv(filename, header=0, index_col=0)
med.columns

Index(['Brand Name', 'Generic Name', 'Number of Manufacturers',
       'Total Spending', 'Total Dosage Units', 'Total Claims',
       'Average Spending Per Dosage Unit (Weighted)',
       'Average Spending Per Claim', 'year', 'coverage_type'],
      dtype='object')

In [4]:
med.columns = [col.replace(' ', '_').lower() for col in list(med.columns)]

In [5]:
med.columns

Index(['brand_name', 'generic_name', 'number_of_manufacturers',
       'total_spending', 'total_dosage_units', 'total_claims',
       'average_spending_per_dosage_unit_(weighted)',
       'average_spending_per_claim', 'year', 'coverage_type'],
      dtype='object')

In [6]:
drugs = pd.read_csv('data/drug_info.csv', header=0, index_col=0)
drugs["Brand"] = drugs["Brand"].apply(lambda x: x.strip('*'))

In [57]:
years = [2012, 2013]

In [58]:
for year in years:
    print('\n', year, '\t')
    data_by_year = med[med['year']==year].copy()
    data_by_year.sort_values("total_spending", ascending=False, inplace=True)
    data_by_year['total_spending'] = data_by_year['total_spending'].astype(str).apply(
                                                            lambda x: x[0] + "."+ x[1:3] + " Billion")
    
    print(data_by_year[['brand_name', 'total_spending', 'coverage_type']].head(10))
    names = data_by_year["brand_name"].head(10).apply(lambda x: x.strip().lower()).unique()
    text = ''.join(drugs[drugs["Brand"].isin(names)].Uses)
    wc = WordCloud().generate(text)
    
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()


 2012 	
                 brand_name total_spending coverage_type
58389                nexium   2.12 Billion      medicare
56722         advair diskus   1.88 Billion      medicare
21979               abilify   1.88 Billion      medicaid
57270               crestor   1.78 Billion      medicare
56675               abilify   1.75 Billion      medicare
58631                plavix   1.69 Billion      medicare
58979               spiriva   1.60 Billion      medicare
57290              cymbalta   1.43 Billion      medicare
58341               namenda   1.32 Billion      medicare
56875  atorvastatin calcium   1.27 Billion      medicare


KeyError: 'Brand'

In [9]:
med.loc[:,"brand_name"] = med["brand_name"].str.lower()
med.loc[:,"generic_name"] = med["generic_name"].str.lower()

In [10]:
med = med.dropna()

In [17]:
med.drop_duplicates(subset=['brand_name', 'generic_name', 'year', 'coverage_type'], inplace=True)

In [27]:
med['coverage_type'] = med['coverage_type'].str.strip()
med['brand_name'] = med['brand_name'].str.strip()
med['generic_name'] = med['generic_name'].str.strip()

In [28]:
med.drop_duplicates(subset=['brand_name', 'generic_name', 'year', 'coverage_type'], inplace=True)

In [30]:
med[med['year']==2016].sort_values('total_spending', ascending=False)

Unnamed: 0,brand_name,generic_name,number_of_manufacturers,total_spending,total_dosage_units,total_claims,average_spending_per_dosage_unit_(weighted),average_spending_per_claim,year,coverage_type
40170,harvoni,ledipasvir/sofosbuvir,1,4.398692e+09,3.932401e+06,141670.0,1118.58,31048.86,2016,medicare
41476,revlimid,lenalidomide,1,2.661153e+09,4.895379e+06,239052.0,543.59,11132.11,2016,medicare
40469,lantus solostar,"insulin glargine,hum.rec.anlog",1,2.526126e+09,1.026791e+08,5028646.0,24.60,502.35,2016,medicare
40366,januvia,sitagliptin phosphate,1,2.440092e+09,2.036297e+08,4742650.0,11.99,514.50,2016,medicare
39497,crestor,rosuvastatin calcium,1,2.322796e+09,2.856839e+08,6012644.0,8.13,386.32,2016,medicare
...,...,...,...,...,...,...,...,...,...,...
315,aspirin ec,aspirin,3,2.056000e+01,1.529000e+03,51.0,0.01,0.40,2016,medicaid
251,antacid,mag hydrox/aluminum hyd/simeth,1,9.460000e+00,1.200000e+03,11.0,0.01,0.86,2016,medicaid
35867,camphor,camphor,1,2.440000e+00,1.191200e+02,54.0,0.02,0.05,2016,medicaid
976,daily vitamin formula-iron,multivitamin/iron/folic acid,1,2.240000e+00,5.100000e+02,17.0,0.00,0.13,2016,medicaid


In [34]:
drugs.loc[:,'Uses'] = drugs.loc[:,'Uses'].apply(lambda x: np.nan if x == "Drug uses not available" else x)

In [14]:
med[med['average_spending_per_claim'] == med['average_spending_per_claim'].max()]

Unnamed: 0,brand_name,generic_name,number_of_manufacturers,total_spending,total_dosage_units,total_claims,average_spending_per_dosage_unit_(weighted),average_spending_per_claim,year,coverage_type
21931,zolgensma,onasemnogene abeparvovec-xioi,1,59674421.27,36.59,34.0,1631028.0,1755130.04,2020,medicaid


In [33]:
med.columns

Index(['brand_name', 'generic_name', 'number_of_manufacturers',
       'total_spending', 'total_dosage_units', 'total_claims',
       'average_spending_per_dosage_unit_(weighted)',
       'average_spending_per_claim', 'year', 'coverage_type'],
      dtype='object')

In [49]:
drugs.columns = [x.lower() for x in drugs.columns.values]

In [51]:
drugs.loc[:,"generic"] = drugs["generic"].str.lower()

In [53]:
drugs['treatment'] = drugs['uses'].split(".").apply

0    This drug is used to treat mild to moderate pa...
1    This medication is used as a moisturizer to tr...
2                                                  NaN
3    This medication is used to treat various condi...
4    This medication is used to treat conditions su...
Name: uses, dtype: object

In [77]:
med[med['total_spending'] == med[med['average_spending_per_claim']<=10]['total_spending'].max()]

Unnamed: 0,brand_name,generic_name,number_of_manufacturers,total_spending,total_dosage_units,total_claims,average_spending_per_dosage_unit_(weighted),average_spending_per_claim,year,coverage_type
59625,amlodipine besylate,amlodipine besylate,27,328929800.0,1636371000.0,33300649.0,0.21,9.88,2013,medicare


In [84]:
print(drugs[drugs['brand']=="amlodipine besylate"]['uses'].values)

['Amlodipine is used with or without other medications to treat high blood pressure. Lowering high blood pressure helps prevent strokes, heart attacks, and kidney problems. Amlodipine belongs to a class of drugs known as calcium channel blockers. It works by relaxing blood vessels so blood can flow more easily. Amlodipine is also used to prevent certain types of chest pain (angina). It may help to increase your ability to exercise and decrease the frequency of angina attacks. It should not be used to treat attacks of chest pain when they occur. Use other medications (such as sublingual nitroglycerin) to relieve attacks of chest pain as directed by your doctor.']
