In [1]:
import pandas as pd
import re
import math
import numpy as np
import ast


In [2]:
gp_df = pd.read_csv('data/priority_claims.csv', sep='\t')

In [3]:
def clean_text(text):
    if isinstance(text, float):
        return text
    processed = text.lower()
    processed = re.sub(r'[#|\!|\-|\+|:|//|,|\"|\[|\]]', " ", processed)
    processed = re.sub(r"'", "", processed)
    processed = re.sub(r'"', "", processed)
    processed = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', ' ', processed)
    processed = re.sub('[\s]+', ' ', processed)
    return processed

In [4]:
gp_df["abstract_clean"] = gp_df["abstract_text"].apply(clean_text)
gp_df["title_clean"] = gp_df["title_text"].apply(clean_text)
gp_df.to_csv("priority_claims_clean.csv",index=False, sep='\t')

In [5]:
disease_syn = pd.read_excel("mortality_data/disease_synonyms.xlsx")
disease_syn = disease_syn[["ICD_code", "entry_term"]]
# list of ICD disease groups
codes = list(disease_syn.ICD_code.unique())

In [6]:
def search_patent(df, terms, ob=False):
    
    result=[]
    for term in terms:
        clean_term = clean_text(term)
        result = result + df.loc[(~df.abstract_clean.isnull()) & (df.abstract_clean.str.contains(clean_term))].publication_number.tolist()
        result = result + df.loc[(~df.title_clean.isnull()) & (df.title_clean.str.contains(clean_term))].publication_number.tolist()
        if ob:
            result = result + df.loc[(~df.mesh_clean.isnull()) & (df.mesh_clean.str.contains(clean_term))].publication_number.tolist()

    result = list(set(result))
    return result

In [7]:
# keyword search through google patent data
for c in codes:
    print("Searching patents for %s" % c)
    search_terms = disease_syn.loc[disease_syn.ICD_code == c].entry_term.values
    patents = search_patent(gp_df, search_terms)
    fname = 'data/result/%s.txt' % c
    with open(fname, 'w') as f:
        for p in patents:
            f.write("%s\n" % p)
    
    print("Wrote %s" % fname)
    print("...")

Searching patents for A00
Wrote data/result/A00.txt
...
Searching patents for A01
Wrote data/result/A01.txt
...
Searching patents for A02
Wrote data/result/A02.txt
...
Searching patents for A03
Wrote data/result/A03.txt
...
Searching patents for A04
Wrote data/result/A04.txt
...
Searching patents for A05
Wrote data/result/A05.txt
...
Searching patents for A06,A07
Wrote data/result/A06,A07.txt
...
Searching patents for A08
Wrote data/result/A08.txt
...
Searching patents for A09
Wrote data/result/A09.txt
...
Searching patents for A15,A16
Wrote data/result/A15,A16.txt
...
Searching patents for A17,A18,A19
Wrote data/result/A17,A18,A19.txt
...
Searching patents for A20
Wrote data/result/A20.txt
...
Searching patents for A21
Wrote data/result/A21.txt
...
Searching patents for A22
Wrote data/result/A22.txt
...
Searching patents for A23
Wrote data/result/A23.txt
...
Searching patents for A24
Wrote data/result/A24.txt
...
Searching patents for A25
Wrote data/result/A25.txt
...
Searching patent

  
  import sys


Wrote data/result/B01.txt
...
Searching patents for B02
Wrote data/result/B02.txt
...
Searching patents for B03
Wrote data/result/B03.txt
...
Searching patents for B04
Wrote data/result/B04.txt
...
Searching patents for B05,B06
Wrote data/result/B05,B06.txt
...
Searching patents for B07
Wrote data/result/B07.txt
...
Searching patents for B08,B09
Wrote data/result/B08,B09.txt
...
Searching patents for B15,B16,B17,B18,B19
Wrote data/result/B15,B16,B17,B18,B19.txt
...
Searching patents for B20,B21,B22,B23,B24
Wrote data/result/B20,B21,B22,B23,B24.txt
...
Searching patents for B25,B27,B30
Wrote data/result/B25,B27,B30.txt
...
Searching patents for B26
Wrote data/result/B26.txt
...
Searching patents for B33
Wrote data/result/B33.txt
...
Searching patents for B34
Wrote data/result/B34.txt
...
Searching patents for B35,B36,B37,B38,B39,B40,B41,B42,B43,B44,B45,B46,B47,B48,B49
Wrote data/result/B35,B36,B37,B38,B39,B40,B41,B42,B43,B44,B45,B46,B47,B48,B49.txt
...
Searching patents for B50,B51,B52,

In [11]:
ob_df = pd.read_csv("data/ob_patents.csv",sep='\t')
ob_df["title_clean"] = ob_df["title_text"].apply(clean_text)
ob_df["abstract_clean"] = ob_df["abstract_text"].apply(clean_text)
ob_df["mesh_clean"] = ob_df["mesh_heading"].apply(clean_text)


In [52]:
gp_df.loc[(gp_df.filing_year >= 1988) & (gp_df.filing_year < 2018)].shape

(1307665, 17)

In [9]:
gp_df.country_code.nunique()

94

In [None]:
ob_df.shape

In [None]:
both = pd.concat([gp_df, ob_df])

In [None]:
both = both.drop_duplicates("publication_number")

In [None]:
both.loc[(both.filing_year >= 1988) & (both.filing_year <=2017)].publication_number.nunique()

In [None]:
both.loc[(both.filing_year >= 1988) & (both.filing_year <=2017)].publication_number.nunique()

In [53]:
1308272 - 1307665

607

In [None]:
gp_df.groupby("country_code")["publication_number"].count().reset_index().sort_values("publication_number")

In [None]:
# keyword search through orangebook data
for c in codes:
    print("Searching patents for %s" % c)
    search_terms = disease_syn.loc[disease_syn.ICD_code == c].entry_term.values
    patents = search_patent(ob_df, search_terms, ob=True)
    fname = 'data/ob_result/%s.txt' % c
    with open(fname, 'w') as f:
        for p in patents:
            f.write("%s\n" % p)
    
    print("Wrote %s" % fname)
    print("...")

In [54]:
ob_apl_df = pd.read_csv("data/ob_apl.csv",sep='\t')
pharma_apl_df = pd.read_csv("data/pharma_apl.csv",sep='\t')
apl_df = pd.concat([ob_apl_df, pharma_apl_df])
apl_df["title_clean"] = apl_df["title_text"].apply(clean_text)
apl_df["abstract_clean"] = apl_df["abstract_text"].apply(clean_text)
apl_df = apl_df.drop_duplicates("publication_number")

In [57]:
apl_df.loc[(apl_df.filing_year>=1998) & (apl_df.filing_year <=2017) & 
           (~apl_df.publication_number.isin(gp_df.publication_number.values)) & 
           (~apl_df.publication_number.isin(ob_df.publication_number.values))].shape

(17772, 17)

In [39]:
#### keyword search through APL data
for c in codes:
    print("Searching patents for %s" % c)
    search_terms = disease_syn.loc[disease_syn.ICD_code == c].entry_term.values
    patents = search_patent(apl_df, search_terms, ob=False)
    fname = 'data/apl_result/%s.txt' % c
    with open(fname, 'w') as f:
        for p in patents:
            f.write("%s\n" % p)
    
    print("Wrote %s" % fname)
    print("...")

Searching patents for A00
Wrote data/apl_result/A00.txt
...
Searching patents for A01
Wrote data/apl_result/A01.txt
...
Searching patents for A02
Wrote data/apl_result/A02.txt
...
Searching patents for A03
Wrote data/apl_result/A03.txt
...
Searching patents for A04
Wrote data/apl_result/A04.txt
...
Searching patents for A05
Wrote data/apl_result/A05.txt
...
Searching patents for A06,A07
Wrote data/apl_result/A06,A07.txt
...
Searching patents for A08
Wrote data/apl_result/A08.txt
...
Searching patents for A09
Wrote data/apl_result/A09.txt
...
Searching patents for A15,A16
Wrote data/apl_result/A15,A16.txt
...
Searching patents for A17,A18,A19
Wrote data/apl_result/A17,A18,A19.txt
...
Searching patents for A20
Wrote data/apl_result/A20.txt
...
Searching patents for A21
Wrote data/apl_result/A21.txt
...
Searching patents for A22
Wrote data/apl_result/A22.txt
...
Searching patents for A23
Wrote data/apl_result/A23.txt
...
Searching patents for A24
Wrote data/apl_result/A24.txt
...
Searchin

  
  import sys


Wrote data/apl_result/B01.txt
...
Searching patents for B02
Wrote data/apl_result/B02.txt
...
Searching patents for B03
Wrote data/apl_result/B03.txt
...
Searching patents for B04
Wrote data/apl_result/B04.txt
...
Searching patents for B05,B06
Wrote data/apl_result/B05,B06.txt
...
Searching patents for B07
Wrote data/apl_result/B07.txt
...
Searching patents for B08,B09
Wrote data/apl_result/B08,B09.txt
...
Searching patents for B15,B16,B17,B18,B19
Wrote data/apl_result/B15,B16,B17,B18,B19.txt
...
Searching patents for B20,B21,B22,B23,B24
Wrote data/apl_result/B20,B21,B22,B23,B24.txt
...
Searching patents for B25,B27,B30
Wrote data/apl_result/B25,B27,B30.txt
...
Searching patents for B26
Wrote data/apl_result/B26.txt
...
Searching patents for B33
Wrote data/apl_result/B33.txt
...
Searching patents for B34
Wrote data/apl_result/B34.txt
...
Searching patents for B35,B36,B37,B38,B39,B40,B41,B42,B43,B44,B45,B46,B47,B48,B49
Wrote data/apl_result/B35,B36,B37,B38,B39,B40,B41,B42,B43,B44,B45,B4

In [65]:
# read the list of patents that match keyword search!
matched_patents = pd.DataFrame(columns=["ICD_code", "publication_number"])
for c in codes:
    fname = 'data/result/%s.txt' % c
    with open(fname) as f:
        patents = f.read().splitlines()
    
    searched = pd.DataFrame({"ICD_code": [c]*len(patents), "publication_number":patents})
    matched_patents = matched_patents.append(searched, ignore_index=True)
    

In [66]:
for c in codes:
    fname = 'data/ob_result/%s.txt' % c
    with open(fname) as f:
        patents = f.read().splitlines()
    
    searched = pd.DataFrame({"ICD_code": [c]*len(patents), "publication_number":patents})
    matched_patents = matched_patents.append(searched, ignore_index=True)
    

In [68]:

for c in codes:
    fname = 'data/apl_result/%s.txt' % c
    with open(fname) as f:
        patents = f.read().splitlines()
    
    searched = pd.DataFrame({"ICD_code": [c]*len(patents), "publication_number":patents})
    matched_patents = matched_patents.append(searched, ignore_index=True)
    

In [69]:
matched_patents.shape

(166608, 2)

In [70]:
matched_patents = matched_patents.drop_duplicates().reset_index(drop=True)
matched_patents = matched_patents.sort_values(by="ICD_code").reset_index(drop=True)
matched_patents.to_csv("data/disease_patent_match.csv", sep='\t', index=False)

In [71]:
# get list of patents and their families
families = pd.concat([gp_df[["publication_number","family_id"]], ob_df[["publication_number","family_id"]],
                     apl_df[["publication_number","family_id"]]])
families = families.drop_duplicates()
families= families.reset_index(drop=True)

In [72]:
# find family Ids of each matched patent
matched_patents_fam = pd.merge(left=matched_patents, right=families, on="publication_number", how="left")
# find other patents in the same family 
matched_patents_fam = pd.merge(left = matched_patents_fam, right=families, on="family_id", how="left")
matched_patents_fam = matched_patents_fam.drop("publication_number_x", axis=1)
matched_patents_fam = matched_patents_fam.drop("family_id", axis=1)
matched_patents_fam = matched_patents_fam.rename(columns={"publication_number_y":"publication_number"})
matched_patents_fam = matched_patents_fam.drop_duplicates().reset_index(drop=True)

In [73]:
del(families)

In [74]:
patent_cols = ["publication_number","country_code", "publication_date", "priority_year", "filing_year",
               "assignees_harmonized", "citations", "ipc_codes"]
all_patents = pd.concat([gp_df[patent_cols], ob_df[patent_cols], apl_df[patent_cols]])

In [75]:
all_patents = all_patents.drop_duplicates("publication_number")
all_patents = all_patents.reset_index(drop=True)
all_patents["publication_year"] = all_patents["publication_date"].apply(lambda x: int(str(x)[0:4]))

In [76]:
def count_citation(x):
    if isinstance(x, str):
        return len(x.split(','))
    else:
        return 0
all_patents["citation_count"]= all_patents["citations"].apply(count_citation)
all_patents = all_patents.drop(["publication_date"], axis=1)
matched_patents_fam=pd.merge(left = matched_patents_fam,right=all_patents, on="publication_number", how="left")

In [77]:
matched_patents_fam.to_csv("data/disease_patent_match_extended.csv", sep='\t', index=False)

In [78]:
matched_patents_fam.shape

(194070, 10)

## back...

In [80]:
matched_patents_fam.loc[(matched_patents_fam.ICD_code=="A42,A43,A48,A49") | 
                        (matched_patents_fam.ICD_code=="A44"), "ICD_code"] = "A42,A43,A44,A48,A49"

In [81]:
disease_group = pd.read_excel("mortality_data/disease_code.xlsx")
disease_custom_group = disease_group[["neglected", "custom_group", 
                                      "custom_group_code"]].drop_duplicates().reset_index(drop=True)

In [175]:
matched_patents_full = pd.merge(left = matched_patents_fam, right = disease_custom_group,
        left_on="ICD_code", right_on="custom_group_code", how="left")

In [176]:
matched_patents_full = matched_patents_full.loc[(matched_patents_full.filing_year>=1988) & 
                         (matched_patents_full.filing_year<2018)]

In [203]:
matched_patents_full = matched_patents_full.loc[(matched_patents_full.ICD_code!="A26") & 
                                                (matched_patents_full.ICD_code!="B04")].reset_index(drop=True)

In [205]:
matched_patents_full.publication_number.shape

(181845,)

In [206]:
group_count = matched_patents_full.loc[(matched_patents_full.filing_year>=1988) & 
                                      (matched_patents_full.filing_year<2018)].groupby("ICD_code")["publication_number"].count().reset_index()

In [207]:
group_count.sort_values("publication_number")

Unnamed: 0,ICD_code,publication_number
78,B87,16
20,A32,30
15,A24,31
24,A38,32
12,A21,36
43,"A87,A88,A89",84
6,"A06,A07",101
16,A27,112
38,"A75,A76,A77,A78,A79",120
76,B85,122


In [208]:
by_disease_group = matched_patents_full.groupby(["ICD_code","custom_group","neglected", "filing_year"])\
    .agg({"publication_number":"count"}).reset_index()
#by_disease_group["log_patent_count"] = by_disease_group["publication_number"].apply(math.log)
by_disease_group = by_disease_group.loc[by_disease_group.filing_year >= 1984]
by_disease_group = by_disease_group.reset_index(drop=True)
by_disease_group =  by_disease_group.rename(columns={"publication_number":"patent_count"})
by_disease_group["filing_year"] = by_disease_group["filing_year"].apply(int)

In [209]:
by_disease_group.to_csv("data/by_disease_group.csv",index=False, sep="\t")

In [210]:
# fill in missing year
by_disease_group_complete = by_disease_group.copy()

dgs = by_disease_group.ICD_code.unique()
for dg in dgs:
    dg_df = by_disease_group.loc[by_disease_group.ICD_code == dg]
    dg_start_year = max([dg_df.filing_year.min(), 1984])
    year_list = list(range(dg_start_year, 2018))
    dg_name = dg_df.custom_group.values[0]
    dg_neglected = dg_df.neglected.values[0]
    zero_row = dict({"ICD_code":dg, "custom_group":dg_name, "neglected":dg_neglected, 
                     "patent_count":0})
    for y in year_list:
        if y not in dg_df.filing_year.values:
            zero_row["filing_year"] = y
            by_disease_group_complete = by_disease_group_complete.append(zero_row, ignore_index=True)
        

In [211]:
by_disease_group.ICD_code.nunique()

80

In [212]:
by_disease_group_complete.loc[(by_disease_group_complete.ICD_code=="A04") & 
                              (by_disease_group_complete.filing_year>=1988) &
                              (by_disease_group_complete.filing_year<=2017)
                             ].patent_count.mean()

1340.5

In [213]:
by_disease_group_complete = by_disease_group_complete.sort_values(["ICD_code",  "filing_year"]).reset_index(drop=True)

In [214]:
by_disease_group_complete["log_patent_count0"] = np.log(by_disease_group_complete["patent_count"] )
by_disease_group_complete.loc[by_disease_group_complete.patent_count == 0, "log_patent_count0"] = np.NaN
by_disease_group_complete["log_patent_count1"] = np.log(by_disease_group_complete["patent_count"] + 1)

  """Entry point for launching an IPython kernel.


In [215]:
by_disease_group_complete.to_csv("data/by_disease_group_complete.csv", sep='\t', index=False)

In [216]:
by_disease_group_complete  = pd.read_csv("data/by_disease_group_complete.csv", sep='\t')

In [217]:
summ_stats = by_disease_group_complete.loc[(by_disease_group_complete.filing_year >= 1988) & 
                                           (by_disease_group_complete.filing_year < 2018)]

## get mortality data

In [None]:
mort = pd.read_csv("data/mortality_data.csv", sep="\t")
mort = pd.merge(left=mort, right=disease_group[["ICD_code", "custom_group_code"]], 
                left_on="cause", right_on="ICD_code")
mort = mort.groupby(["Year", "custom_group_code", "country"]).Deaths1.sum().reset_index()

In [None]:
mort = mort.rename(columns={"custom_group_code":"ICD_code", "Deaths1":"deaths", "Year":"year"})

In [None]:
mort.loc[mort.country=="United States of America", "country"] = "United States"

In [None]:
# fill in missing year
mort_complete = mort.copy()
#mort_complete = pd.DataFrame(columns=mort.columns)
mort_countries = mort.country.unique()
dgs = mort.ICD_code.unique()
count = 0
for mc in mort_countries:
    mc_df = mort.loc[mort.country == mc].copy()
    dgs = mc_df.ICD_code.unique()
    
    for dg in dgs:
        mc_dg = mc_df.loc[mc_df.ICD_code==dg].sort_values("year", ascending=True).reset_index(drop=True)
        if len(mc_dg) > 0:
            
            imputed_row = dict({"ICD_code":dg, "country":mc})
            for y in range(1984, 2019):
                if y not in mc_dg.year.values:
                    count = count+1
                    imputed_row["year"] = y
                    right_after = mc_dg.loc[mc_dg.year > y]
                    if len(right_after) > 0:
                        imputed_row["deaths"] = mc_dg.iloc[0]["deaths"]
                    else:
                        right_before = mc_dg.loc[mc_dg.year < y]
                        imputed_row["deaths"] = right_before.iloc[-1]["deaths"]

                    mort_complete = mort_complete.append(imputed_row, ignore_index=True)



In [None]:
mort = mort_complete


In [None]:
mort["total_deaths"] = mort.groupby(["year", "ICD_code"])["deaths"].transform("sum")

In [None]:
mort["deaths"] = mort["deaths"].apply(float)
mort["total_deaths"] = mort["total_deaths"].apply(float)
mort["death_share"] = mort["deaths"] / mort["total_deaths"]

In [218]:
mort = pd.read_csv("data/mortality_by_disease.csv", sep="\t")

In [219]:
mort_agg  = mort.groupby(["year","ICD_code"])["deaths"].sum().reset_index()

In [220]:
mort_agg["total_deaths"] = mort_agg.groupby("year")["deaths"].transform("sum")
mort_agg["prevalence"] = mort_agg["deaths"] / mort_agg["total_deaths"]

In [221]:
mort_agg = mort_agg[["year", "ICD_code", "prevalence"]]

In [222]:
mort.to_csv("data/mortality_by_disease.csv", sep="\t", index=False)

## Get population data

In [223]:
pop = pd.read_csv("population.csv")

In [224]:
pop = pop.loc[~pop["Series Code"].isnull()]

In [225]:
pop = pop.drop(["Series Name", "Series Code"], axis=1)\
    .rename(columns={"Country Name":"country", "Country Code":"country_code"})

In [226]:
pop = pop.drop("2018 [YR2018]",axis=1)

In [227]:
pop_world = pop.loc[pop.country == "World"]
pop = pop.loc[:216]

In [228]:
pop_world = pd.melt(pop_world, id_vars=["country", "country_code"])
pop = pd.melt(pop, id_vars=["country", "country_code"])

In [229]:
pop_world = pop_world.rename(columns={"variable": "year", "value":"population"})
pop = pop.rename(columns={"variable": "year", "value":"population"})


In [230]:
pop_world["year"] = pop_world["year"].apply(lambda x: int(str(x)[0:4]))
pop["year"] = pop["year"].apply(lambda x: int(str(x)[0:4]))

In [231]:
pop.loc[pop.country=="Eswatini", "country"] = "Swaziland"


In [232]:
pop_world = pop_world.drop(["country", "country_code"], axis=1)
pop = pop.drop("country_code", axis=1)
pop_world = pop_world.rename(columns={"population":"world_population"})

In [233]:
pop = pop.loc[pop.population !=  ".."]

In [234]:
pop = pd.merge(left=pop, right=pop_world, on="year",how="left")
pop["population"] = pop["population"].apply(float)
pop["world_population"] = pop["world_population"].apply(float)

pop["population_share"] = pop["population"] / pop["world_population"]

In [235]:
pop = pop.sort_values(["country", "year"], ascending=True).reset_index(drop=True)

In [None]:
pop.to_csv("data/population.csv", sep="\t", index=False)

In [None]:
pop=pd.read_csv("data/population.csv", sep="\t")

In [274]:
pop

Unnamed: 0,country,year,population,world_population,population_share
0,Afghanistan,1984,12047115.0,4.763043e+09,0.002529
1,Afghanistan,1985,11783050.0,4.846338e+09,0.002431
2,Afghanistan,1986,11601041.0,4.932114e+09,0.002352
3,Afghanistan,1987,11502761.0,5.020001e+09,0.002291
4,Afghanistan,1988,11540888.0,5.108813e+09,0.002259
5,Afghanistan,1989,11777609.0,5.197758e+09,0.002266
6,Afghanistan,1990,12249114.0,5.288103e+09,0.002316
7,Afghanistan,1991,12993657.0,5.375489e+09,0.002417
8,Afghanistan,1992,13981231.0,5.459754e+09,0.002561
9,Afghanistan,1993,15095099.0,5.544873e+09,0.002722


## Get Income Level data

In [236]:
income_df = pd.read_excel("income/OGHIST.xls", sheet_name="2002")

## get CL episodes

In [237]:
cl_ep = pd.read_excel("CL_episodes.xlsx")

In [238]:
cl_ep.country.unique()

array(['Brazil', 'USA', 'Germany', 'Canada', 'South Africa',
       'Dominican Republic', 'South Korea', 'Zimbabwe', 'Ecuador',
       'Rwanda', 'Malaysia', 'Indonesia', 'Mozambique', 'Zambia',
       'Swaziland', 'Philippines', 'Chile', 'Taiwan', 'Cameroon',
       'Eretria', 'Guinea', 'Ghana', 'Italy', 'China', 'India',
       'Thailand', 'Colombia'], dtype=object)

In [239]:
cl_ep.groupby(["year", "country"]).count().shape

(48, 6)

In [240]:
cl_ep = cl_ep.loc[~cl_ep.disease_group.isnull()]

In [241]:
cl_ep = cl_ep[["year", "country", "disease_group", "outcome"]]

In [242]:
cl_ep.head()

Unnamed: 0,year,country,disease_group,outcome
0,2001,Brazil,"B20,B21,B22,B23,B24",discount
1,2001,Brazil,"B20,B21,B22,B23,B24",discount
2,2001,Brazil,"B20,B21,B22,B23,B24",discount
3,2001,USA,A01||A02||A04||A09||A20||A22||A54,discount
4,2001,Germany,"B20,B21,B22,B23,B24",CL


In [243]:
cl_ep_unstacked = pd.DataFrame(columns=["year", "country", "ICD_code", "outcome"])
for i, row in cl_ep.iterrows():
    new_row=dict()
    new_row["year"] = row["year"]
    new_row["country"] = row["country"]
    new_row["outcome"] = row["outcome"]
    dgs = row["disease_group"].split("||")
    for dg in dgs:
        new_row["ICD_code"] = dg
        cl_ep_unstacked = cl_ep_unstacked.append(new_row, ignore_index=True)
    

In [244]:
cl_ep_unstacked["CL"] = 0
cl_ep_unstacked.loc[cl_ep_unstacked.outcome=="CL","CL"] = 1

cl_ep_unstacked["discount"] = 0
cl_ep_unstacked.loc[cl_ep_unstacked.outcome=="discount","discount"] = 1

In [245]:
cl_ep_unstacked["request_cumulative"] = cl_ep_unstacked.groupby(["ICD_code"]).cumcount() + 1
cl_ep_unstacked["CL_cumulative"] = cl_ep_unstacked.groupby(["ICD_code"])["CL"].cumsum()
cl_ep_unstacked["discount_cumulative"] = cl_ep_unstacked.groupby(["ICD_code"])["discount"].cumsum()


In [246]:
cl_ep_unstacked.loc[cl_ep_unstacked.country=="USA", "country"] = "United States"
cl_ep_unstacked.loc[cl_ep_unstacked.country=="Eretria", "country"] = "Eritrea"

cl_ep_unstacked.to_csv("data/cl_ep_unstacked.csv", sep="\t", index=False)

In [247]:
cl_ep_cumulative = cl_ep_unstacked.groupby(["ICD_code","year"]).agg({"request_cumulative":"max",
                                                "CL_cumulative":"max",
                                                "discount_cumulative":"max"
                                               }).reset_index()

In [248]:
def find_cum_country(row, eps, outcome):
    search_code = row["ICD_code"]
    search_year = row["year"]
    # all countries that have requested so far
    if outcome == "request":
        countries = eps.loc[(eps.ICD_code == search_code) & (eps.year <= search_year)].country.unique()
    # all countries that have issued CLs so far
    if outcome == "CL":
        countries = eps.loc[(eps.ICD_code == search_code) & (eps.year <= search_year) & 
                           (eps.outcome == "CL")].country.unique()
    if outcome == "discount":
    # all countries that have issued discounts so far
        countries = eps.loc[(eps.ICD_code == search_code) & (eps.year <= search_year) & 
                           (eps.outcome == "discount")].country.unique()
    
    countries.sort()
    return countries

In [249]:
cl_ep_cumulative["request_country"] = cl_ep_cumulative.apply(lambda x: find_cum_country(x,cl_ep_unstacked, "request"),
                                                             axis=1)

In [250]:
cl_ep_cumulative["CL_country"] = cl_ep_cumulative.apply(lambda x: find_cum_country(x,cl_ep_unstacked, "CL"),
                                                             axis=1)

In [251]:
cl_ep_cumulative["discount_country"] = cl_ep_cumulative.apply(lambda x: find_cum_country(x,cl_ep_unstacked, "discount"),
                                                             axis=1)

In [252]:
cl_ep_cumulative.to_csv("data/cl_ep_cumulative.csv", sep='\t', index=False)

In [253]:
# if there is a missing year after the first issuance of CL, replicate the previous row
cl_ep_cumulative_complete = pd.DataFrame(columns=cl_ep_cumulative.columns)
codes = cl_ep_cumulative.ICD_code.unique()

for c in codes:
    c_cum = cl_ep_cumulative.loc[cl_ep_cumulative.ICD_code == c].copy()
    year = c_cum.year.min()
    for y in range(year, 2019):
        if y in c_cum.year.values:
            row=c_cum.loc[c_cum.year==y].copy()
        else:
            row["year"] = y
            
        cl_ep_cumulative_complete = cl_ep_cumulative_complete.append(row, ignore_index=True)
    

In [254]:
cl_ep_cumulative_complete

Unnamed: 0,ICD_code,year,request_cumulative,CL_cumulative,discount_cumulative,request_country,CL_country,discount_country
0,A01,2001,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
1,A01,2002,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
2,A01,2003,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
3,A01,2004,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
4,A01,2005,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
5,A01,2006,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
6,A01,2007,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
7,A01,2008,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
8,A01,2009,2,0,2,"[Canada, United States]",[],"[Canada, United States]"
9,A01,2010,2,0,2,"[Canada, United States]",[],"[Canada, United States]"


In [255]:
cl_ep_cumulative_complete.to_csv("data/cl_ep_cumulative_complete.csv", sep='\t', index=False)

# combine patent data with CL data

In [256]:
patent_CL_df = pd.merge(left = by_disease_group_complete, right=cl_ep_cumulative_complete, 
                        left_on=["ICD_code", "filing_year"], right_on=["ICD_code", "year"], how="left"
                        ).drop("year", axis=1)

In [257]:
patent_CL_df[["request_cumulative", "CL_cumulative", "discount_cumulative"]] = patent_CL_df[["request_cumulative", "CL_cumulative", "discount_cumulative"]].fillna(0)

In [258]:
request_treated = cl_ep_cumulative_complete.loc[cl_ep_cumulative_complete.request_cumulative > 0].ICD_code.unique()
CL_treated = cl_ep_cumulative_complete.loc[cl_ep_cumulative_complete.CL_cumulative > 0].ICD_code.unique()
discount_treated = cl_ep_cumulative_complete.loc[cl_ep_cumulative_complete.discount_cumulative > 0].ICD_code.unique()
patent_CL_df["request_treated"] = 0
patent_CL_df["CL_treated"] = 0
patent_CL_df["discount_treated"] = 0
patent_CL_df.loc[patent_CL_df.ICD_code.isin(request_treated), "request_treated"] = 1
patent_CL_df.loc[patent_CL_df.ICD_code.isin(CL_treated), "CL_treated"] = 1
patent_CL_df.loc[patent_CL_df.ICD_code.isin(discount_treated), "discount_treated"] = 1


In [259]:
patent_CL_df = patent_CL_df.reset_index()
patent_CL_df = patent_CL_df.rename(columns={"index":"orig_index"})

In [260]:
def impute_market_data(row, field):
    search_code = row["ICD_code"]
    search_year = row["year"]
    search_country = row["country"]
    
    #outcomes
    imputed = np.nan
    
    if (field == "deaths") | (field=="death_share"): 
        match_mort = mort.loc[(mort.ICD_code == search_code) & (mort.country == search_country)].copy()

        # in mortality data, search first row after or last row before
        if len(match_mort) > 0:
            print(search_code, search_year, search_country)
            match_mort = match_mort.sort_values("year", ascending=True).reset_index(drop=True)

            mort_right_after = match_mort.loc[match_mort.year > search_year]
            if len(mort_right_after) > 0: 
                imputed = mort_right_after.iloc[0][field]

            mort_right_before = match_mort.loc[match_mort.year < search_year]
            if len(mort_right_before) > 0:
                imputed = mort_right_before.iloc[-1][field]
    
    if (field == "population_share"):
        match_pop = pop.loc[(pop.country == search_country)].copy()
    
        if len(match_pop) > 0:
            match_pop = match_pop.sort_values("year", ascending=True).reset_index(drop=True)

            pop_right_after = match_pop.loc[match_pop.year > search_year]
            if len(pop_right_after) > 0:
                imputed = pop_right_after.iloc[0][field]

            pop_right_before = match_pop.loc[match_pop.year < search_year]
            if len(pop_right_before) > 0:
                imputed = pop_right_before.iloc[-1][field]
    
    return imputed


In [261]:
def get_country_weight(country_field):
    country_weight = pd.DataFrame(columns=["orig_index", "ICD_code", "year", "country"])
    for i, row in patent_CL_df.iterrows():
        new_row=dict()
        new_row["ICD_code"] = row["ICD_code"]
        new_row["year"] = row["filing_year"]
        new_row["orig_index"] = row["orig_index"]
        countries = row[country_field]
        if isinstance(countries, float):
            continue
        if len(countries) == 0:
            continue
        if isinstance(countries, str):
            countries = ast.literal_eval(countries)
        
        for c in countries:
            new_row["country"] = c
            country_weight = country_weight.append(new_row, ignore_index=True)
            
    # add mortality weight
    country_weight = pd.merge(left = country_weight, right=mort[["year", "ICD_code", "country", "deaths", "death_share"]], 
                            on=["year", "ICD_code", "country"], how="left")
    # add population weight 
    country_weight = pd.merge(left = country_weight, right=pop[["year", "country", "population_share"]],
                          on=["year", "country"], how="left")
    
    # add income cataegory 
    country_weight = pd.merge(left = country_weight, right = income_df[["country", "income_category"]], 
                              on="country", how="left")

    income_dummies = pd.get_dummies(country_weight.income_category)
    country_weight = pd.concat([country_weight, income_dummies], axis=1)
    #impute missing data
    country_weight.loc[country_weight.deaths.isnull(), "deaths"] = \
        country_weight.loc[country_weight.deaths.isnull()].apply(lambda x: impute_market_data(x, field="deaths") ,
                                                                 axis=1)
    country_weight.loc[country_weight.death_share.isnull(), "death_share"] = \
        country_weight.loc[country_weight.death_share.isnull()].apply(lambda x: impute_market_data(x, field="death_share") ,
                                                                 axis=1)
    country_weight.loc[country_weight.population_share.isnull(), "population_share"] = \
        country_weight.loc[country_weight.population_share.isnull()].apply(lambda x: impute_market_data(x, field="population_share") ,
                                                                 axis=1)

    country_weight.loc[country_weight.deaths.isnull(), "deaths"] = 0
    country_weight.loc[country_weight.death_share.isnull(), "death_share"] = 0
    country_weight.loc[country_weight.population_share.isnull(), "population_share"] = 0


    weight_dict=dict()
    weight_dict["deaths"] = "sum"
    weight_dict["death_share"] = "sum"
    weight_dict["population_share"] = "sum"
    print(country_weight.income_category.unique())
    for ic in country_weight.income_category.unique():
        
        death_interacted = "deaths" + "_" + ic
        country_weight[death_interacted] = country_weight["deaths"] *country_weight[ic]
        weight_dict[death_interacted] = "sum"
        
        death_share_interacted = "death_share" + "_" + ic
        country_weight[death_share_interacted] = country_weight["death_share"] *country_weight[ic]
        weight_dict[death_share_interacted] = "sum"
        
        population_share_interacted = "population_share" + "_" + ic
        country_weight[population_share_interacted] = country_weight["population_share"] *country_weight[ic]
        weight_dict[population_share_interacted] = "sum"

        
    country_weight = country_weight.groupby("orig_index").agg(weight_dict).reset_index()
    
    col_label = country_field[:-7]
    new_names = dict()
    for name in country_weight.columns[1::]:
        new_names[name] = col_label+name
    country_weight = country_weight.rename(columns=new_names)
    
    
    return country_weight

In [262]:
request_weight = get_country_weight("request_country")
CL_weight = get_country_weight("CL_country")
discount_weight = get_country_weight("discount_country")
patent_CL_df = pd.merge(left = patent_CL_df, right = request_weight, 
                        on="orig_index", how="left")
patent_CL_df = pd.merge(left = patent_CL_df, right = CL_weight, 
                        on="orig_index", how="left")
patent_CL_df = pd.merge(left = patent_CL_df, right = discount_weight, 
                        on="orig_index", how="left")


['H' 'LM' 'UM' 'L']
['H' 'LM' 'UM' 'L']
['H' 'LM']


In [263]:
patent_CL_df = patent_CL_df.drop(["orig_index","request_country", "CL_country", "discount_country"], axis=1)
patent_CL_df.loc[:, (patent_CL_df.columns != 'log_patent_count0')] = patent_CL_df.loc[:, (patent_CL_df.columns != 'log_patent_count0')].fillna(0)


In [None]:
patent_CL_df.loc[patent_CL_df.request_treated == 1].disease_code.unique()

In [None]:
patent_CL_df.loc[(patent_CL_df.request_treated == 1) & (patent_CL_df.neglected ==1)].disease_code.unique()

In [None]:
patent_CL_df.loc[ (patent_CL_df.neglected ==1)].disease_code.unique()

## add overall disease prevalence for each year

In [264]:
patent_CL_df = pd.merge(left=patent_CL_df, right=mort_agg, left_on=["ICD_code", "filing_year"],
                        right_on=["ICD_code", "year"], how="left")

In [265]:
patent_CL_df.loc[patent_CL_df.prevalence.isnull(), "prevalence"] = 0
patent_CL_df = patent_CL_df.drop("year", axis=1)


In [266]:
patent_CL_df["min_year"] = patent_CL_df.groupby("ICD_code")["filing_year"].transform("min")

In [267]:
patent_CL_df = patent_CL_df.rename(columns={"ICD_code":"disease_code", "custom_group":"disease_group",
                                           "filing_year":"year"})

In [268]:
patent_CL_df["neglected"] = patent_CL_df["neglected"].apply(int)

In [269]:
patent_CL_df.to_csv("data/patent_CL_df.csv", sep='\t', index=False)

In [270]:
patent_CL_df.loc[patent_CL_df.CL_treated==1,"disease_code"].unique()

array(['A04', 'B15,B16,B17,B18,B19', 'B20,B21,B22,B23,B24'], dtype=object)

In [271]:
patent_CL_df.loc[patent_CL_df.CL_treated==1,"disease_group"].unique()

array(['Other bacterial intestinal infections', 'Viral hepatitis',
       'Human immunodeficiency virus [HIV] disease'], dtype=object)

In [287]:
patent_CL_df.loc[(patent_CL_df.request_treated==1) & (patent_CL_df.neglected==1)].disease_group.nunique()

3

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,10))
dg="B20,B21,B22,B23,B24"
dg_label = by_disease_group.loc[by_disease_group.ICD_code==dg].custom_group.unique()
dg_patent = by_disease_group.loc[by_disease_group.ICD_code==dg]
dg_patent = dg_patent.loc[(dg_patent.publication_year >= 1990)]
x=dg_patent.publication_year
y=dg_patent.log_patent_count
plt.plot(x,y, label=dg_label)
plt.axvline(x=1995, color="red", label="TRIPS")
plt.axvline(x=2002, color="green", label="Doha")
CL = disease_group.loc[disease_group.custom_group_code == dg]["CL_year"].unique()
if CL is not None:
    plt.axvline(x=CL, color="orange", label="CL")

plt.legend()

## HIV data

In [None]:
hiv_df = matched_patents_full.loc[matched_patents_full.custom_group_code == "B20,B21,B22,B23,B24"]

In [None]:
hiv_df = hiv_df.reset_index(drop=True)

In [None]:
hiv_sub= pd.DataFrame(columns=["publication_number", "filing_year", "subclass"])
for i, row in hiv_df.iterrows(): 
    if isinstance(row["ipc_codes"], str):
        ipcs = row["ipc_codes"].split(",")
        new_row = dict()
        new_row["publication_number"] = row["publication_number"]
        new_row["filing_year"] = row["filing_year"]
        for i in ipcs:
            new_row["subclass"] = i
            hiv_sub = hiv_sub.append(new_row, ignore_index=True)

In [None]:
hiv_sub["class"] = hiv_sub["subclass"].apply(lambda x: x[:x.index("/")])

In [None]:
hiv_sub["subclass"] = hiv_sub["class"]
hiv_sub = hiv_sub.drop("class", axis=1)

In [None]:
hiv_sub.to_csv("data/hiv_sub.csv", sep="\t", index=False)

In [None]:
major = hiv_sub.groupby("subclass")["filing_year"].count().reset_index()
major = test.sort_values("filing_year", ascending=False).reset_index(drop=True)

In [None]:
major = major.loc[major.filing_year>=100]

In [None]:
major.shape

In [None]:
by_subclass = hiv_sub.loc[hiv_sub.subclass.isin(major.subclass.values)].groupby(["filing_year", "subclass"])\
    .agg({"publication_number":"count"}).reset_index()
by_subclass = by_subclass.loc[by_subclass.filing_year >= 1988]
by_subclass = by_subclass.reset_index(drop=True)
by_subclass =  by_subclass.rename(columns={"publication_number":"patent_count"})
by_subclass["filing_year"] = by_subclass["filing_year"].apply(int)

In [None]:
by_subclass

In [None]:
# fill in missing year

subs = by_subclass.subclass.unique()
for sub in subs:
    sub_df = by_subclass.loc[by_subclass.subclass == sub]
    start_year = max([sub_df.filing_year.min(), 1988])
    year_list = list(range(start_year, 2019))
    zero_row = dict({"subclass":sub,"patent_count":0})
    for y in year_list:
        if y not in sub_df.filing_year.values:
            zero_row["filing_year"] = y
            by_subclass = by_subclass.append(zero_row, ignore_index=True)
        

In [None]:
by_subclass.shape

In [None]:
by_subclass.loc[by_subclass.patent_count==0].shape

In [None]:
by_subclass["log_patent_count0"] = np.log(by_subclass["patent_count"] )
by_subclass.loc[by_subclass.patent_count == 0, "log_patent_count0"] = np.NaN
by_subclass["log_patent_count1"] = np.log(by_subclass["patent_count"] + 1)

In [None]:
hiv_cl_df = pd.merge(left = by_subclass, 
                     right=cl_ep_cumulative_complete.loc[cl_ep_cumulative_complete.ICD_code=="B20,B21,B22,B23,B24"], 
                        left_on="filing_year", right_on="year", how="left"
                        ).drop("year", axis=1)

In [None]:
hiv_cl_df[["request_cumulative", "CL_cumulative", "discount_cumulative"]] = hiv_cl_df[["request_cumulative", "CL_cumulative", "discount_cumulative"]].fillna(0)

In [None]:
hiv_cl_df = hiv_cl_df.drop("request_cumulative", axis=1)
hiv_CL_treated = hiv_cl_df.loc[hiv_cl_df.CL_cumulative > 0].subclass.unique()
hiv_discount_treated = hiv_cl_df.loc[hiv_cl_df.discount_cumulative > 0].subclass.unique()
hiv_cl_df["CL_treated"] = 0
hiv_cl_df["discount_treated"] = 0
hiv_cl_df.loc[hiv_cl_df.subclass.isin(hiv_CL_treated), "CL_treated"] = 1
hiv_cl_df.loc[hiv_cl_df.subclass.isin(hiv_discount_treated), "discount_treated"] = 1


In [None]:
hiv_cl_df


In [None]:
hiv_weight = patent_CL_df.loc[patent_CL_df.disease_code=="B20,B21,B22,B23,B24"]

In [None]:
hiv_weight[["disease_"]]