In [1]:
import pandas as pd

df = pd.read_csv('results/asco_articles_with_company_name_variants.csv')

In [2]:
# for countries column, if there are two capital letters (with or without dot) which means it is a US state, replace it with USA
df['countries'] = df['countries'].apply(lambda x: '; '.join(['USA' if isinstance(part.strip(), str) and 
                                                            ((len(part.strip()) == 2 and part.strip().isupper()) or  # handles "NY"
                                                             (len(part.strip()) == 3 and part.strip().endswith('.') and part.strip()[:2].isupper()))  # handles "NY."
                                                            else part.strip() 
                                                            for part in str(x).split(';')]) if isinstance(x, str) else x)

# Standardize China variations to China
df['countries'] = df['countries'].apply(lambda x: '; '.join(['China' if isinstance(part.strip(), str) and 'china' in part.strip().lower()
                                                            else part.strip()
                                                            for part in str(x).split(';')]) if isinstance(x, str) else x)

# remove trailing dots from country names
df['countries'] = df['countries'].apply(lambda x: '; '.join([part.strip().rstrip('.') 
                                                            for part in str(x).split(';')]) if isinstance(x, str) else x)


# new column named "multiple_nationality" if country >=3, and main_country with the most frequent country
df['multiple_nationality'] = df['countries'].apply(lambda x: 1 if len(set(country.strip() for country in str(x).split(';'))) > 2 else 0)
df['main_country'] = df['countries'].apply(lambda x: max(
    [country.strip() for country in str(x).split(';')], 
    key=lambda k: str(x).split(';').count(k.strip())
) if isinstance(x, str) else x)

# remove Author: None from author_disclosures
df['author_disclosures'] = df['author_disclosures'].apply(lambda x: [
    disclosure 
    for disclosure in (eval(x) if isinstance(x, str) else x) 
    if disclosure['Author'] != 'None'
])

# new column named "no_total_authors" by counting ";"
df['no_total_authors'] = df['authors'].apply(lambda x: x.count(';') + 1 if isinstance(x, str) else 0)


In [3]:
# create a new column named "num_author_with_any_coi"
df['num_author_with_any_coi'] = 0
for idx, row in df.iterrows():
    matched_authors = set() 

    try:
        author_disclosures = eval(row['author_disclosures'])
        company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
        
        for author in author_disclosures:
            if any(company in disclosure.split(':', 1)[1].strip()
                  for disclosure in author['Disclosures']
                  for company in company_variants
                  if company):
                matched_authors.add(author['Author'])  # Add author to set if they have a match
    except:
        pass

    df.at[idx, 'num_author_with_any_coi'] = len(matched_authors) 



In [4]:
# create a new column named "coi_employment"
df['coi_employment'] = 0

for idx, row in df.iterrows():
    employment_list = []
    match_count = 0
    
    # First get employment list
    try:
        author_disclosures = eval(row['author_disclosures'])  # Convert string to list if needed
        for author in author_disclosures:
            for disclosure in author['Disclosures']:
                if disclosure.lower().startswith('employment'):
                    employer = disclosure.split(':', 1)[1].strip()
                    # Split on comma and handle each employer separately
                    for employer in employer.split(','):
                        employment_list.append(employer.strip())
        
        # Count exact matches with company variants
        if employment_list: 
            company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
            for employer in employment_list:
                if employer in company_variants:
                    match_count += 1
    except:
        pass
    
    df.at[idx, 'coi_employment'] = match_count

In [5]:
# create a new column named "coi_advisory_consulting"
df['coi_advisory_consulting'] = 0

for idx, row in df.iterrows():
    advisory_consulting_list = []
    match_count = 0
    
    # First get employment list
    try:
        author_disclosures = eval(row['author_disclosures']) 
        for author in author_disclosures:
            for disclosure in author['Disclosures']:
                if disclosure.lower().startswith('consulting') or disclosure.lower().startswith('consultant'):
                    advisory_company = disclosure.split(':', 1)[1].strip()
                    for company in advisory_company.split(','):
                        advisory_consulting_list.append(company.strip())
        
        if advisory_consulting_list: 
            company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
            for employer in advisory_consulting_list:
                if employer in company_variants:
                    match_count += 1
    except:
        pass
    
    df.at[idx, 'coi_advisory_consulting'] = match_count

In [6]:
# create a new column named "coi_speakers_bureau"
df['coi_speakers_bureau'] = 0

for idx, row in df.iterrows():
    speaker_bureau_list = []
    match_count = 0
    
    # First get employment list
    try:
        author_disclosures = eval(row['author_disclosures']) 
        for author in author_disclosures:
            for disclosure in author['Disclosures']:
                if disclosure.lower().startswith('speaker'):
                    speaker_company = disclosure.split(':', 1)[1].strip()
                    for company in speaker_company.split(','):
                        speaker_bureau_list.append(company.strip())
        
        if speaker_bureau_list: 
            company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
            for employer in speaker_bureau_list:
                if employer in company_variants:
                    match_count += 1
    except:
        pass
    
    df.at[idx, 'coi_speakers_bureau'] = match_count

In [7]:
# create a new column named "coi_honoraria"
df['coi_honoraria'] = 0

for idx, row in df.iterrows():
    honoraria_list = []
    match_count = 0
    
    # First get employment list
    try:
        author_disclosures = eval(row['author_disclosures']) 
        for author in author_disclosures:
            for disclosure in author['Disclosures']:
                if disclosure.lower().startswith('honoraria'):
                    honoraria_company = disclosure.split(':', 1)[1].strip()
                    for company in honoraria_company.split(','):
                        honoraria_list.append(company.strip())
        
        if honoraria_list: 
            company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
            for employer in honoraria_list:
                if employer in company_variants:
                    match_count += 1
    except:
        pass
    
    df.at[idx, 'coi_honoraria'] = match_count

In [8]:
# focus on first and last author COIs
df['first_author_coi'] = df['author_disclosures'].apply(lambda x: 
    eval(x)[0]['Disclosures'] if isinstance(x, str) and len(eval(x)) > 0 
    else x[0]['Disclosures'] if isinstance(x, list) and len(x) > 0 
    else None)

df['last_author_coi'] = df['author_disclosures'].apply(lambda x: 
    eval(x)[-1]['Disclosures'] if isinstance(x, str) and len(eval(x)) > 0 
    else x[-1]['Disclosures'] if isinstance(x, list) and len(x) > 0 
    else None)

In [9]:
# count all types of coi for first and last authors
df['first_author_coi_all'] = 0
df['last_author_coi_all'] = 0
for idx, row in df.iterrows():
    match_count_first_author = 0
    match_count_last_author = 0
    
    try:
        first_author_disclosures = row['first_author_coi']
        last_author_disclosures = row['last_author_coi']
        
        company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
        
        if any(company in str(disclosure) 
              for disclosure in first_author_disclosures
              for company in company_variants
              if company):
            match_count_first_author = 1

        if any(company in str(disclosure) 
              for disclosure in last_author_disclosures
              for company in company_variants
              if company):
            match_count_last_author = 1
    except:
        pass
    
    df.at[idx, 'first_author_coi_all'] = match_count_first_author
    df.at[idx, 'last_author_coi_all'] = match_count_last_author

In [10]:
# COI employment for first and last author
df['coi_employment_first_author'] = 0
df['coi_employment_last_author'] = 0

for idx, row in df.iterrows():
    match_count_first_author = 0
    match_count_last_author = 0
    
    try:
        first_author_disclosures = row['first_author_coi']
        last_author_disclosures = row['last_author_coi']
        
        company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
        
        # Check employment matches for first author
        if any(company in str(disclosure) 
              for disclosure in first_author_disclosures
              for company in company_variants
              if disclosure.lower().startswith('employment') and company):
            match_count_first_author = 1

        # Check employment matches for last author
        if any(company in str(disclosure) 
              for disclosure in last_author_disclosures
              for company in company_variants
              if disclosure.lower().startswith('employment') and company):
            match_count_last_author = 1
    except:
        pass
    
    df.at[idx, 'coi_employment_first_author'] = match_count_first_author
    df.at[idx, 'coi_employment_last_author'] = match_count_last_author

In [11]:
# COI advisory for first and last author
df['coi_advisory_first_author'] = 0
df['coi_advisory_last_author'] = 0

for idx, row in df.iterrows():
    match_count_first_author = 0
    match_count_last_author = 0
    
    try:
        first_author_disclosures = row['first_author_coi']
        last_author_disclosures = row['last_author_coi']
        
        company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
        
        # Check consulting matches for first author
        if any(company in str(disclosure) 
              for disclosure in first_author_disclosures
              for company in company_variants
              if (disclosure.lower().startswith('consulting') or disclosure.lower().startswith('consultant')) and company):
            match_count_first_author = 1

        # Check consulting matches for last author
        if any(company in str(disclosure) 
              for disclosure in last_author_disclosures
              for company in company_variants
              if (disclosure.lower().startswith('consulting') or disclosure.lower().startswith('consultant')) and company):
            match_count_last_author = 1
    except:
        pass
    
    df.at[idx, 'coi_advisory_first_author'] = match_count_first_author
    df.at[idx, 'coi_advisory_last_author'] = match_count_last_author

In [12]:
# COI speakers bureau for first and last author
# Early studies did not have speakers bureau, prior to 3/1/2015
df['coi_speakers_first_author'] = 0
df['coi_speakers_last_author'] = 0

for idx, row in df.iterrows():
    match_count_first_author = 0
    match_count_last_author = 0
    
    try:
        first_author_disclosures = row['first_author_coi']
        last_author_disclosures = row['last_author_coi']
        
        company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
        
        # Check speaker matches for first author
        if any(company in str(disclosure) 
              for disclosure in first_author_disclosures
              for company in company_variants
              if disclosure.lower().startswith('speaker') and company):
            match_count_first_author = 1

        # Check speaker matches for last author
        if any(company in str(disclosure) 
              for disclosure in last_author_disclosures
              for company in company_variants
              if disclosure.lower().startswith('speaker') and company):
            match_count_last_author = 1
    except:
        pass
    
    df.at[idx, 'coi_speakers_first_author'] = match_count_first_author
    df.at[idx, 'coi_speakers_last_author'] = match_count_last_author

In [13]:
# COI honoraria for first and last author
df['coi_honoraria_first_author'] = 0
df['coi_honoraria_last_author'] = 0

for idx, row in df.iterrows():
    match_count_first_author = 0
    match_count_last_author = 0
    
    try:
        first_author_disclosures = row['first_author_coi']
        last_author_disclosures = row['last_author_coi']
        
        company_variants = eval(row['company_name_variants']) if isinstance(row['company_name_variants'], str) else row['company_name_variants']
        
        # Check honoraria matches for first author
        if any(company in str(disclosure) 
              for disclosure in first_author_disclosures
              for company in company_variants
              if disclosure.lower().startswith('honoraria') and company):
            match_count_first_author = 1

        # Check honoraria matches for last author
        if any(company in str(disclosure) 
              for disclosure in last_author_disclosures
              for company in company_variants
              if disclosure.lower().startswith('honoraria') and company):
            match_count_last_author = 1
    except:
        pass
    
    df.at[idx, 'coi_honoraria_first_author'] = match_count_first_author
    df.at[idx, 'coi_honoraria_last_author'] = match_count_last_author

In [14]:
df.head()

Unnamed: 0,title,authors,countries,abstract,publication_date,doi,author_disclosures,product_name,company_name,company_in_the_list,...,first_author_coi_all,last_author_coi_all,coi_employment_first_author,coi_employment_last_author,coi_advisory_first_author,coi_advisory_last_author,coi_speakers_first_author,coi_speakers_last_author,coi_honoraria_first_author,coi_honoraria_last_author
0,Adjuvant Nivolumab in High-Risk Muscle-Invasiv...,Matthew D. Galsky; Johannes Alfred Witjes; J...,USA; the Netherlands; Germany; USA; Romania; S...,Clinical trials frequently include multiple en...,1/1/2025,JCO.24.00340,"[{'Author': 'Matthew D. Galsky', 'Disclosures'...",nivolumab,Bristol Myers Squibb,True,...,1,1,0,0,0,1,0,0,0,0
1,Impact of an Epstein-Barr Virus Serology-Based...,Wen-Jie Chen; Xia Yu; Yu-Qiang Lu; Ruth M. ...,China; China; China; China; USA; China; China;...,Purpose: Screening for nasopharyngeal carcinom...,1/1/2025,JCO.23.01296,"[{'Author': 'Zhiwei Liu', 'Disclosures': ['Emp...",,,False,...,0,0,0,0,0,0,0,0,0,0
2,Single or Double Induction With 7 + 3 Containi...,Christoph RÃ¶llig; BjÃ¶rn Steffen; Christoph...,Germany; Germany; Germany; Germany; Germany; G...,Purpose: To determine the optimal daunorubicin...,1/1/2025,JCO.24.00235,"[{'Author': 'Christoph RÃ¶llig', 'Disclosures'...",daunorubicin,,False,...,0,0,0,0,0,0,0,0,0,0
3,Menin Inhibition With Revumenib for KMT2A-Rear...,Ghayas C. Issa; Ibrahim Aldoss; Michael J. ...,USA; USA; USA; USA; USA; USA; USA; USA; USA; U...,"Purpose: Revumenib, an oral, small molecule in...",1/1/2025,JCO.24.00826,"[{'Author': 'Ghayas C. Issa', 'Disclosures': [...",revumenib,Syndax,True,...,1,1,0,0,1,1,0,0,0,0
4,Inotuzumab Ozogamicin and Low-Intensity Chemot...,Patrice Chevallier; Thibaut Leguay; Marc De...,France; France; France; United Kingdom; Czech ...,Purpose: The use of inotuzumab ozogamicin (InO...,12/20/2024,JCO.24.00490,"[{'Author': 'Patrice Chevallier', 'Disclosures...",inotuzumab ozogamicin,Pfizer,True,...,1,0,0,0,0,0,0,0,0,0


In [15]:
# exclude articles before 1/1/2010
df['publication_date'] = pd.to_datetime(df['publication_date'])
df_analysis = df[df['publication_date'] >= '2010-01-01']

# remove columns
df_analysis = df_analysis.drop(columns=['authors', 'countries', 'author_disclosures', 'company_name', 'abstract', 'doi', 'first_author_coi', 'last_author_coi'])

df_analysis.head()

Unnamed: 0,title,publication_date,product_name,company_in_the_list,company_name_variants,multiple_nationality,main_country,no_total_authors,num_author_with_any_coi,coi_employment,...,first_author_coi_all,last_author_coi_all,coi_employment_first_author,coi_employment_last_author,coi_advisory_first_author,coi_advisory_last_author,coi_speakers_first_author,coi_speakers_last_author,coi_honoraria_first_author,coi_honoraria_last_author
0,Adjuvant Nivolumab in High-Risk Muscle-Invasiv...,2025-01-01,nivolumab,True,"['Bristol Myers Squibb', 'Bristol Myers Squibb...",1,USA,22,0,0,...,1,1,0,0,0,1,0,0,0,0
1,Impact of an Epstein-Barr Virus Serology-Based...,2025-01-01,,False,[],1,China,23,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Single or Double Induction With 7 + 3 Containi...,2025-01-01,daunorubicin,False,[],0,Germany,44,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Menin Inhibition With Revumenib for KMT2A-Rear...,2025-01-01,revumenib,True,"['Syndax', 'syndax']",1,USA,27,0,0,...,1,1,0,0,1,1,0,0,0,0
4,Inotuzumab Ozogamicin and Low-Intensity Chemot...,2024-12-20,inotuzumab ozogamicin,True,"['Pfizer', 'Bristol Myers Squibb/Pfizer', 'Ser...",1,France,40,0,0,...,1,0,0,0,0,0,0,0,0,0


In [16]:
df_analysis.to_csv('results/analysis_dataset.csv', index=False)
