In [1]:
import pandas as pd
import pickle
import preprocess_utils
import numpy as np
import json

from_database = False

if from_database:
    # Load tables from database 
    dfs = preprocess_utils.load_all_tables()
else:
    # Open the file in binary read mode and load the pickle data
    with open('preprocessed_data/dfs.pickle', 'rb') as f:
        dfs = pickle.load(f)
        for key, df in dfs.items():
            print(f"DataFrame {key:<20} has {len(df):<6} rows and {len(df.columns):<6} columns.")

if from_database:
    # Save tables to pickle
    with open('preprocessed_data/dfs.pickle', 'wb') as f:
        pickle.dump(dfs, f)

    # load it back with:
    with open('preprocessed_data/dfs.pickle', 'rb') as f:
        dfs = pickle.load(f)

DataFrame abuse_report_types   has 4      rows and 4      columns.
DataFrame abuse_reports        has 0      rows and 8      columns.
DataFrame ar_internal_metadata has 1      rows and 4      columns.
DataFrame articles             has 400    rows and 33     columns.
DataFrame assertion_types      has 10     rows and 11     columns.
DataFrame assertion_versions   has 19668  rows and 10     columns.
DataFrame assertions           has 13941  rows and 14     columns.
DataFrame assertions_genes     has 2053   rows and 2      columns.
DataFrame assertions_tags      has 403    rows and 2      columns.
DataFrame assertions_technique_types has 0      rows and 2      columns.
DataFrame assertions_techniques has 0      rows and 2      columns.
DataFrame assessment_types     has 13     rows and 9      columns.
DataFrame assessments          has 0      rows and 6      columns.
DataFrame authors              has 797    rows and 10     columns.
DataFrame banned_orcid_users   has 0      rows and 5   

## A. Database articles, claims and authors

### A.1. Process journal articles from the database

In [2]:
# Process articles
articles = preprocess_utils.clean_df_from_database(dfs["articles"])
articles = articles.rename(columns={"id": "article_id"})
# add journals to articles:
journals = preprocess_utils.clean_df_from_database(dfs["journals"])
journals = journals.drop('tag', axis=1).rename(columns={"id": "journal_id", "name": "journal_name"})
articles = articles.merge(journals, on="journal_id", how="left", suffixes=('', '_journal')).drop("journal_id", axis=1)


# Extract first and last authors from authors_txt
def extract_authors(authors_txt):
    if isinstance(authors_txt, str) and authors_txt:
        authors_list = authors_txt.split(';')
        first_author = authors_list[0].strip() if authors_list else None
        last_author = authors_list[-1].strip() if authors_list else None
        return pd.Series([first_author, last_author])
    return pd.Series([None, None])

# Apply the function to create new columns
articles[['first_author_extracted', 'last_author_extracted']] = articles['authors_txt'].apply(extract_authors)

# Check the results
print(f"Total articles: {len(articles)}")
print(f"Articles with first author: {articles['first_author_extracted'].notna().sum()}")
print(f"Articles with last author: {articles['last_author_extracted'].notna().sum()}")

articles = articles.drop(columns=["key", "references_txt", "additional_context", "num",  "nber_panels", "large_scale", "nber_tables", "published_at"])


# Display the first few rows to verify
articles[['authors_txt', 'first_author_extracted', 'last_author_extracted']].head()

Total articles: 400
Articles with first author: 400
Articles with last author: 400


Unnamed: 0,authors_txt,first_author_extracted,last_author_extracted
0,Rizki RM;Rizki TM,Rizki RM,Rizki TM
1,Leulier F;Lhocine N;Lemaitre B;Meier P,Leulier F,Meier P
2,Kim YS;Han SJ;Ryu JH;Choi KH;Hong YS;Chung YH;...,Kim YS,Lee WJ
3,Tingvall TO;Roos E;Engström Y,Tingvall TO,Engström Y
4,Bhaskar V;Valentine SA;Courey AJ,Bhaskar V,Courey AJ


In [3]:


# We need to analyze and clean up affiliations in the articles dataframe

def extract_primary_affiliation(affs_json_str):
    """Extract the first affiliation from the JSON string, safely handle bad JSON format."""
    try:
        # Parse the JSON string to a Python object
        affs_list = json.loads(affs_json_str)
        
        # Get the first non-empty affiliation
        for aff in affs_list:
            if aff and isinstance(aff, list) and len(aff) > 0 and aff[0].strip():
                return aff[0].strip()
        
        return None  # No valid affiliation found
    except (json.JSONDecodeError, TypeError, IndexError):
        # Handle poorly formatted JSON or other errors
        return None

# Create a new column with clean affiliations
articles['primary_affiliation'] = articles['affs_json'].apply(extract_primary_affiliation)

# Count articles with multiple vs. single affiliations
def count_affiliations(affs_json_str):
    """Count the number of non-empty affiliations in the JSON string."""
    try:
        affs_list = json.loads(affs_json_str)
        count = sum(1 for aff in affs_list if aff and isinstance(aff, list) and len(aff) > 0 and aff[0].strip())
        return count
    except (json.JSONDecodeError, TypeError, IndexError):
        return 0

articles['affiliation_count'] = articles['affs_json'].apply(count_affiliations)

# Identify articles with multiple affiliations
multiple_aff_articles = articles[articles['affiliation_count'] > 1]

# Print summary statistics
print(f"Total articles: {len(articles)}")
print(f"Articles with 1 affiliation: {len(articles[articles['affiliation_count'] == 1])}")
print(f"Articles with multiple affiliations: {len(multiple_aff_articles)}")
print(f"Articles with no valid affiliations: {len(articles[articles['affiliation_count'] == 0])}")

# Sample of articles with multiple affiliations
if not multiple_aff_articles.empty:
    print("\nSample of articles with multiple affiliations:")
    sample = multiple_aff_articles.head(5)
    for _, row in sample.iterrows():
        print(f"Article ID: {row['article_id']}, Title: {row['title'][:50]}...")
        try:
            affs = json.loads(row['affs_json'])
            for i, aff in enumerate(affs):
                if aff and isinstance(aff, list) and len(aff) > 0 and aff[0].strip():
                    print(f"  Affiliation {i+1}: {aff[0]}")
        except:
            print("  Error parsing affiliations")
        print()

Total articles: 400
Articles with 1 affiliation: 387
Articles with multiple affiliations: 0
Articles with no valid affiliations: 13


In [4]:
articles = articles.drop(columns=["affs_json", "affiliation_count"])

# Update impact factor for "Proceedings. Biological sciences" and standardize the journal name
articles.loc[articles["journal_name"] == "Proceedings. Biological sciences", "impact_factor"] = 4.7 
articles.loc[articles["journal_name"] == "Proceedings. Biological sciences", "journal_name"] = "Proceedings Biological Sciences"


In [5]:
articles = articles.apply(preprocess_utils.safe_strip)
articles.to_csv("preprocessed_data/articles_db.csv", index=False)

In [6]:
articles

Unnamed: 0,article_id,authors_txt,title,pmid,volume,issue,abstract,year,journal_name,impact_factor,first_author_extracted,last_author_extracted,primary_affiliation
0,3347,Rizki RM;Rizki TM,Cell interactions in the differentiation of a ...,111992,12,3,The cellular events in the formation of melano...,1979,Differentiation; research in biological diversity,3.39,Rizki RM,Rizki TM,
1,1987,Leulier F;Lhocine N;Lemaitre B;Meier P,The Drosophila inhibitor of apoptosis protein ...,16894030,26,21,The founding member of the inhibitor of apopto...,2006,Molecular and cellular biology,5.30,Leulier F,Meier P,The Breakthrough Toby Robins Breast Cancer Res...
2,2761,Kim YS;Han SJ;Ryu JH;Choi KH;Hong YS;Chung YH;...,"Lipopolysaccharide-activated kinase, an essent...",10636911,275,3,Eukaryotic organisms use a similar Rel/NF-kapp...,2000,The Journal of biological chemistry,4.80,Kim YS,Lee WJ,"Laboratory of Immunology, Medical Research Cen..."
3,2649,Tingvall TO;Roos E;Engström Y,The GATA factor Serpent is required for the on...,11274409,98,7,Innate immunity in Drosophila is characterized...,2001,Proceedings of the National Academy of Science...,11.10,Tingvall TO,Engström Y,Department of Molecular Biology and Functional...
4,3441,Bhaskar V;Valentine SA;Courey AJ,A functional interaction between dorsal and co...,10660560,275,6,To identify proteins that regulate the functio...,2000,The Journal of biological chemistry,4.80,Bhaskar V,Courey AJ,"Department of Chemistry and Biochemistry, Univ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1882,Derré I;Pypaert M;Dautry-Varsat A;Agaisse H,RNAi screen in Drosophila cells reveals the in...,17967059,3,10,Chlamydia spp. are intracellular obligate bact...,2007,PLoS pathogens,6.70,Derré I,Agaisse H,"Section of Microbial Pathogenesis, Yale Univer..."
396,1815,Schmidt RL;Trejo TR;Plummer TB;Platt JL;Tang AH,Infection-induced proteolysis of PGRP-LC contr...,18308747,22,3,The Drosophila immune deficiency (IMD) pathway...,2008,FASEB journal : official publication of the Fe...,4.80,Schmidt RL,Tang AH,Department of Biochemistry and Molecular Biolo...
397,1884,Avet-Rochex A;Perrin J;Bergeret E;Fauvarque MO,Rac2 is a major actor of Drosophila resistance...,17903178,12,10,Pathogen recognition and engulfment by phagocy...,2007,Genes to cells : devoted to molecular & cellul...,2.10,Avet-Rochex A,Fauvarque MO,"Commissariat à l'Energie Atomique, DSV, iRTSV,..."
398,2041,Tanji T;Ohashi-Kobayashi A;Natori S,Participation of a galactose-specific C-type l...,16475980,396,1,A galactose-specific C-type lectin has been pu...,2006,The Biochemical journal,4.10,Tanji T,Natori S,"Department of Cell Biochemistry, Graduate Scho..."


### A.2. Process claims from the database

claims is merged with articles


In [7]:
# Main processing
claims = dfs["assertions"].copy()
print(len(claims))
claims = claims[claims["obsolete"] == False].copy()
print(len(claims))
claims = preprocess_utils.clean_df_from_database(claims)

id_cols = [col for col in claims.columns if "_id" in col]
print(id_cols)

claims = claims.merge(articles, on="article_id", how="left", suffixes=('', '_article'))
id_cols = [col for col in claims.columns if "_id" in col]
print(id_cols)

# Process assertion types
assertion_types = preprocess_utils.clean_df_from_database(dfs["assertion_types"])
assertion_types = assertion_types.rename(columns={"id": "assertion_type_id", "name": "assertion_type"})
claims = claims.merge(assertion_types, on="assertion_type_id", how="left", suffixes=('', '_assertion_type')).drop("assertion_type_id", axis=1)

# Process assessment types
assessment_types = preprocess_utils.clean_df_from_database(dfs["assessment_types"])
assessment_types = assessment_types.rename(columns={"id": "assessment_type_id", "name": "assessment_type"})
claims = claims.merge(assessment_types, on="assessment_type_id", how="left", suffixes=('', '_assessment_type')).drop("assessment_type_id", axis=1)

id_cols = [col for col in claims.columns if "_id" in col]
print(id_cols)

13941
13299
['article_id', 'assertion_type_id', 'assessment_type_id']
['article_id', 'assertion_type_id', 'assessment_type_id']
['article_id']


In [8]:
claims = claims.drop(['badge_tag_classes','description'], axis=1) # most not consistently used accross dataset
claims = claims.set_index('id')

In [9]:
claims = claims.apply(preprocess_utils.safe_strip)
claims.to_csv('preprocessed_data/claims_db.csv')

string_columns = claims.select_dtypes(include='object').columns.drop(["assessment_type", "journal_name"])
claims_truncated = claims.copy()
for col in string_columns:
    if col in claims_truncated.columns:
        claims_truncated[col] = claims_truncated[col].apply(lambda x: preprocess_utils.truncate_string(x))

# Save truncated dataframe
claims_truncated.to_csv('preprocessed_data/claims_db_truncated_for_llm.csv', index=False)
claims_truncated

Unnamed: 0_level_0,content,article_id,rank,pmid,authors_txt,title,pmid_article,volume,issue,abstract,...,journal_name,impact_factor,first_author_extracted,last_author_extracted,primary_affiliation,assertion_type,label,is_assessed,assessment_type,rank_assessment_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5838,"<p>Belozerov VE, Lin...",2049.0,1,,Zhuang ZH;Zhou Y;Yu ...,Regulation of Drosop...,16014325.0,18,4,The p38 mitogen-acti...,...,Cellular signalling,4.8,Zhuang ZH,Ge BX,Signal Transduction ...,reference,Reference,False,,
693,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,"Science (New York, N.Y.)",56.9,Choe KM,Anderson KV,Molecular Biology Pr...,assessment,Assessment,False,Not assessed,13.0
695,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,"Science (New York, N.Y.)",56.9,Choe KM,Anderson KV,Molecular Biology Pr...,assessment,Assessment,False,Not assessed,13.0
5840,Verified by many sub...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,Proceedings of the National Academy of Science...,11.1,Zettervall CJ,Hultmark D,Umeå Centre for Mole...,assessment,Assessment,False,Verified,1.0
5839,Activation of a cell...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,Proceedings of the National Academy of Science...,11.1,Zettervall CJ,Hultmark D,Umeå Centre for Mole...,main_claim,Main claim,True,Verified,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,The DIF antiserum us...,2898.0,1,,Williams MJ;Rodrigue...,The 18-wheeler mutat...,9321392.0,16,20,Mammals and insects ...,...,The EMBO journal,11.4,Williams MJ,Eldon ED,Department of Biolog...,comment,Comment,False,Not assessed,13.0
11330,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,Journal of cell science,4.0,Shia AK,Ligoxygakis P,Department of Bioche...,assessment,Assessment,False,"Unchallenged, logically consistent",6.0
11329,Spatzle (#gene:FBgn0...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,Journal of cell science,4.0,Shia AK,Ligoxygakis P,Department of Bioche...,main_claim,Main claim,True,"Unchallenged, logically consistent",6.0
11350,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,Journal of cell science,4.0,Shia AK,Ligoxygakis P,Department of Bioche...,assessment,Assessment,False,"Unchallenged, logically consistent",6.0


In [10]:
claims_truncated

Unnamed: 0_level_0,content,article_id,rank,pmid,authors_txt,title,pmid_article,volume,issue,abstract,...,journal_name,impact_factor,first_author_extracted,last_author_extracted,primary_affiliation,assertion_type,label,is_assessed,assessment_type,rank_assessment_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5838,"<p>Belozerov VE, Lin...",2049.0,1,,Zhuang ZH;Zhou Y;Yu ...,Regulation of Drosop...,16014325.0,18,4,The p38 mitogen-acti...,...,Cellular signalling,4.8,Zhuang ZH,Ge BX,Signal Transduction ...,reference,Reference,False,,
693,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,"Science (New York, N.Y.)",56.9,Choe KM,Anderson KV,Molecular Biology Pr...,assessment,Assessment,False,Not assessed,13.0
695,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,"Science (New York, N.Y.)",56.9,Choe KM,Anderson KV,Molecular Biology Pr...,assessment,Assessment,False,Not assessed,13.0
5840,Verified by many sub...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,Proceedings of the National Academy of Science...,11.1,Zettervall CJ,Hultmark D,Umeå Centre for Mole...,assessment,Assessment,False,Verified,1.0
5839,Activation of a cell...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,Proceedings of the National Academy of Science...,11.1,Zettervall CJ,Hultmark D,Umeå Centre for Mole...,main_claim,Main claim,True,Verified,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,The DIF antiserum us...,2898.0,1,,Williams MJ;Rodrigue...,The 18-wheeler mutat...,9321392.0,16,20,Mammals and insects ...,...,The EMBO journal,11.4,Williams MJ,Eldon ED,Department of Biolog...,comment,Comment,False,Not assessed,13.0
11330,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,Journal of cell science,4.0,Shia AK,Ligoxygakis P,Department of Bioche...,assessment,Assessment,False,"Unchallenged, logically consistent",6.0
11329,Spatzle (#gene:FBgn0...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,Journal of cell science,4.0,Shia AK,Ligoxygakis P,Department of Bioche...,main_claim,Main claim,True,"Unchallenged, logically consistent",6.0
11350,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,Journal of cell science,4.0,Shia AK,Ligoxygakis P,Department of Bioche...,assessment,Assessment,False,"Unchallenged, logically consistent",6.0


### A.3. Process authors from database

there are about two author per article... though the total of authors is 797 so some are missing.

In [11]:
a = preprocess_utils.clean_df_from_database(dfs["authors"])  
a[a["article_id"] == 2053]

Unnamed: 0,id,article_id,name,sex,career_stage_id,leading_author,first_author,expertise_level_id
0,2,2053,Brun S,1,3,False,True,2
1,3,2053,Lemaitre B,1,4,True,False,2


In [12]:
len(a)

797

In [13]:
a[a["name"] == "Lemaitre B"].head()

Unnamed: 0,id,article_id,name,sex,career_stage_id,leading_author,first_author,expertise_level_id
1,3,2053,Lemaitre B,1,4,True,False,2
7,8,3437,Lemaitre B,1,4,True,False,2
14,16,2801,Lemaitre B,1,4,True,False,2
30,36,2617,Lemaitre B,1,4,True,False,2
44,56,2869,Lemaitre B,1,4,True,False,2


In [14]:
a[a["article_id"] == 2451]

Unnamed: 0,id,article_id,name,sex,career_stage_id,leading_author,first_author,expertise_level_id
17,19,2451,De Gregorio E,1,3,False,True,2
95,20,2451,Lemaitre B,1,5,False,False,2
789,795,2451,Brey PT,1,5,True,False,2


In [15]:
authors = preprocess_utils.clean_df_from_database(dfs["authors"].copy())
authors
career_stage = dfs["career_stages"].rename(columns={"id": "career_stage_id"})
authors = authors.merge(career_stage, on="career_stage_id", how="left", suffixes=('', '_career_stage')).drop("career_stage_id", axis=1)
expertise_level = dfs["expertise_levels"].rename(columns={"id": "expertise_level_id"})
authors = authors.merge(expertise_level, on="expertise_level_id", how="left", suffixes=('', '_expertise_level')).drop("expertise_level_id", axis=1)
authors = authors.rename(columns={"name_career_stage": "career_stage", "name_expertise_level": "expertise_level", "id": "author_id"})

# Convert sex values: 1 to "Male", 0 to "Female"
authors.loc[authors["sex"] == 1, "sex"] = "Male"
authors.loc[authors["sex"] == 0, "sex"] = "Female"

authors

  authors.loc[authors["sex"] == 1, "sex"] = "Male"


Unnamed: 0,author_id,article_id,name,sex,leading_author,first_author,career_stage,expertise_level
0,2,2053,Brun S,Male,False,True,Post-doc,Experienced
1,3,2053,Lemaitre B,Male,True,False,Junior PI,Experienced
2,1,1650,Charroux B,Male,False,True,Post-doc,Experienced
3,97,2149,Söderhäll K,Male,True,False,Senior PI,Experienced
4,5,2278,Silverman N,Male,False,True,Post-doc,Newcomer
...,...,...,...,...,...,...,...,...
792,797,2067,Cerenius L,Male,True,False,Senior PI,Experienced
793,798,2122,Higgins DE,Male,True,False,Senior PI,Newcomer
794,799,2184,Mengin-Lecreulx D,Male,True,False,Senior PI,Experienced
795,800,2130,Moore KJ,Female,True,False,Junior PI,Newcomer


In [16]:
# check authors thare not both leading and first authors, or that are neither using XOR
authors[~(authors["leading_author"] ^ authors["first_author"])]

Unnamed: 0,author_id,article_id,name,sex,leading_author,first_author,career_stage,expertise_level
95,20,2451,Lemaitre B,Male,False,False,Senior PI,Experienced
194,127,2113,Schneider DS,Male,False,False,Junior PI,Experienced
195,197,3440,Lemaitre B,Male,False,False,Junior PI,Experienced
196,169,2184,Lemaitre B,Male,False,False,Junior PI,Experienced
243,250,1947,Schneider DS,Male,True,True,Senior PI,Experienced
256,260,1763,Schneider DS,Male,False,False,Senior PI,Experienced
257,252,2730,Shahabuddin M,Male,False,False,Junior PI,Newcomer
287,303,3458,Hultmark D,Male,True,True,Senior PI,Experienced
329,251,2730,Schneider DS,Male,True,True,Junior PI,Newcomer
334,378,1702,Williams MJ,Male,True,True,Junior PI,Experienced


In [17]:
#authors = authors.merge(articles, on="article_id", how="left", suffixes=('', '_article'))
#authors.sort_values(by=["article_id", "author_id", ])



In [18]:
for idr, row in authors[authors["author_key"] == "wang z"].iterrows():
    print(row["article_id"], end=" ")
    this_article = articles[articles["article_id"] == row["article_id"]]
    print(this_article["title"].values[0], this_article["journal_name"].values[0], this_article["year"].values[0])

KeyError: 'author_key'

In [None]:
for idr, row in authors[authors["author_key"] == "zhang z"].iterrows():
    print(row["article_id"], end=" ")
    this_article = articles[articles["article_id"] == row["article_id"]]
    print(this_article["title"].values[0], this_article["journal_name"].values[0], this_article["year"].values[0])

1491 Functional role of charged residues in drosomycin, a Drosophila antifungal peptide. Developmental and comparative immunology 2010
1606 Identification of a cis-regulatory element required for 20-hydroxyecdysone enhancement of antimicrobial peptide gene expression in Drosophila melanogaster. Insect molecular biology 2009


After a google search:
* 1557 Pathogen entrapment by transglutaminase--a conserved early innate immune mechanism. PLoS pathogens 2010 **is Zhi Wang**
* 1423 Host and pathogen glycosaminoglycan-binding proteins modulate antimicrobial peptide responses in Drosophila melanogaster. Infection and immunity 2011 **is Zhipeng Wang**
* 1491 Functional role of charged residues in drosomycin, a Drosophila antifungal peptide. Developmental and comparative immunology 2010 **is Zhenting Zhang**
* 1606 Identification of a cis-regulatory element required for 20-hydroxyecdysone enhancement of antimicrobial peptide gene expression in Drosophila melanogaster. Insect molecular biology 2009 **is assumed to be Zhaolin Zhang**

In [None]:
authors[(authors["author_key"] == "wang z") & (authors["article_id"] == 1557)]["Name"] = "Wang Zhi"
authors[(authors["author_key"] == "wang z") & (authors["article_id"] == 1423)]["Name"] = "Wang Zhipeng"
authors[(authors["author_key"] == "zhang z") & (authors["article_id"] == 1491)]["Name"] = "Zhang Zhenting"
authors[(authors["author_key"] == "zhang z") & (authors["article_id"] == 1606)]["Name"] = "Zhang Zhaolin"

In [None]:
authors= preprocess_utils.build_author_key(authors, author_name_col="name", key_col="author_key")
authors = authors.apply(preprocess_utils.safe_strip)
authors.to_csv('preprocessed_data/authors_db.csv', index=False)

Unnamed: 0,author_id,article_id,name,sex,leading_author,first_author,career_stage,expertise_level,author_key
724,745,1535,Abdelsadik A,Male,False,True,PhD,Experienced,abdelsadik a
485,496,2122,Agaisse H,Male,False,True,Post-doc,Experienced,agaisse h
487,499,1882,Agaisse H,Male,True,False,Junior PI,Experienced,agaisse h
15,17,2297,Agaisse H,Male,False,True,Post-doc,Experienced,agaisse h
104,108,1752,Aggarwal K,Female,False,True,PhD,Experienced,aggarwal k
...,...,...,...,...,...,...,...,...,...
754,784,1491,Zhu S,Male,True,False,Junior PI,Newcomer,zhu s
740,714,1756,Zhu S,Male,True,False,Junior PI,Newcomer,zhu s
653,621,1748,Zhu S,Male,True,False,Senior PI,Newcomer,zhu s
588,588,2025,Zhuang ZH,Male,False,True,PhD,Newcomer,zhuang zh
