In [1]:
import pandas as pd
import pickle
import preprocess_utils

# Switch between the two data sources:
#   True  -> load the tables through preprocess_utils.load_all_tables()
#   False -> reuse the pickle snapshot stored under preprocessed_data/
from_database = True

if not from_database:
    # Cached path: deserialize the snapshot of the tables.
    # NOTE: pickle.load can execute arbitrary code; only read snapshots you created yourself.
    with open('preprocessed_data/dfs.pickle', 'rb') as snapshot:
        dfs = pickle.load(snapshot)
else:
    # Live path: fetch the tables with the project helper.
    dfs = preprocess_utils.load_all_tables()


Found 46 tables: abuse_report_types, abuse_reports, ar_internal_metadata, articles, assertion_types, assertion_versions, assertions, assertions_genes, assertions_tags, assertions_technique_types, assertions_techniques, assessment_types, assessments, authors, banned_orcid_users, career_stages, claim_types, claims, comment_checks, comments, db_sets, ensembl_subdomains, expertise_levels, gene_names, gene_set_items, gene_sets, genes, journals, orcid_users, organisms, reason_types, reasons, rel_types, rels, schema_migrations, sessions, shares, statuses, tags, technique_types, techniques, tool_types, tools, users, workspace_orcid_users, workspaces
Loading abuse_report_types (4 rows)
Loading abuse_reports (0 rows)
Loading ar_internal_metadata (1 rows)
Loading articles (400 rows)
Loading assertion_types (10 rows)
Loading assertion_versions (19668 rows)
Loading assertions (13941 rows)
Loading assertions_genes (2053 rows)
Loading assertions_tags (403 rows)
Loading assertions_technique_types (0 r

In [2]:
if from_database:
    # Persist the freshly loaded tables so that later runs can set
    # from_database = False and skip the database.
    with open('preprocessed_data/dfs.pickle', 'wb') as sink:
        pickle.dump(dfs, sink)

    # Round trip: read the snapshot straight back from disk.
    with open('preprocessed_data/dfs.pickle', 'rb') as source:
        dfs = pickle.load(source)


## 2. Build the `claims` dataframe by merging in all the referenced tables

In [6]:
def clean_df(df):
    """Return a copy of ``df`` without bookkeeping / presentation columns.

    The input frame is never modified. Two passes are applied to the copy:

    1. columns whose name is exactly one of a fixed list are dropped
       (names that are absent from ``df`` are ignored);
    2. every remaining column whose name contains one of a few substrings
       is dropped.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with the matching columns removed.
    """
    # Exact column names to discard ('pmid' is deliberately kept).
    exact_names = (
        'user_id', 'orcid_user_id',
        'created_at', 'updated_at', 'assertion_updated_at',
        'workspace_id', 'doi', 'organism_id',
        'all_tags_json', 'obsolete', 'ext', 'badge_classes', 'pluralize_title',
        'can_attach_file', 'refresh_side_panel', 'icon_classes', 'btn_classes',
    )
    # Any column whose name contains one of these fragments is discarded too.
    name_fragments = ('validated', 'filename', 'obsolete_article')

    cleaned = df.copy()

    # Pass 1: exact matches that are actually present.
    present = [name for name in exact_names if name in cleaned.columns]
    if present:
        cleaned = cleaned.drop(columns=present)

    # Pass 2: substring matches among the columns that survived pass 1.
    matched = [
        column
        for fragment in name_fragments
        for column in cleaned.columns
        if fragment in column
    ]
    if matched:
        cleaned = cleaned.drop(columns=matched)

    return cleaned

def truncate_string(s, max_length=20):
    """Shorten long strings, marking the cut with an ellipsis.

    A string longer than ``max_length`` is reduced to its first ``max_length``
    characters followed by '...' (so the result is ``max_length + 3``
    characters long). Shorter strings and non-string values are returned
    unchanged.
    """
    if not isinstance(s, str):
        return s
    if len(s) <= max_length:
        return s
    return f"{s[:max_length]}..."

# Main processing: build one wide `claims` table from the `assertions` table
# and the lookup tables it references.
claims = dfs["assertions"].copy()
print(len(claims))
# Keep only the assertions whose `obsolete` flag is False.
claims = claims[claims["obsolete"] == False].copy()
print(len(claims))
claims = clean_df(claims)

# Columns containing "_id" that are still present (printed to track what remains to be resolved).
id_cols = [col for col in claims.columns if "_id" in col]
print(id_cols)

# Process articles: rename the articles' `id` so it can be joined on `article_id`.
articles = clean_df(dfs["articles"])
articles = articles.rename(columns={"id": "article_id"})
# Left join keeps every claim row; article columns that clash with a claim column get the '_article' suffix.
claims = claims.merge(articles, on="article_id", how="left", suffixes=('', '_article'))

id_cols = [col for col in claims.columns if "_id" in col]
print(id_cols)

# Process journals: drop the journals' `tag` column and rename `id`/`name` before joining.
journals = clean_df(dfs["journals"])
journals = journals.drop('tag', axis=1).rename(columns={"id": "journal_id", "name": "journal_name"})
# The join key `journal_id` is dropped once the journal columns have been attached.
claims = claims.merge(journals, on="journal_id", how="left", suffixes=('', '_journal')).drop("journal_id", axis=1)

# Process assertion types: the type's `name` becomes the `assertion_type` column.
assertion_types = clean_df(dfs["assertion_types"])
assertion_types = assertion_types.rename(columns={"id": "assertion_type_id", "name": "assertion_type"})
claims = claims.merge(assertion_types, on="assertion_type_id", how="left", suffixes=('', '_assertion_type')).drop("assertion_type_id", axis=1)

# Process assessment types: the type's `name` becomes the `assessment_type` column.
assessment_types = clean_df(dfs["assessment_types"])
assessment_types = assessment_types.rename(columns={"id": "assessment_type_id", "name": "assessment_type"})
claims = claims.merge(assessment_types, on="assessment_type_id", how="left", suffixes=('', '_assessment_type')).drop("assessment_type_id", axis=1)

# Final check of the remaining "_id" columns after all joins.
id_cols = [col for col in claims.columns if "_id" in col]
print(id_cols)

13941
13299
['article_id', 'assertion_type_id', 'assessment_type_id']
['article_id', 'assertion_type_id', 'assessment_type_id', 'journal_id']
['article_id']


In [7]:
# Remove columns that are mostly not used consistently across the dataset,
# then index the claims by their `id`.
claims = (
    claims
    .drop(columns=['published_at', 'badge_tag_classes', 'description', 'additional_context', 'references_txt'])
    .set_index('id')
)

In [8]:
# "Proceedings. Biological sciences": set its impact factor and standardize its name.
# The row mask is computed once, before either column is modified.
is_proc_biol_sci = claims["journal_name"] == "Proceedings. Biological sciences"
claims.loc[is_proc_biol_sci, "impact_factor"] = 4.7
claims.loc[is_proc_biol_sci, "journal_name"] = "Proceedings Biological Sciences"

In [9]:
def _decode_ampersand(value):
    """Replace the HTML entity '&amp;' by '&' in strings; return other values unchanged."""
    if isinstance(value, str):
        return value.replace('&amp;', '&')
    return value


# Columns stored with the `object` dtype.
string_columns = claims.select_dtypes(include='object').columns
print(string_columns)

for col in string_columns:
    claims[col] = claims[col].apply(_decode_ampersand)

# Full export (the index is written as well).
claims.to_csv('preprocessed_data/claims.csv')

# Truncated export: every object column except these two is shortened with truncate_string.
string_columns = string_columns.drop(["assessment_type", "journal_name"])
df_truncated = claims.copy()
for col in string_columns:
    if col in df_truncated.columns:
        df_truncated[col] = df_truncated[col].apply(truncate_string)

# Save truncated dataframe (written without the index)
df_truncated.to_csv('preprocessed_data/claims_truncated_for_llm.csv', index=False)
df_truncated

Index(['content', 'pmid', 'authors_txt', 'title', 'volume', 'issue',
       'abstract', 'key', 'affs_json', 'large_scale', 'journal_name',
       'assertion_type', 'label', 'assessment_type'],
      dtype='object')


Unnamed: 0_level_0,content,article_id,rank,pmid,authors_txt,title,pmid_article,volume,issue,abstract,...,large_scale,nber_tables,nber_panels,journal_name,impact_factor,assertion_type,label,is_assessed,assessment_type,rank_assessment_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5838,"<p>Belozerov VE, Lin...",2049.0,1,,Zhuang ZH;Zhou Y;Yu ...,Regulation of Drosop...,16014325.0,18,4,The p38 mitogen-acti...,...,False,0.0,14.0,Cellular signalling,4.8,reference,Reference,False,,
693,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,False,,,"Science (New York, N.Y.)",56.9,assessment,Assessment,False,Not assessed,13.0
695,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,False,,,"Science (New York, N.Y.)",56.9,assessment,Assessment,False,Not assessed,13.0
5840,Verified by many sub...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,False,2.0,9.0,Proceedings of the National Academy of Science...,11.1,assessment,Assessment,False,Verified,1.0
5839,Activation of a cell...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,False,2.0,9.0,Proceedings of the National Academy of Science...,11.1,main_claim,Main claim,True,Verified,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,The DIF antiserum us...,2898.0,1,,Williams MJ;Rodrigue...,The 18-wheeler mutat...,9321392.0,16,20,Mammals and insects ...,...,False,0.0,27.0,The EMBO journal,11.4,comment,Comment,False,Not assessed,13.0
11330,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,False,1.0,27.0,Journal of cell science,4.0,assessment,Assessment,False,"Unchallenged, logically consistent",6.0
11329,Spatzle (#gene:FBgn0...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,False,1.0,27.0,Journal of cell science,4.0,main_claim,Main claim,True,"Unchallenged, logically consistent",6.0
11350,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,False,1.0,27.0,Journal of cell science,4.0,assessment,Assessment,False,"Unchallenged, logically consistent",6.0


## 3. Preprocess the author files

Last files version:
- **0: First** `input_data/2025-03-28/2025-03-27_dropbox_6_ReproSci/0_last version/2025_Last version_March 19th/0_First author cleaned_March 18th.xlsx`
  - Pour les statistiques sur ceux qui deviennent PI, il ne faut pas prendre en compte ceux qui sont surlignés en bleu, car ce sont des premiers auteurs qui sont déjà PI. Normalement ce fichier est bon.  
Avec un peu d’approximation, je trouve que sur 291 first authors : 107 deviennent PI, 16 sont « ?? » (indéterminés) et 13 sont déjà PI (en bleu), ce qui fait 107/262 ≈ 40,8 % (262 = 291 − 16 − 13).  
- **2: Both** `input_data/2025-03-28/2025-03-27_dropbox_6_ReproSci/0_last version/2025_Last version_March 19th/2_2025_March 9th_stats_author.xlsx`
  - obtenu depuis le site
  - tu trouveras les last authors avec de nouveaux critères pour historical (tradition), continuity, first and last (pour junior et senior, il faut voir par article dans la database).  
  - Feuille 1 : Parmi les first author (1er feuille) :  
    - Zhenting Zhang et Zhaolin Zhang  sont fusionnés en Zhang Z  
    - Zhipeng Wang et Zhi Wang sont fusionnés en Wang Z  
    - Hedengren M et Hedengren M et Hedengren-Olcott M ne sont pas fusionnés
- **3:citations** `input_data/2025-03-28/2025-03-27_dropbox_6_ReproSci/0_last version/2025_Last version_March 19th/3_2025_March 9th_citation_counts.xlsm`
  -  il s’agit des citations : je ne l’ai pas travaillé.  
 - **4:first&last** `input_data/2025-03-28/2025-03-27_dropbox_6_ReproSci/0_last version/2025_Last version_March 19th/4_working first and last.xlsx`
    -   il s’agit d’une liste des auteurs qui sont first and last, mais il ne faut pas prendre en compte ceux qui sont PI.  
Un ppt avec une liste des figures  


 It seems that
 - sex -> FH
 - PhD Post-doc -> FH
 - Become a Pi -> FH
 - current job -> FH
 - MD -> **???**
 - Affiliation -> Both
 - Country -> Both
 - Ivy League -> Both

In [38]:
# Read the author Excel (.xlsx) workbooks with pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  preprocess_utils

# Workbook holding the per-author statistics (sheets 'First' and 'Leading' are read below).
stat_author_fn = "input_data/2025-03-28/2025-03-27_dropbox_6_ReproSci/0_last version/2025_Last version_March 19th/2_2025_March 9th_stats_author.xlsx"

# NOTE(review): this reads the *truncated* export, so long text values end with '...';
# confirm that is intended for anything derived from claims_df.
claims_df = pd.read_csv('preprocessed_data/claims_truncated_for_llm.csv')
major_claims_df = claims_df[claims_df['assertion_type'] == 'major_claim']

# Read Author info, which contains all the pairs
paper_auth_pairs = pd.read_excel('input_data/2025-02-14_last_xlsx/1_Triage_Last author.xlsx', sheet_name='Tri sans les doublons')
# Drop all columns with 'Unnamed' in the name, as well as the 'Source' column
paper_auth_pairs = paper_auth_pairs.drop(columns=[col for col in paper_auth_pairs.columns if 'Unnamed' in col]).drop(columns=['Source'])
# Rename 'Rizki MT' / 'RIZKI MT' to 'Rizki TM' in the 'last author' column
# (whole-value replacement; the 'first author' column is left untouched)
paper_auth_pairs['last author'] = paper_auth_pairs['last author'].replace({'Rizki MT': 'Rizki TM','RIZKI MT': 'Rizki TM' })
# Keep a CSV copy of the cleaned pairs table next to the workbook
paper_auth_pairs.to_csv('input_data/2025-02-14_last_xlsx/1_Triage_Last author.csv', index=False)

# Load both author sheets and tag each row with the sheet it came from
first_authors_claims = pd.read_excel(stat_author_fn, sheet_name='First')
leading_authors_claims = pd.read_excel(stat_author_fn, sheet_name='Leading')
leading_authors_claims["Authorship"]= "Leading"
first_authors_claims["Authorship"]= "First"

# Stack leading and first authors into one table (original row labels are kept)
authors_claims = pd.concat([leading_authors_claims, first_authors_claims])
# Decode the Sex column: 1 -> 'Male', 0 -> 'Female'; any other value becomes NaN
authors_claims['Sex'] = authors_claims['Sex'].map({1: 'Male', 0: 'Female'})
# Drop columns whose name contains '%' and the 'Unnamed' columns
authors_claims = authors_claims.drop(columns=[col for col in authors_claims.columns if '%' in col])
authors_claims = authors_claims.drop(columns=[col for col in authors_claims.columns if 'Unnamed' in col])

# Normalise column headers (fixes the 'Conituinity' spelling, the capitalisation of
# 'Partially verified' and the leading '*' of the historical-lab column)
authors_claims = authors_claims.rename(columns={'Conituinity': 'Continuity', 
                                                "Partially verified":"Partially Verified", 
                                                "*Historical lab after 1998":"Historical lab after 1998"
                                                })

# Nullable boolean dtype, so missing values stay missing instead of being coerced
authors_claims['Historical lab'] = authors_claims['Historical lab'].astype('boolean')
authors_claims['Continuity'] = authors_claims['Continuity'].astype('boolean')
# Rows without a Name cannot be matched to an author, so they are removed
authors_claims = authors_claims.dropna(subset=['Name'])
# drop  Schneider DS+D from names as  Schneider DS and  Schneider D are also there in separate rows
authors_claims = authors_claims[~(authors_claims['Name'] == 'Schneider DS+D')]

# Export the cleaned author statistics
authors_claims.to_csv('input_data/2025-02-14_last_xlsx/stats_author.csv', index=False)

### A. Last authors

In [39]:
# Inspect the last-author / first-author pairs table loaded from the triage workbook.
paper_auth_pairs

Unnamed: 0,last author,first author,Sex,PhD Post-doc,Become a Pi,current job,MD,Affiliation,Country,Ivy league
0,Agaisse H,Derré,0,Post-doc,1,PI,0,Yale University,USA,1
1,Aguilera RJ,Seong CS,1,PhD,0,Academia,0,The University of Texas,USA,0
2,Aigaki T,Tsuda M,1,PhD,1,Admin,0,Tokyo Metropolitan University,Japan,0
3,Ando,Markus,1,PhD,0,Facility leader,0,"Hungarian Academy of Sciences, Szeged",Hungary,0
4,Ando,Rus F,0,PhD,0,??,0,"Hungarian Academy of Sciences, Szeged",Hungary,0
...,...,...,...,...,...,...,...,...,...,...
315,Yu XQ,Ao J,??,PhD,??,Senior Staff,0,niversity of Missouri-Kansas City,USA,0
316,Zhu S,Yuan Y,1,PhD,0,Senior Staff,0,"Institute of Zoology, Chinese Academy of Sciences",China,0
317,Zhu S,Tian C,0,PhD,1,PI,0,"Institute of Zoology, Chinese Academy of Sciences",China,0
318,Zhu S,Zhang Z,0,PhD,0,Academia,0,"Institute of Zoology, Chinese Academy of Sciences",China,0


In [40]:
def _fold_name(names):
    """Lower-case a Series of names, apply NFKD normalisation and drop every non-ASCII character."""
    return (
        names.str.lower()
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )


# Pairs side: lab-level columns only, de-duplicated on the last author by the
# project helper (presumably one row per last author; verify in preprocess_utils).
paper_auth_pairs_LH = preprocess_utils.deduplicate_by(
    paper_auth_pairs[["last author", "Affiliation", "Country", "Ivy league"]],
    "last author",
).copy()
paper_auth_pairs_LH['lh_proc'] = _fold_name(paper_auth_pairs_LH['last author'])

# Claims side: rows tagged as 'Leading'; 'ando i' is shortened to 'ando' to align
# the key with the spelling used in the pairs table.
claims_LH = authors_claims[authors_claims['Authorship'] == 'Leading'].copy()
claims_LH['lh_proc'] = _fold_name(claims_LH['Name']).str.replace('ando i', 'ando')

In [41]:
# Count the rows whose merge key `lh_proc` occurs more than once, on each side of the join.
pairs_dup_mask = paper_auth_pairs_LH.duplicated('lh_proc', keep=False)
claims_dup_mask = claims_LH.duplicated('lh_proc', keep=False)

print(int(pairs_dup_mask.sum()), "duplicates in paper_auth_pairs_LH")
print(int(claims_dup_mask.sum()), "duplicates in claims_LH")

0 duplicates in paper_auth_pairs_LH
0 duplicates in claims_LH


In [42]:
# An outer join keeps the unmatched rows of both sides, which exposes the
# authors that are missing from one of the two tables.
all_LH = claims_LH.merge(paper_auth_pairs_LH, how='outer', on='lh_proc')
print(len(claims_LH), len(paper_auth_pairs_LH), len(all_LH))

157 160 161


In [43]:
# One line per distinct (claims name, pairs name, key) combination, ordered by key.
unique_pairs = (
    all_LH[["Name", "last author", "lh_proc"]]
    .drop_duplicates()
    .sort_values("lh_proc", ascending=True)
)
print(" ONLY in Pairs               only in Claims        matching key ")
print("-"*70)
# Report only the keys that are missing on one side of the outer join.
for pair_name, claim_name, key in zip(unique_pairs['last author'],
                                      unique_pairs['Name'],
                                      unique_pairs['lh_proc']):
    if pd.isna(pair_name) or pd.isna(claim_name):
        print('💥 ', end='')
        print(f"{pair_name:<20} vs  {claim_name:<20} --> {key:<20}")

 ONLY in Pairs               only in Claims        matching key 
----------------------------------------------------------------------
💥 Bellotti RA          vs  nan                  --> bellotti ra         
💥 nan                  vs  Nappi AJ             --> nappi aj            
💥 Shahabuddin M        vs  nan                  --> shahabuddin m       
💥 Shirasu-Hiza MM      vs  nan                  --> shirasu-hiza mm     
💥 Silvers MJ           vs  nan                  --> silvers mj          


In [44]:
# Keep only the leading authors present in both tables, ordered by merge key.
all_LH_inner = (
    claims_LH
    .merge(paper_auth_pairs_LH, how='inner', on='lh_proc')
    .sort_values("lh_proc", ascending=True)
)
print(len(all_LH_inner))
all_LH_inner.to_csv('preprocessed_data/LH_inner.csv', index=False)
all_LH_inner

156


Unnamed: 0,Name,Historical lab,Historical lab after 1998,Continuity,F and L,Sex,Articles,Major claims,Unchallenged,Verified,...,Mixed,Challenged,Start lab,Finish,Authorship,lh_proc,last author,Affiliation,Country,Ivy league
63,Agaisse H,False,0.0,False,1.0,Male,1.0,3.0,2.0,1.0,...,0.0,0.0,,,Leading,agaisse h,Agaisse H,Yale University,USA,1
107,Aguilera RJ,False,0.0,False,0.0,Male,1.0,2.0,0.0,1.0,...,0.0,1.0,,,Leading,aguilera rj,Aguilera RJ,The University of Texas,USA,0
64,Aigaki T,False,0.0,False,0.0,Male,1.0,3.0,2.0,1.0,...,0.0,0.0,,,Leading,aigaki t,Aigaki T,Tokyo Metropolitan University,Japan,0
6,Anderson KV,False,0.0,False,0.0,Female,8.0,20.0,7.0,11.0,...,0.0,2.0,1997,2010,Leading,anderson kv,Anderson KV,Memorial Sloan-Kettering Cancer Center and the...,USA,1
16,Andó I,False,0.0,True,0.0,Male,6.0,12.0,1.0,7.0,...,0.0,0.0,1999,still active,Leading,ando,Ando,"Hungarian Academy of Sciences, Szeged",Hungary,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,Xu T,False,0.0,False,0.0,Male,1.0,3.0,0.0,3.0,...,0.0,0.0,,,Leading,xu t,Xu T,Yale University School of Medicine,USA,0
146,Yamaguchi M,False,0.0,False,0.0,Male,1.0,2.0,1.0,1.0,...,0.0,0.0,,,Leading,yamaguchi m,Yamaguchi M,Aichi Cancer Center Research Institute,Japan,0
105,Yoo MA,False,0.0,False,0.0,Female,1.0,3.0,0.0,2.0,...,0.0,0.0,,,Leading,yoo ma,Yoo MA,Pusan National University,Korea,0
106,Yu XQ,False,0.0,False,0.0,Male,1.0,3.0,1.0,2.0,...,0.0,0.0,,,Leading,yu xq,Yu XQ,niversity of Missouri-Kansas City,USA,0


### B. First authors: TODO

In [45]:
# List the column names available in the author-pairs table.
paper_auth_pairs.columns

Index(['last author', 'first author', 'Sex', 'PhD Post-doc', 'Become a Pi',
       'current job', 'MD', 'Affiliation', 'Country', 'Ivy league'],
      dtype='object')

In [46]:
# Show the author-pairs table again, as a reference for the first-author processing.
paper_auth_pairs

Unnamed: 0,last author,first author,Sex,PhD Post-doc,Become a Pi,current job,MD,Affiliation,Country,Ivy league
0,Agaisse H,Derré,0,Post-doc,1,PI,0,Yale University,USA,1
1,Aguilera RJ,Seong CS,1,PhD,0,Academia,0,The University of Texas,USA,0
2,Aigaki T,Tsuda M,1,PhD,1,Admin,0,Tokyo Metropolitan University,Japan,0
3,Ando,Markus,1,PhD,0,Facility leader,0,"Hungarian Academy of Sciences, Szeged",Hungary,0
4,Ando,Rus F,0,PhD,0,??,0,"Hungarian Academy of Sciences, Szeged",Hungary,0
...,...,...,...,...,...,...,...,...,...,...
315,Yu XQ,Ao J,??,PhD,??,Senior Staff,0,niversity of Missouri-Kansas City,USA,0
316,Zhu S,Yuan Y,1,PhD,0,Senior Staff,0,"Institute of Zoology, Chinese Academy of Sciences",China,0
317,Zhu S,Tian C,0,PhD,1,PI,0,"Institute of Zoology, Chinese Academy of Sciences",China,0
318,Zhu S,Zhang Z,0,PhD,0,Academia,0,"Institute of Zoology, Chinese Academy of Sciences",China,0


In [47]:
# First-author view of the pairs table: remove the last-author column, sort by
# first author, then de-duplicate on the first author's name with the project helper.
paper_auth_pairs_FH = preprocess_utils.deduplicate_by(
    paper_auth_pairs
    .drop(columns=['last author'])
    .sort_values("first author", ascending=True),
    "first author",
)



In [48]:
# `claims_FH` and its `fh_proc` key are otherwise only created by a later cell, so on a
# fresh kernel ("Restart & Run All") this cell used to fail with a NameError.
# Build them here so the cell runs on its own.
claims_FH = authors_claims[authors_claims['Authorship'] == 'First'].copy()
# Merge key: lower-cased name with accents / non-ASCII characters stripped.
claims_FH['fh_proc'] = (
    claims_FH['Name'].str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# First-author rows whose merge key is shared by more than one row.
claims_FH[claims_FH.duplicated('fh_proc', keep=False)]

Unnamed: 0,Name,Historical lab,Historical lab after 1998,Continuity,F and L,Sex,Articles,Major claims,Unchallenged,Verified,Partially Verified,Mixed,Challenged,Start lab,Finish,Authorship,fh_proc
51,Kim YS,,,,,Male,2.0,5.0,0.0,2.0,1.0,0.0,2.0,,,First,kim ys
122,Hedengren M,,,,,Female,1.0,3.0,0.0,2.0,1.0,0.0,0.0,,,First,hedengren m
135,Kim YS,,,,,Female,1.0,3.0,0.0,2.0,1.0,0.0,0.0,,,First,kim ys
216,Hedengren M,,,,,Male,1.0,2.0,0.0,2.0,0.0,0.0,0.0,,,First,hedengren m


In [None]:
claims_FH = authors_claims[authors_claims['Authorship'] == 'First'].copy()

# Create the merge key on both sides: lower-cased and stripped of accents.
paper_auth_pairs_FH['fh_proc'] = (
    paper_auth_pairs_FH['first author'].str.lower()
    .str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
)
# 'Derré' and 'Markus' appear without initials in the pairs table; add the initial
# so that the key lines up with the claims table.
# NOTE(review): str.replace rewrites every occurrence of the substring, so any other
# name containing 'derre' or 'markus' would be altered too; confirm none exists.
paper_auth_pairs_FH['fh_proc'] = paper_auth_pairs_FH['fh_proc'].str.replace('derre', 'derre i')
paper_auth_pairs_FH['fh_proc'] = paper_auth_pairs_FH['fh_proc'].str.replace('markus', 'markus r')
claims_FH['fh_proc'] = (
    claims_FH['Name'].str.lower()
    .str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
)

# Count rows with a duplicated fh_proc key on each side.
# The labels now name the first-author frames (they were copy-pasted from the
# last-author section and wrongly said "_LH").
print(len(paper_auth_pairs_FH[paper_auth_pairs_FH.duplicated('fh_proc', keep=False)]), "duplicates in paper_auth_pairs_FH")
print(len(claims_FH[claims_FH.duplicated('fh_proc', keep=False)]), "duplicates in claims_FH")

# Outer join keeps unmatched rows from both sides so that missing authors show up.
all_FH = pd.merge(claims_FH, paper_auth_pairs_FH, on='fh_proc', how='outer')
print(len(claims_FH), len(paper_auth_pairs_FH), len(all_FH))

print(" ONLY in Pairs               only in Claims        matching key ")
print("-"*70)
unique_pairs = all_FH[["Name", "first author", "fh_proc"]].drop_duplicates().sort_values("fh_proc", ascending=True)
# Report only the keys that are missing on one side of the join.
for pair_name, claim_name, key in zip(unique_pairs['first author'],
                                      unique_pairs['Name'],
                                      unique_pairs['fh_proc']):
    if pd.isna(pair_name) or pd.isna(claim_name):
        print('💥 ', end='')
        print(f" {pair_name:<20}  {claim_name:<20} --> {key:<20}")

0 duplicates in paper_auth_pairs_LH
4 duplicates in claims_LH
292 291 294
 ONLY in Pairs               only in Claims        matching key 
----------------------------------------------------------------------
💥  nan                   Hedengren-Olcott M   --> hedengren-olcott m  
💥  RIZKI MT              nan                  --> rizki mt            
💥  Schneider D           nan                  --> schneider d         


In [None]:
# Keep the first authors present in both tables, ordered by merge key. The lab-level
# columns and the pairs-side 'Sex_y' are removed; the claims-side 'Sex_x' is kept as 'Sex'.
all_FH_inner = (
    claims_FH
    .merge(paper_auth_pairs_FH, how='inner', on='fh_proc')
    .sort_values("fh_proc", ascending=True)
    .drop(columns=['Continuity', 'Historical lab after 1998', 'Historical lab', "Sex_y"])
    .rename(columns={'Sex_x': 'Sex'})
)
print(len(all_FH_inner))
all_FH_inner.to_csv('preprocessed_data/FH_inner.csv', index=False)
all_FH_inner

291


Unnamed: 0,Name,F and L,Sex,Articles,Major claims,Unchallenged,Verified,Partially Verified,Mixed,Challenged,...,Authorship,fh_proc,first author,PhD Post-doc,Become a Pi,current job,MD,Affiliation,Country,Ivy league
80,Abdelsadik A,,Male,1.0,3.0,3.0,0.0,0.0,0.0,0.0,...,First,abdelsadik a,Abdelsadik A,Senior Staff,1,PI,0,Forschungszentrum Borstel,Germany,0
19,Agaisse H,,Male,2.0,6.0,2.0,4.0,0.0,0.0,0.0,...,First,agaisse h,Agaisse H,Post-doc,1,PI,0,Harvard Medical School,USA,1
196,Aggarwal K,,Female,1.0,2.0,0.0,2.0,0.0,0.0,0.0,...,First,aggarwal k,Aggarwal K,PhD,0,Industry,0,University of Massachusetts Medical School,USA,0
81,Ao J,,Male,1.0,3.0,1.0,2.0,0.0,0.0,0.0,...,First,ao j,Ao J,PhD,??,Senior Staff,0,niversity of Missouri-Kansas City,USA,0
20,Apidianakis Y,,Male,3.0,6.0,4.0,2.0,0.0,0.0,0.0,...,First,apidianakis y,Apidianakis Y,Post-doc,1,PI,0,Harvard Medical School and Massachusetts Gener...,USA,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,Zettervall CJ,,Male,1.0,3.0,0.0,3.0,0.0,0.0,0.0,...,First,zettervall cj,Zettervall CJ,PhD,0,Industry,0,University of Stockholm,Sweden,0
273,Zhang Z,,Female,2.0,2.0,1.0,1.0,0.0,0.0,0.0,...,First,zhang z,Zhang Z,PhD,0,Academia,0,"Institute of Zoology, Chinese Academy of Sciences",China,0
195,Zhao HW,,Female,1.0,3.0,2.0,1.0,0.0,0.0,0.0,...,First,zhao hw,Zhao HW,PhD,0,??,0,University of California San Diego,USA,1
290,Zhou Z,,Male,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,First,zhou z,Zhou Z,PhD,??,??,0,University of Houston,USA,0
