In [1]:
import pandas as pd
import pickle
import preprocess_utils

# Switch for where the tables come from: the database (True) or the pickle
# file under preprocessed_data/ (False).
from_database = False

if not from_database:
    # Read the previously pickled tables from disk.
    # NOTE(review): pickle.load can execute arbitrary code, so only load a
    # pickle file that this project wrote itself.
    with open('preprocessed_data/dfs.pickle', 'rb') as pickle_file:
        dfs = pickle.load(pickle_file)
else:
    # Fetch every table through the project helper.
    # Presumably returns a mapping of table name -> DataFrame (later cells
    # index it as dfs["assertions"]) — confirm in preprocess_utils.
    dfs = preprocess_utils.load_all_tables()


In [2]:
if from_database:
    # Write the freshly loaded tables to disk so that later runs can set
    # from_database = False and read them from the pickle instead.
    with open('preprocessed_data/dfs.pickle', 'wb') as pickle_out:
        pickle.dump(dfs, pickle_out)

    # Read the file straight back, so `dfs` holds the round-tripped objects.
    with open('preprocessed_data/dfs.pickle', 'rb') as pickle_in:
        dfs = pickle.load(pickle_in)


 ## 2. Create the claims dataframe by merging in all the referenced tables

In [13]:
def clean_df(df):
    """Return a copy of ``df`` with a fixed set of unwanted columns removed.

    Two passes are made over the columns:

    1. columns whose name is exactly one of a hard-coded list are dropped
       (names that are absent from ``df`` are ignored);
    2. columns whose name contains any of a hard-coded list of substrings
       are dropped.

    The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order.
    """
    exact_names = ['user_id', 'orcid_user_id',
                   'created_at', 'updated_at', 'assertion_updated_at',
                   'workspace_id', 'user_id', 'doi', 'organism_id',  # 'pmid',
                   'all_tags_json', 'obsolete', 'ext', 'badge_classes', 'pluralize_title',
                   'can_attach_file', 'refresh_side_panel', 'icon_classes', 'btn_classes']
    name_fragments = ['validated', 'filename', 'obsolete_article']

    # Work on a copy so the caller's frame stays untouched.
    cleaned = df.copy()

    # Pass 1: exact-name matches that are actually present.
    present = [name for name in exact_names if name in cleaned.columns]
    if present:
        cleaned = cleaned.drop(columns=present)

    # Pass 2: substring matches, evaluated against what is left after pass 1.
    matched = [
        column
        for fragment in name_fragments
        for column in cleaned.columns
        if fragment in column
    ]
    if matched:
        cleaned = cleaned.drop(columns=matched)

    return cleaned

# Main processing: start from the assertions table, then left-join each
# referenced table onto it, one at a time.
claims = dfs["assertions"].copy()
print(len(claims))
# Keep only the rows whose "obsolete" flag equals False.
claims = claims.loc[claims["obsolete"].eq(False)].copy()
print(len(claims))
claims = clean_df(claims)

# Foreign-key style columns still present on claims.
id_cols = [name for name in claims.columns if "_id" in name]
print(id_cols)

# Articles: expose the primary key as "article_id" so it lines up with claims.
articles = clean_df(dfs["articles"]).rename(columns={"id": "article_id"})
claims = pd.merge(
    claims, articles,
    on="article_id", how="left", suffixes=('', '_article'),
)

id_cols = [name for name in claims.columns if "_id" in name]
print(id_cols)

# Journals: the 'tag' column is discarded, then the key is joined and dropped.
journals = (
    clean_df(dfs["journals"])
    .drop(columns='tag')
    .rename(columns={"id": "journal_id", "name": "journal_name"})
)
claims = pd.merge(
    claims, journals,
    on="journal_id", how="left", suffixes=('', '_journal'),
)
claims = claims.drop(columns="journal_id")

# Assertion types: join, then drop the key column.
assertion_types = clean_df(dfs["assertion_types"]).rename(
    columns={"id": "assertion_type_id", "name": "assertion_type"}
)
claims = pd.merge(
    claims, assertion_types,
    on="assertion_type_id", how="left", suffixes=('', '_assertion_type'),
)
claims = claims.drop(columns="assertion_type_id")

# Assessment types: join, then drop the key column.
assessment_types = clean_df(dfs["assessment_types"]).rename(
    columns={"id": "assessment_type_id", "name": "assessment_type"}
)
claims = pd.merge(
    claims, assessment_types,
    on="assessment_type_id", how="left", suffixes=('', '_assessment_type'),
)
claims = claims.drop(columns="assessment_type_id")

id_cols = [name for name in claims.columns if "_id" in name]
print(id_cols)

13941
13299
['article_id', 'assertion_type_id', 'assessment_type_id']
['article_id', 'assertion_type_id', 'assessment_type_id', 'journal_id']
['article_id']


In [4]:
# Discard columns that are mostly not used consistently across the dataset,
# then index the claims by their 'id' column.
claims = claims.drop(
    columns=[
        'published_at',
        'badge_tag_classes',
        'description',
        'additional_context',
        'references_txt',
    ]
).set_index('id')

In [5]:
# Rows for "Proceedings. Biological sciences": set the impact factor to 4.7
# and standardize the journal name. The row mask is computed once, before
# the name is rewritten, so both assignments hit the same rows.
is_proc_b = claims["journal_name"].eq("Proceedings. Biological sciences")
claims.loc[is_proc_b, "impact_factor"] = 4.7
claims.loc[is_proc_b, "journal_name"] = "Proceedings Biological Sciences"

In [6]:
def _unescape_ampersand(value):
    """Replace the HTML entity '&amp;' with '&' in strings; pass other values through."""
    if not isinstance(value, str):
        return value
    return value.replace('&amp;', '&')


# Object-dtype columns are the ones that can hold text.
string_columns = claims.select_dtypes(include='object').columns
print(string_columns)

for col in string_columns:
    claims[col] = claims[col].apply(_unescape_ampersand)

# Full (untruncated) export; the 'id' index is written as the first column.
claims.to_csv('preprocessed_data/claims.csv')



Index(['content', 'pmid', 'authors_txt', 'title', 'volume', 'issue',
       'abstract', 'key', 'affs_json', 'large_scale', 'journal_name',
       'assertion_type', 'label', 'assessment_type'],
      dtype='object')


In [7]:
def truncate_string(s, max_length=20):
    """Shorten long strings to their first ``max_length`` characters plus '...'.

    Strings of length ``max_length`` or less, and any non-string value
    (e.g. None, NaN, numbers), are returned unchanged. Note that a
    shortened result is ``max_length + 3`` characters long because of the
    appended ellipsis.
    """
    too_long = isinstance(s, str) and len(s) > max_length
    return s[:max_length] + '...' if too_long else s

# These two text columns are left at full length.
string_columns = string_columns.drop(["assessment_type", "journal_name"])

df_truncated = claims.copy()

# Shorten every remaining text column that exists in the frame.
for col in string_columns:
    if col not in df_truncated.columns:
        continue
    df_truncated[col] = df_truncated[col].apply(truncate_string)

# Save truncated dataframe.
# NOTE(review): index=False means the 'id' index is not written to this
# file — confirm that downstream readers do not need the claim ids.
df_truncated.to_csv('preprocessed_data/claims_truncated_for_llm.csv', index=False)
df_truncated

Unnamed: 0_level_0,content,article_id,rank,pmid,authors_txt,title,pmid_article,volume,issue,abstract,...,large_scale,nber_tables,nber_panels,journal_name,impact_factor,assertion_type,label,is_assessed,assessment_type,rank_assessment_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5838,"<p>Belozerov VE, Lin...",2049.0,1,,Zhuang ZH;Zhou Y;Yu ...,Regulation of Drosop...,16014325.0,18,4,The p38 mitogen-acti...,...,False,0.0,14.0,Cellular signalling,4.8,reference,Reference,False,,
693,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,False,,,"Science (New York, N.Y.)",56.9,assessment,Assessment,False,Not assessed,13.0
695,,2517.0,1,,Choe KM;Werner T;Stö...,Requirement for a pe...,11872802.0,296,5566,Components of microb...,...,False,,,"Science (New York, N.Y.)",56.9,assessment,Assessment,False,Not assessed,13.0
5840,Verified by many sub...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,False,2.0,9.0,Proceedings of the National Academy of Science...,11.1,assessment,Assessment,False,Verified,1.0
5839,Activation of a cell...,2204.0,1,,Zettervall CJ;Anderl...,A directed screen fo...,15381778.0,101,39,An attack by a paras...,...,False,2.0,9.0,Proceedings of the National Academy of Science...,11.1,main_claim,Main claim,True,Verified,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,The DIF antiserum us...,2898.0,1,,Williams MJ;Rodrigue...,The 18-wheeler mutat...,9321392.0,16,20,Mammals and insects ...,...,False,0.0,27.0,The EMBO journal,11.4,comment,Comment,False,Not assessed,13.0
11330,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,False,1.0,27.0,Journal of cell science,4.0,assessment,Assessment,False,"Unchallenged, logically consistent",6.0
11329,Spatzle (#gene:FBgn0...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,False,1.0,27.0,Journal of cell science,4.0,main_claim,Main claim,True,"Unchallenged, logically consistent",6.0
11350,Although independent...,1577.0,1,,Shia AK;Glittenberg ...,Toll-dependent antim...,19934223.0,122,Pt 24,"In Drosophila, the h...",...,False,1.0,27.0,Journal of cell science,4.0,assessment,Assessment,False,"Unchallenged, logically consistent",6.0


## 3. Preprocess the author files
### A. Last authors
 It seems that
 - sex -> FH
 - PhD Post-doc -> FH
 - Become a Pi -> FH
 - current job -> FH
 - MD -> **???**
 - Affiliation -> Both
 - Country -> Both
 - Ivy League -> Both

In [8]:
# Read the xlsx files with pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  preprocess_utils

# Reload the truncated claims export written earlier in this notebook.
claims_df = pd.read_csv('preprocessed_data/claims_truncated_for_llm.csv')
# Fix: the assertion_type values in the claims table are 'main_claim',
# 'assessment', 'reference' and 'comment'; the previous filter value
# 'major_claim' never matched, so this frame was always empty.
major_claims_df = claims_df[claims_df['assertion_type'] == 'main_claim']

# Read Author info, which contains all the pairs
paper_auth_pairs = pd.read_excel('input_data/2025-02-14_last_xlsx/1_Triage_Last author.xlsx', sheet_name='Tri sans les doublons')
# Drop all columns with 'Unnamed' in the name, plus the 'Source' column
paper_auth_pairs = paper_auth_pairs.drop(columns=[col for col in paper_auth_pairs.columns if 'Unnamed' in col]).drop(columns=['Source'])
paper_auth_pairs.to_csv('input_data/2025-02-14_last_xlsx/1_Triage_Last author.csv', index=False)

# Per-author claim statistics: one sheet per authorship role, each tagged
# with its role before the two are stacked.
first_authors_claims = pd.read_excel('input_data/2025-03-11/2025_March 9th_stats_author.xlsx', sheet_name='First')
leading_authors_claims = pd.read_excel('input_data/2025-03-11/2025_March 9th_stats_author.xlsx', sheet_name='Leading')
leading_authors_claims["Authorship"]= "Leading"
first_authors_claims["Authorship"]= "First"

authors_claims = pd.concat([leading_authors_claims, first_authors_claims])
# Decode the numeric sex code; any value other than 1 or 0 becomes NaN.
authors_claims['Sex'] = authors_claims['Sex'].map({1: 'Male', 0: 'Female'})
# Remove columns whose name contains '%', then spreadsheet 'Unnamed' columns.
authors_claims = authors_claims.drop(columns=[col for col in authors_claims.columns if '%' in col])
authors_claims = authors_claims.drop(columns=[col for col in authors_claims.columns if 'Unnamed' in col])

# Repair the misspelled / inconsistently capitalised spreadsheet headers.
authors_claims = authors_claims.rename(columns={'Conituinity': 'Continuity', "Partially verified":"Partially Verified"})

# Nullable boolean dtype keeps missing values as <NA> instead of failing.
authors_claims['Historical lab'] = authors_claims['Historical lab'].astype('boolean')
authors_claims['Continuity'] = authors_claims['Continuity'].astype('boolean')

# NOTE(review): the stats come from input_data/2025-03-11 but are written
# into the 2025-02-14_last_xlsx folder — confirm this is intentional.
authors_claims.to_csv('input_data/2025-02-14_last_xlsx/stats_author.csv', index=False)


In [14]:
# Last-author attributes taken from the paper/author pairs table.
# presumably deduplicate_by keeps one row per "last author" — confirm in preprocess_utils
lh_columns = ["last author", "Affiliation", "Country", "Ivy league"]
paper_auth_pairs_LH = preprocess_utils.deduplicate_by(
    paper_auth_pairs[lh_columns], "last author"
).copy()

# Claim statistics restricted to the rows tagged as 'Leading' authorship.
is_leading = authors_claims['Authorship'] == 'Leading'
claims_LH = authors_claims[is_leading].copy()

In [15]:
# Build a shared merge key `lh_proc` on both tables: the author name,
# lowercased, NFKD-normalized and reduced to ASCII (which strips accents).

# Key for paper_auth_pairs_LH, derived from 'last author'.
paper_auth_pairs_LH.loc[:, 'lh_proc'] = (
    paper_auth_pairs_LH['last author']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# Key for claims_LH, derived from 'Name'. Missing names become '' first;
# the final replace rewrites the 'ando i' spelling to 'ando'.
claims_LH.loc[:, 'lh_proc'] = (
    claims_LH['Name']
    .str.lower()
    .fillna('')
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace('ando i', 'ando')
)

# Outer merge keeps unmatched rows from both sides so they can be inspected.
all_LH = claims_LH.merge(paper_auth_pairs_LH, on='lh_proc', how='outer')
print(len(claims_LH), len(paper_auth_pairs_LH), len(all_LH))

159 161 164


In [16]:
# List every distinct (Name, last author, key) combination and print the
# ones where either side of the outer merge is missing.
unique_pairs = (
    all_LH[["Name", "last author", "lh_proc"]]
    .drop_duplicates()
    .sort_values("last author", ascending=True)
)
for _, pair in unique_pairs.iterrows():
    # Rows with both names present matched correctly; nothing to report.
    if pd.notna(pair['last author']) and pd.notna(pair['Name']):
        continue
    print('💥 ', end='')
    print(f"{pair['lh_proc']:<20} {pair['last author']:<20}  {pair['Name']}")

💥 bellotti ra          Bellotti RA           nan
💥 rizki mt             RIZKI MT              nan
💥 shahabuddin m        Shahabuddin M         nan
💥 shirasu-hiza mm      Shirasu-Hiza MM       nan
💥 silvers mj           Silvers MJ            nan
💥 schneider ds+d       nan                   Schneider DS+D
💥 nappi aj             nan                   Nappi AJ
💥                      nan                   nan


In [19]:
# Inner merge keeps only the authors found in both tables; the helper key,
# the duplicate name column and the constant 'Authorship' column are dropped.
all_LH_inner = (
    claims_LH
    .merge(paper_auth_pairs_LH, on='lh_proc', how='inner')
    .drop(columns=['lh_proc', 'last author', 'Authorship'])
)
print(len(all_LH_inner))
all_LH_inner.to_csv('preprocessed_data/LH_inner.csv', index=False)

156


### B. First authors: TODO

In [None]:
# First-author counterpart of the last-author merge in section A.
# NOTE(review): this cell is still marked TODO and has never been executed;
# it assumes paper_auth_pairs has a "first author" column — confirm.
paper_auth_pairs_FH = paper_auth_pairs[["first author", "Affiliation", "Country", "Ivy league"]] # TODO
# Fix: deduplicate_by lives in preprocess_utils; the bare name raised NameError.
# .copy() makes this an independent frame so adding a column below is safe.
paper_auth_pairs_FH = preprocess_utils.deduplicate_by(paper_auth_pairs_FH, "first author").copy()
# .copy() avoids assigning a new column onto a filtered slice of authors_claims.
claims_FH = authors_claims[authors_claims['Authorship'] == 'First'].copy()

# create merge columns: lowercased and stripped of accents
paper_auth_pairs_FH['fh_proc'] = paper_auth_pairs_FH['first author'].str.lower()
paper_auth_pairs_FH['fh_proc'] = paper_auth_pairs_FH['fh_proc'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
claims_FH['fh_proc'] = claims_FH['Name'].str.lower()
claims_FH['fh_proc'] = claims_FH['fh_proc'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

# Outer merge keeps unmatched rows from both sides so they can be inspected.
all_FH = pd.merge(claims_FH, paper_auth_pairs_FH, on='fh_proc', how='outer')
print(len(claims_FH), len(paper_auth_pairs_FH), len(all_FH))

# Print only the pairs where one side of the merge is missing.
unique_pairs = all_FH[["Name", "first author", "fh_proc"]].drop_duplicates().sort_values("first author", ascending=True)
for i in range(0, len(unique_pairs)):
    if pd.isna(unique_pairs.iloc[i]['first author']) or pd.isna(unique_pairs.iloc[i]['Name']):
        print('💥 ', end='')
        print(f"{unique_pairs.iloc[i]['fh_proc']:<20} {unique_pairs.iloc[i]['first author']:<20}  {unique_pairs.iloc[i]['Name']}")
