## Merge

In [19]:
import pandas as pd
import pyreadr

# ============================================
# LOAD BASE DATA
# ============================================
# Train/val/test splits plus the saved predictions table, read in that order.
train_df, val_df, test_df, predictions_df = map(
    pd.read_csv,
    (
        'data/processed/train.csv',
        'data/processed/val.csv',
        'data/processed/test.csv',
        'results/unused_predictions.csv',
    ),
)

# read_r returns a dict-like of data frames; only its first entry is used.
pubmed = pyreadr.read_r('data/raw/pubmed.rds')
pubmed_df = next(iter(pubmed.values()))

autoreg = pyreadr.read_r('data/raw/autoregulatoryDB.rds')
autoreg_df = next(iter(autoreg.values()))

# ============================================
# PREPARE LABELED DATA (1,332 papers)
# ============================================
# Stack the three splits, add the label/confidence columns in one pass, and
# keep a fixed set of six columns.
labeled_df = (
    pd.concat([train_df, val_df, test_df], ignore_index=True)
    .assign(
        # The first comma-separated entry of 'Terms' becomes the mechanism type.
        mechanism_type=lambda frame: frame['Terms'].map(
            lambda terms: terms.split(',')[0].strip()
        ),
        # Labeled rows get confidence 1.0 and are flagged as training data.
        stage1_confidence=1.0,
        stage2_confidence=1.0,
        has_mechanism=True,
        is_training_data=True,
    )
    .loc[:, ['PMID', 'has_mechanism', 'stage1_confidence',
             'mechanism_type', 'stage2_confidence', 'is_training_data']]
)

# ============================================
# PREPARE PREDICTIONS (252,880 papers)
# ============================================
# Every predicted row is flagged as not being training data.
predictions_df = predictions_df.assign(is_training_data=False)

# ============================================
# COMBINE ALL PAPERS
# ============================================
# Labeled rows first, then predictions, with a fresh 0..n-1 index.
all_df = pd.concat((labeled_df, predictions_df), ignore_index=True)

# ============================================
# ADD PUBMED METADATA
# ============================================
# Left join keeps every paper even when pubmed_df has no row for its PMID.
all_df = pd.merge(all_df, pubmed_df, how='left', on='PMID')

# ============================================
# ADD PROTEIN INFO FROM AUTOREGDB
# ============================================
def _join_unique_accessions(accessions):
    """Return the distinct non-null values as a single ', '-separated string."""
    return ', '.join(accessions.dropna().astype(str).unique())


# Pull the digits out of the 'PubMed=<digits>' token in RX; rows without a
# match end up as NaN after the numeric conversion.
rx_pubmed_ids = autoreg_df['RX'].str.extract(r'PubMed=(\d+)', expand=False)
autoreg_df['PMID'] = pd.to_numeric(rx_pubmed_ids, errors='coerce')

# One row per PMID: all distinct accessions joined together, plus the first OS.
autoreg_agg = (
    autoreg_df
    .groupby('PMID', as_index=False)
    .agg(AC=('AC', _join_unique_accessions), OS=('OS', 'first'))
)

# Left join so papers without an AutoregDB entry are kept (AC/OS stay missing).
all_df = all_df.merge(autoreg_agg, how='left', on='PMID')

# ============================================
# CREATE FINAL COLUMNS
# ============================================
# Protein_ID is "<first accession>_<PMID>", or "NA_<PMID>" when the paper has
# no accession. Built with vectorized string operations instead of a row-wise
# apply, which is slow on a table with this many rows; the resulting strings
# are the same.
first_accession = all_df['AC'].str.split(', ').str[0].fillna('NA')
all_df['Protein_ID'] = first_accession + '_' + all_df['PMID'].astype(str)

# Boolean flag -> 'Yes' / 'No'; any value outside the mapping becomes NaN.
all_df['Presence_of_Autoregulatory_Mechanism'] = all_df['has_mechanism'].map({
    True: 'Yes',
    False: 'No'
})

# The stage-1 confidence is exposed unchanged under its report name.
all_df['Probability_of_Presence'] = all_df['stage1_confidence']

# Rows flagged as training data are labeled 'UniProt', all others 'Non-UniProt'.
all_df['Source'] = all_df['is_training_data'].map({
    True: 'UniProt',
    False: 'Non-UniProt'
})

all_df = all_df.rename(columns={
    'mechanism_type': 'Autoregulatory_Type',
    'stage2_confidence': 'Term_Probability'
})

# ============================================
# SELECT FINAL COLUMNS
# ============================================
# Derive final_df from all_df here so the cell runs on a fresh kernel instead
# of depending on a final_df left over from an earlier session.
# Column order follows the table displayed in the next cell.
# NOTE(review): Title/Abstract/Journal/Authors are expected to come from the
# PubMed merge above - confirm against pubmed_df's columns.
final_df = all_df[[
    'Protein_ID', 'AC', 'OS', 'PMID',
    'Title', 'Abstract', 'Journal', 'Authors',
    'Presence_of_Autoregulatory_Mechanism', 'Probability_of_Presence',
    'Source', 'Autoregulatory_Type', 'Term_Probability',
]].rename(columns={
    'Protein_ID': 'Protein ID',
    'Presence_of_Autoregulatory_Mechanism': 'Has Mechanism',  # short display name
    'Probability_of_Presence': 'Mechanism Probability',
    'Autoregulatory_Type': 'Autoregulatory Type',
    'Term_Probability': 'Type Confidence'
})



In [20]:
# Show the merged table; the output below is pandas' truncated head/tail preview.
final_df

Unnamed: 0,Protein ID,AC,OS,PMID,Title,Abstract,Journal,Authors,Has Mechanism,Mechanism Probability,Source,Autoregulatory Type,Type Confidence
0,P19712_24606708,P19712,Classical swine fever virus (strain Alfort/Tue...,24606708,Autocatalytic activity and substrate specifici...,Pestivirus N(pro) is the first protein transla...,Virology,"Keerthi Gottipati, Sudheer Acholi, Nicolas Rug...",Yes,1.000000,UniProt,autocatalytic,1.0
1,O54992_15538386,"O54992, Q61532",Mus musculus (Mouse),15538386,Scaffolding by ERK3 regulates MK5 in development.,"Extracellular-regulated kinase 3 (ERK3, MAPK6)...",The EMBO journal,"Stefanie Schumacher, Kathrin Laass, Shashi Kan...",Yes,1.000000,UniProt,autophosphorylation,1.0
2,B8BF91_20118235,"B8BF91, Q1MX30",Oryza sativa subsp indica (Rice).,20118235,A conserved threonine residue in the juxtamemb...,Despite the key role that pattern recognition ...,The Journal of biological chemistry,"Xuewei Chen, Mawsheng Chern, Patrick E Canlas,...",Yes,1.000000,UniProt,autophosphorylation,1.0
3,Q91J24_17280695,"Q91J24, P0C2W9, Q39011, Q9LHP4",Beet curly top virus (strain California/Logan)...,17280695,Geminivirus pathogenicity protein C4 interacts...,Beet curly top virus (BCTV) C4 interacted with...,Virology,"Nathalie Piroux, Keith Saunders, Anthony Page,...",Yes,1.000000,UniProt,autophosphorylation,1.0
4,Q52430_9057331,Q52430,Pseudomonas savastanoi pv phaseolicola (Pseudo...,9057331,"Expression of avrPphB, an avirulence gene from...",Protein production encoded by the avirulence g...,Molecular plant-microbe interactions : MPMI,"N Puri, C Jenner, M Bennett, R Stewart, J Mans...",Yes,1.000000,UniProt,autocatalytic,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
254207,P18541_9557665,P18541,Lymphocytic choriomeningitis virus (strain Arm...,9557665,"Two RING finger proteins, the oncoprotein PML ...",The promyelocytic leukemia (PML) protein forms...,Journal of virology,"K L Borden, E J Campbelldwyer, G W Carlile, M ...",No,0.992245,Non-UniProt,none,0.0
254208,P18541_10708446,P18541,Lymphocytic choriomeningitis virus (strain Arm...,10708446,The lymphocytic choriomeningitis virus RING pr...,Only a few host cell proteins that associate w...,Journal of virology,"E J Campbell Dwyer, H Lai, R C MacDonald, M S ...",No,0.973285,Non-UniProt,none,0.0
254209,P18541_11533204,P18541,Lymphocytic choriomeningitis virus (strain Arm...,11533204,RING finger Z protein of lymphocytic choriomen...,Arenaviruses have a bisegmented negative-stran...,Journal of virology,"T I Cornu, J C de la Torre",No,0.815674,Non-UniProt,none,0.0
254210,P18541_12050381,P18541,Lymphocytic choriomeningitis virus (strain Arm...,12050381,Characterization of the arenavirus RING finger...,The prototypic arenavirus lymphocytic choriome...,Journal of virology,"Tatjana I Cornu, Juan Carlos de la Torre",No,0.843628,Non-UniProt,none,0.0
