In [80]:
import pandas as pd
import networkx as nx
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the graph
G = nx.read_graphml("graphs/all.graphml")

# Find the maximum cliques
cliques = list(nx.find_cliques(G))

In [4]:
# print the number of cliques
print(f"Number of cliques: {len(cliques)}")
# print the size of the largest clique
print(f"Size of the largest clique: {len(max(cliques, key=len))}")

Number of cliques: 69879
Size of the largest clique: 99


In [81]:
# load data_clean
df = pd.read_csv("../data_light/data_clean.csv")
df['researchers_per_pub'] = df.groupby('pub_id')['researcher_id'].transform('count')

In [26]:
# for all cliques, compute the number of publications
publications_per_clique = []
for clique in tqdm(cliques):
    clique_publications = df[df['researcher_id'].isin(clique)]['pub_id'].unique().tolist()
    # number of publications per clique
    publications_per_clique.append(len(clique_publications))    


100%|██████████| 69879/69879 [1:09:31<00:00, 16.75it/s]


In [55]:
# create a DataFrame to store the results
clique_df = pd.DataFrame({
    'clique': cliques,
    'publications': publications_per_clique,
    'authors': [len(clique) for clique in cliques]
}).sort_values(by='publications', ascending=False)

# save the DataFrame to a CSV file
clique_df.to_csv("cliques/cliques.csv", index=False)


In [84]:
# Top 5 cliques per number of publications
# find 5 cliques that do not intersect, start with cliques with most publications, stop when 5 cliques are found
clique_df_sorted = clique_df.sort_values(by=['publications'], ascending=False)

selected_cliques = []
most_pubs_indexes = []

for index, row in tqdm(clique_df_sorted.iterrows()):
    clique = set(row['clique'])

    if all(len(clique.intersection(selected_clique)) == 0 for selected_clique in selected_cliques):
        selected_cliques.append(clique)
        most_pubs_indexes.append(index)

    if len(selected_cliques) == 5:
        break

# iterate over the selected cliques, find the researchers, save to dataframe the 
# number of authors, number of publications, percentages of LMIC, percentages of female
clique_df_selected = pd.DataFrame()

for clique in tqdm(selected_cliques):
    clique_df_selected = clique_df_selected.append({
        'authors': len(clique),
        'publications': len(df[df['researcher_id'].isin(clique)]['pub_id'].unique()),
        'HIC': df[df['researcher_id'].isin(clique)]['LMIC'].value_counts(normalize=True)[0],
        'male': df[df['researcher_id'].isin(clique)]['gender'].value_counts(normalize=True)[0]
    }, ignore_index=True)


clique_df_selected

134it [00:00, 2791.77it/s]
100%|██████████| 5/5 [00:01<00:00,  4.84it/s]


Unnamed: 0,authors,publications,HIC,male
0,38.0,987.0,0.967347,0.931911
1,57.0,553.0,0.593103,0.764622
2,6.0,477.0,0.948,0.936709
3,62.0,471.0,0.950207,0.798883
4,7.0,465.0,1.0,1.0


In [85]:
# Top 5 Cliques by number of authors
# find 5 cliques that do not intersect, start with cliques with most authors, stop when 5 cliques are found
clique_df_sorted = clique_df.sort_values(by=['authors'], ascending=False)

selected_cliques = []
most_pubs_indexes = []

for index, row in tqdm(clique_df_sorted.iterrows()):
    clique = set(row['clique'])

    if all(len(clique.intersection(selected_clique)) == 0 for selected_clique in selected_cliques):
        selected_cliques.append(clique)
        most_pubs_indexes.append(index)

    if len(selected_cliques) == 5:
        break

# iterate over the selected cliques, find the researchers, save to dataframe the 
# number of authors, number of publications, percentages of LMIC, percentages of female
clique_df_selected = pd.DataFrame()

for clique in tqdm(selected_cliques):
    clique_df_selected = clique_df_selected.append({
        'authors': len(clique),
        'publications': len(df[df['researcher_id'].isin(clique)]['pub_id'].unique()),
        'HIC': df[df['researcher_id'].isin(clique)]['LMIC'].value_counts(normalize=True)[0],
        'male': df[df['researcher_id'].isin(clique)]['gender'].value_counts(normalize=True)[0]
    }, ignore_index=True)


clique_df_selected

4it [00:00, 307.47it/s]
100%|██████████| 5/5 [00:01<00:00,  3.65it/s]


Unnamed: 0,authors,publications,HIC,male
0,99.0,11.0,1.0,0.656566
1,95.0,78.0,0.990148,0.678756
2,94.0,39.0,1.0,0.926174
3,91.0,113.0,0.648,0.742222
4,91.0,23.0,0.884615,0.615385
