In [21]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

In [3]:
# Read the graph
G = nx.read_graphml("graphs/all.graphml")

# Find the maximum cliques
cliques = list(nx.find_cliques(G))

In [4]:
# print the number of cliques
print(f"Number of cliques: {len(cliques)}")
# print the size of the largest clique
print(f"Size of the largest clique: {len(max(cliques, key=len))}")

Number of cliques: 69879
Size of the largest clique: 99


In [5]:
# Find the maximum clique
max_clique = max(cliques, key=len)

In [45]:
# load data_clean
df = pd.read_csv("../data_light/data_clean.csv")
df['researchers_per_pub'] = df.groupby('pub_id')['researcher_id'].transform('count')

# find the researchers in the maximum clique
researchers = df[df['researcher_id'].isin(max_clique)]
researchers.head()

Unnamed: 0,pub_id,researcher_id,journal,year,gender,LMIC,researchers_per_pub
3467,pub.1001235026,ur.0706607200.18,JAMA,2007,male,0,14
26396,pub.1009244195,ur.0601454161.39,JAMA,2013,male,0,12
47570,pub.1017038770,ur.0706133702.96,New England Journal of Medicine,2016,female,0,21
92797,pub.1033600882,ur.0706133702.96,Nature Medicine,2012,female,0,22
98333,pub.1035740594,ur.0706133702.96,JAMA,2014,female,0,24


In [25]:
# sort by publications with most authors
researchers.groupby('pub_id')['researcher_id'].count().sort_values(ascending=False)

pub_id
pub.1145514438    99
pub.1036421772     2
pub.1123046780     2
pub.1001235026     1
pub.1009244195     1
pub.1017038770     1
pub.1033600882     1
pub.1035740594     1
pub.1135436665     1
pub.1139508327     1
pub.1151221129     1
Name: researcher_id, dtype: int64

In [26]:
# for all cliques, compute the number of publications
publications_per_clique = []
for clique in tqdm(cliques):
    clique_publications = df[df['researcher_id'].isin(clique)]['pub_id'].unique().tolist()
    # number of publications per clique
    publications_per_clique.append(len(clique_publications))    


100%|██████████| 69879/69879 [1:09:31<00:00, 16.75it/s]


In [55]:
# create a DataFrame to store the results
clique_df = pd.DataFrame({
    'clique': cliques,
    'publications': publications_per_clique,
    'authors': [len(clique) for clique in cliques]
}).sort_values(by='publications', ascending=False)

# save the DataFrame to a CSV file
clique_df.to_csv("cliques/cliques.csv", index=False)


In [46]:
max_pub_clique = cliques[63192]
researchers = df[df['researcher_id'].isin(max_pub_clique)]

In [47]:
# value counts on lmic and gender
researchers['gender'].value_counts(normalize=True)

male      0.931911
female    0.068089
Name: gender, dtype: float64

In [53]:
researchers.value_counts('researchers_per_pub', normalize=True)

researchers_per_pub
3     0.140408
2     0.121633
4     0.083265
5     0.083265
6     0.067755
7     0.059592
8     0.045714
38    0.039184
9     0.038367
10    0.037551
11    0.029388
13    0.022041
12    0.020408
14    0.019592
71    0.017959
20    0.014694
17    0.013878
19    0.013878
23    0.013061
15    0.012245
22    0.011429
26    0.008980
25    0.008163
21    0.008163
18    0.008163
24    0.007347
31    0.005714
16    0.005714
28    0.004898
56    0.004898
33    0.004082
29    0.003265
30    0.003265
32    0.003265
85    0.002449
90    0.002449
34    0.002449
40    0.001633
62    0.001633
27    0.001633
39    0.001633
41    0.000816
44    0.000816
37    0.000816
66    0.000816
81    0.000816
84    0.000816
dtype: float64

In [56]:
# get the clique sorted by number of publications and number of authors
clique_df.sort_values(by=['publications', 'authors'], ascending=False).head(10)

Unnamed: 0,clique,publications,authors
63192,"[ur.012310237337.47, ur.01202235233.91, ur.067...",987,38
63339,"[ur.012310237337.47, ur.01202235233.91, ur.067...",968,71
63313,"[ur.012310237337.47, ur.01202235233.91, ur.067...",871,26
63215,"[ur.012310237337.47, ur.01202235233.91, ur.067...",854,24
63390,"[ur.012310237337.47, ur.01202235233.91, ur.013...",804,12
63340,"[ur.012310237337.47, ur.01202235233.91, ur.067...",799,24
63285,"[ur.012310237337.47, ur.01202235233.91, ur.067...",780,25
63194,"[ur.012310237337.47, ur.01202235233.91, ur.067...",776,24
62310,"[ur.012310237337.47, ur.01311162025.52, ur.013...",753,8
63389,"[ur.012310237337.47, ur.01202235233.91, ur.013...",709,8


In [42]:
# add column with no. researcher per publication


In [44]:
df.head()

Unnamed: 0,pub_id,researcher_id,journal,year,gender,LMIC,researchers_per_pub
0,pub.1000001707,ur.01006172666.33,JAMA,2015,female,0,28
1,pub.1000001707,ur.01012736025.78,JAMA,2015,male,0,28
2,pub.1000001707,ur.010132635727.81,JAMA,2015,male,0,28
3,pub.1000001707,ur.01023477132.25,JAMA,2015,female,0,28
4,pub.1000001707,ur.01043100547.54,JAMA,2015,female,0,28
