In [1]:
import pandas as pd
import numpy as np
from itertools import combinations 
from collections import defaultdict

In [2]:
def get_paper_authors(series):
    # remove comma from names
    authors = series.str.replace(',','')


    # split authors 
    authors = authors.str.split(pat=';')


    # for author in authors: 
    authors.apply(lambda x : [str(x[i]) for i in range(len(x))] if isinstance(x,str) else x)

    # delete first item in each list
    authors = authors.apply(lambda x: [x[i].split('/') for i in range(len(x)) ] if isinstance(x,list) else x)
    authors = authors.apply(lambda x : [x[i][1:] for i in range(len(x))] if isinstance(x,list) else x)

    #authors = authors.apply(lambda x : [x[i] for i in range(len(x)) if i % 2 != 0])
    author_id = authors.apply(lambda x : [i for elem in x for i  in elem] if isinstance(x, list) else x)
    
    return author_id

In [21]:
def get_unique_authors(authors):
    
    # append unique authors to list
    unique_authors = list()
    for _,names in authors.items():
        for name in names : 
            if name in unique_authors:
                continue
            else :
                unique_authors.append(name)
    
    # sort in alphabetical order
    unique_authors.sort()
    
    return unique_authors

In [17]:
def get_coauthor_matrix(series):
    # get paper : authors list
    paper_authors = get_paper_authors(series)
    
    coauthor_matrix = defaultdict(lambda: defaultdict(int))

    for _, group in paper_authors.items():
        # create a list of author combinations for this article
        if isinstance(group,list) :
            author_pairs = combinations(group, 2)
            # update the co-author matrix for each author pair
            for pair in author_pairs:
                coauthor_matrix[pair[0]][pair[1]] += 1
                coauthor_matrix[pair[1]][pair[0]] += 1

            
    return coauthor_matrix

In [5]:
def to_dataframe(dict_file):
    df = pd.DataFrame(dict_file)
    df.fillna(value=0, inplace=True)
    
    return df

In [18]:
df_super_conducter = pd.read_excel('super_conducter_science.xls')
df_lung_cancer  = pd.read_excel('lung_cancer.xls')

In [19]:
coauthor_SC = get_coauthor_matrix(df_super_conducter['Researcher Ids'])
coauthor_LC = get_coauthor_matrix(df_lung_cancer['Researcher Ids'])

In [22]:
df_coauthor_LC = to_dataframe(coauthor_LC)
df_coauthor_SC = to_dataframe(coauthor_SC)

In [23]:
coauthor_LC

defaultdict(<function __main__.get_coauthor_matrix.<locals>.<lambda>()>,
            {'B-1277-2014': defaultdict(int,
                         {'AAC-5192-2020': 1,
                          'L-4554-2015': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-2015': 1}),
             'AAC-5192-2020': defaultdict(int,
                         {'B-1277-2014': 1,
                          'L-4554-2015': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-2015': 1}),
             'L-4554-2015': defaultdict(int,
                         {'B-1277-2014': 1,
                          'AAC-5192-2020': 1,
                          'H-8031-2014': 1,
                          'B-7157-2017': 1,
                          'N-9666-2013': 1,
                          'M-9715-

In [24]:
df_coauthor_LC.to_csv('coauthor_lung_cancer.csv', index=True)

In [25]:
df_coauthor_SC.to_csv('coauthor_super_conducter.csv', index=True)

In [28]:
df_coauthor_LC.head()

Unnamed: 0,B-1277-2014,AAC-5192-2020,L-4554-2015,H-8031-2014,B-7157-2017,N-9666-2013,M-9715-2015,GYV-1209-2022,AAC-6130-2022,HNJ-1606-2023,...,AAC-7705-2019,P-2559-2015,C-4495-2009,C-4324-2009,Z-3985-2019,HTO-5007-2023,A-1292-2009,K-5816-2017,P-6831-2014,O-7399-2017
AAC-5192-2020,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L-4554-2015,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H-8031-2014,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-7157-2017,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N-9666-2013,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df = pd.DataFrame(coauthor_LC)

In [31]:
df.fillna(0, inplace=True)

In [33]:
df.index

Index(['AAC-5192-2020', 'L-4554-2015', 'H-8031-2014', 'B-7157-2017',
       'N-9666-2013', 'M-9715-2015', 'B-1277-2014', 'AAC-6130-2022',
       'GYV-1209-2022', 'U-3020-2019',
       ...
       'P-2559-2015', 'S-5576-2019', 'C-4324-2009', 'Z-3985-2019',
       'C-4495-2009', 'A-1292-2009', 'HTO-5007-2023', 'P-6831-2014',
       'O-7399-2017', 'K-5816-2017'],
      dtype='object', length=183)