In [1]:
import os
import pathlib
import pandas as pd
from tqdm import tqdm

## Settings

In [2]:
# Load the sample publication table; the path is resolved relative to the
# current working directory. Then print a per-column summary of the frame.
sample_csv = pathlib.Path("..") / ".." / "data" / "sample2.csv"
df = pd.read_csv(sample_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   eid               235 non-null    object 
 1   s2id              230 non-null    object 
 2   doi               235 non-null    object 
 3   title             235 non-null    object 
 4   abstract          232 non-null    object 
 5   year              235 non-null    int64  
 6   authors           235 non-null    object 
 7   author_ids        235 non-null    object 
 8   affiliations      235 non-null    object 
 9   funding           109 non-null    object 
 10  PACs              95 non-null     object 
 11  publication_name  235 non-null    object 
 12  subject_areas     235 non-null    object 
 13  s2_authors        230 non-null    object 
 14  s2_author_ids     230 non-null    object 
 15  citations         201 non-null    object 
 16  references        191 non-null    object 
 1

# Create a Tensor

In [3]:
import sparse
from TELF.pre_processing import Beaver
from TELF.post_processing.Wolf.utils import load_file_as_dict

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_id_to_name(df, name_col, id_col):
    """
    Creates a map of id to name.
    The first occurrence of an (id, name) pair is recorded; later names seen
    for the same id are ignored.

    Both columns are expected to hold ';'-separated strings whose entries line
    up position by position. Rows where either column is missing (NaN/None) or
    otherwise not a string are skipped instead of raising on ``.split``.

    Parameters:
    -----------
    df: pd.DataFrame
        The input DataFrame.
    name_col: str
        The column that contains the ';'-separated names
    id_col: str
        The column that contains the ';'-separated ids

    Returns:
    --------
    dict:
        The id to name map
    """
    id_to_name = {}
    for raw_ids, raw_names in zip(df[id_col].to_list(), df[name_col].to_list()):
        # Missing values come back from pandas as float NaN / None, which have
        # no .split(); such rows carry no usable pairs, so skip them.
        if not isinstance(raw_ids, str) or not isinstance(raw_names, str):
            continue
        # zip() pairs ids and names by position; if the two lists differ in
        # length the unmatched tail of the longer one is dropped.
        for id_, name_ in zip(raw_ids.split(';'), raw_names.split(';')):
            # setdefault keeps the first name recorded for an id.
            id_to_name.setdefault(id_, name_)
    return id_to_name


def create_tensor(df, col, output_dir='/tmp', joblib_backend='loky', n_jobs=-1, verbose=False):
    """
    Create a co-occurrence tensor from the specified dataframe and column.

    Runs ``Beaver.coauthor_tensor`` on ``df`` with ``[col, 'year']`` as the
    target columns, asking it to save its results under ``output_dir``, and
    then loads those saved results back from disk.

    Parameters
    ----------
    df: pd.DataFrame
        The input DataFrame. Must contain ``col`` and a ``year`` column.
    col: str
        The column whose ';'-separated entries are related to each other
        (e.g. author ids, affiliation ids, countries).
    output_dir: str, Path
        Directory passed to Beaver as ``save_path`` and read back from.
        It is created if it does not exist yet.
    joblib_backend: str
        Forwarded to Beaver as ``joblib_backend``.
    n_jobs: int
        Forwarded to Beaver as ``n_jobs``.
    verbose: bool
        Forwarded to Beaver as ``verbose``.

    Returns
    -------
    tuple
        ``(X, author_map, time_map)``: the sparse array loaded from
        ``coauthor.npz`` and the dicts loaded from ``Authors.txt`` and
        ``Time.txt`` in ``output_dir``.
    """
    # Make sure the save/load directory exists. An empty path means the current
    # directory, which os.makedirs cannot create, so leave that case alone.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # create Beaver object to generate the
    # (co-relationship x co-relationship x time) tensor
    beaver = Beaver()
    settings = {
        "dataset": df,
        "target_columns": [col, 'year'],
        "split_authors_with": ';',
        "save_path": output_dir,
        "verbose": verbose,
        "n_jobs": n_jobs,
        "authors_idx_map": {},
        "joblib_backend": joblib_backend,  # e.g. 'loky' or 'multiprocessing'
    }
    beaver.coauthor_tensor(**settings)

    # Read back the artifacts written under output_dir.
    X = sparse.load_npz(os.path.join(output_dir, 'coauthor.npz'))
    time_map = load_file_as_dict(os.path.join(output_dir, 'Time.txt'))
    author_map = load_file_as_dict(os.path.join(output_dir, 'Authors.txt'))
    return X, author_map, time_map

# Plots

In [5]:
from TELF.post_processing.Wolf import plot_tensor_graph_slices

### A. Co-Author

In [6]:
# Build the co-author tensor from the ';'-separated author ids, single process,
# with progress output; the bare `X` displays the resulting sparse array.
X, node_map, time_map = create_tensor(
    df,
    col='author_ids',
    n_jobs=1,
    verbose=True,
)
X

preparing dictionaries for communication


100%|██████████| 1/1 [00:00<00:00, 3880.02it/s]
100%|██████████| 302/302.0 [00:00<00:00, 1179403.92it/s]




0,1
Format,coo
Data Type,float64
Shape,"(9, 9, 36)"
nnz,302
Density,0.10356652949245541
Read-only,True
Size,9.4K
Storage ratio,0.41


In [7]:
# Node attribute map for plotting: author id -> {'name': author name}.
author_name_map = get_id_to_name(df, 'authors', 'author_ids')
author_attributes = {
    author_id: {'name': author_name}
    for author_id, author_name in author_name_map.items()
}

len(author_attributes)

9

In [8]:
# Author display name -> author id for the authors referenced in the plot
# below. Every id here is looked up in `author_attributes` as a sanity check.
name_to_slic = {
    'Rasmussen K.': '0506d5ab-b679-415f-a3ce-40c762a73251',
    'Alexandrov B.': 'dd635e7f-7148-4350-b808-7492912a8e19',
    'Eren M.E.': 'f70f1e3b-55cb-4160-a358-15aa647ebf60',
    'Bhattarai M.': '3b85ee58-825f-4734-8a19-26b6de71c7fc',
    'Solovyev N.': 'a1ae2e36-2319-4c17-8722-f32129c3d1a2'
}

for author_name, slic_id in name_to_slic.items():
    # `<8` / `<3` left-justify to a minimum width, same as str.ljust.
    print(f'{author_name:<8}, {slic_id:<3}, {author_attributes[slic_id]}')

Rasmussen K., 0506d5ab-b679-415f-a3ce-40c762a73251, {'name': 'Rasmussen K.'}
Alexandrov B., dd635e7f-7148-4350-b808-7492912a8e19, {'name': 'Alexandrov B.'}
Eren M.E., f70f1e3b-55cb-4160-a358-15aa647ebf60, {'name': 'Eren M.E.'}
Bhattarai M., 3b85ee58-825f-4734-8a19-26b6de71c7fc, {'name': 'Bhattarai M.'}
Solovyev N., a1ae2e36-2319-4c17-8722-f32129c3d1a2, {'name': 'Solovyev N.'}


In [9]:
# Animate the co-author graph over the time slices, tracing and highlighting
# the same two authors.
plot_tensor_graph_slices(
    X, node_map, time_map,
    node_attributes=author_attributes,
    trace_nodes=[name_to_slic[who] for who in ('Rasmussen K.', 'Alexandrov B.')],
    highlight_nodes=[name_to_slic[who] for who in ('Rasmussen K.', 'Alexandrov B.')],
    window=0,  # don't unlink authors after a connection is made
    globalize_node_sizes=True,  # set max node size from largest degree node across all years
    filter_isolated_nodes=False,  # if True only show nodes that have edges
    verbose=True,
)

Generating graphs for each slice. . .
100%|██████████| 36/36 [00:01<00:00, 33.16it/s]
Composing graphs with window size=0. . .
100%|██████████| 36/36 [00:00<00:00, 8786.95it/s]
Generating Plotly frames. . .
100%|██████████| 36/36 [00:00<00:00, 322.22it/s]
Preparing figure. . .


### B. Co-Affiliation

In [10]:
import ast

# Each `affiliations` value is the string form of a dict literal keyed by
# affiliation id, whose values carry at least "country" and "name". Flatten it
# into three ';'-joined strings per paper: ids, names, and countries.
affiliation_ids, affiliation_names, countries = [], [], []
for raw_affiliations in df["affiliations"]:
    parsed = ast.literal_eval(raw_affiliations)
    affiliation_ids.append(";".join(parsed))  # iterating a dict yields its keys
    country_name_pairs = [(info["country"], info["name"]) for info in parsed.values()]
    affiliation_names.append(";".join(name for _, name in country_name_pairs))
    countries.append(";".join(country for country, _ in country_name_pairs))

# Attach the flattened values as new columns so create_tensor can target them.
df["affiliation_ids"] = affiliation_ids
df["affiliation_names"] = affiliation_names
df["countries"] = countries

In [11]:
# Build the co-affiliation tensor from the ';'-separated affiliation ids,
# single process, with progress output; the bare `X` displays the result.
X, node_map, time_map = create_tensor(
    df,
    col='affiliation_ids',
    n_jobs=1,
    verbose=True,
)
X

preparing dictionaries for communication


100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]
100%|██████████| 74/74.0 [00:00<00:00, 1293243.73it/s]


0,1
Format,coo
Data Type,float64
Shape,"(24, 24, 36)"
nnz,74
Density,0.0035686728395061726
Read-only,True
Size,2.3K
Storage ratio,0.01


In [12]:
# create a node attribute map so that author IDs are given names
affiliation_name_map = get_id_to_name(df, 'affiliation_names', 'affiliation_ids')
affiliation_attributes = {}
for aff_id, aff_name in affiliation_name_map.items():
    affiliation_attributes[aff_id] = {'name': aff_name}

len(affiliation_attributes)

24

In [13]:
# Each entry of `affiliation_ids` is the ';'-joined id string of ONE paper, so
# `affiliation_ids[:3]` can contain combined strings such as "id1;id2" and
# repeated values. Split the first three papers' entries into individual,
# de-duplicated affiliation ids (order of first appearance is kept).
# NOTE(review): assumes trace_nodes expects individual node ids, as in the
# co-author plot above - confirm against plot_tensor_graph_slices.
affiliation_trace_nodes = list(dict.fromkeys(
    single_id
    for joined_ids in affiliation_ids[:3]
    for single_id in joined_ids.split(';')
    if single_id  # a paper without affiliations yields an empty string
))

plot_tensor_graph_slices(X, node_map, time_map,
                         node_attributes=affiliation_attributes,
                         trace_nodes=affiliation_trace_nodes,
                         window=3,  # sliding window of 3 years (only show connections from up to 3 years ago)
                         globalize_node_sizes=True,  # set max node size from largest degree node across all years
                         filter_isolated_nodes=False,  # if True only show nodes that have edges
                         verbose=True)

Generating graphs for each slice. . .
100%|██████████| 36/36 [00:00<00:00, 7436.71it/s]
Composing graphs with window size=3. . .
100%|██████████| 36/36 [00:00<00:00, 23872.72it/s]
Generating Plotly frames. . .
100%|██████████| 36/36 [00:00<00:00, 219.49it/s]
Preparing figure. . .


### C. Co-Country

In [14]:
# Build the co-country tensor from the ';'-separated country names,
# single process, with progress output; the bare `X` displays the result.
X, node_map, time_map = create_tensor(
    df,
    col='countries',
    n_jobs=1,
    verbose=True,
)
X

preparing dictionaries for communication


100%|██████████| 1/1 [00:00<00:00, 16131.94it/s]
100%|██████████| 74/74.0 [00:00<00:00, 340700.87it/s]


0,1
Format,coo
Data Type,float64
Shape,"(24, 24, 36)"
nnz,74
Density,0.0035686728395061726
Read-only,True
Size,2.3K
Storage ratio,0.01


In [15]:
# Each entry of `countries` is the ';'-joined country string of ONE paper, so
# `countries[:3]` can contain combined strings such as "A;B" and repeated
# values. Split the first three papers' entries into individual, de-duplicated
# country names (order of first appearance is kept).
# NOTE(review): assumes trace_nodes expects individual node ids, as in the
# co-author plot above - confirm against plot_tensor_graph_slices.
country_trace_nodes = list(dict.fromkeys(
    single_country
    for joined_countries in countries[:3]
    for single_country in joined_countries.split(';')
    if single_country  # a paper without affiliations yields an empty string
))

plot_tensor_graph_slices(X, node_map, time_map,
                         trace_nodes=country_trace_nodes,
                         window=1,  # only show connections for the current slice (year)
                         globalize_node_sizes=False,  # each slice uses local normalization for node sizing
                         filter_isolated_nodes=False,  # if True only show nodes that have edges
                         verbose=True)

Generating graphs for each slice. . .
100%|██████████| 36/36 [00:00<00:00, 8055.64it/s]
Composing graphs with window size=1. . .
100%|██████████| 36/36 [00:00<00:00, 1161499.57it/s]
Generating Plotly frames. . .
100%|██████████| 36/36 [00:00<00:00, 221.11it/s]
Preparing figure. . .
