# Load Dataset

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

from TELF.pre_processing import Beaver

In [2]:
# The sample CSV lives two directories above this notebook, under data/
sample_csv_path = os.path.join('..', '..', 'data', 'sample.csv')
df = pd.read_csv(sample_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


In [3]:
# Replace the missing citations with an empty string;
# these entries will be ignored by Beaver.citation_tensor().
# fillna() targets missing values directly and returns a new frame,
# so no regex matching or in-place mutation is involved.
df = df.fillna("")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      940 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


# Build Participation Tensor

In [4]:
# Beaver (imported from TELF.pre_processing) provides the citation_tensor() builder used below
beaver = Beaver()

In [5]:
# NOTE: using joblib_backend='multiprocessing' leads to "NameError: name 'null' is not defined"
# in the python 3.8 multiprocessing module when trying to spawn multiple processes for n_jobs > 1,
# so that option stays commented out and the run uses a single job.

settings = dict(
    dataset=df,
    # columns holding, in order: author ids, paper id, cited paper ids, publication year
    target_columns=("author_ids", "eid", "references", "year"),
    dimension_order=[0, 1, 2],
    split_authors_with=";",
    split_references_with=";",
    save_path=None,
    verbose=True,
    n_jobs=1,
    n_nodes=1,
    # joblib_backend="multiprocessing",
)

# the tensor plus the labels needed to translate each dimension's positions back into ids / years
X, author_ids, paper_ids, years = beaver.citation_tensor(**settings)

preparing dictionaries for communication


100%|██████████| 1/1 [00:00<00:00, 57.24it/s]
100%|██████████| 51119/51119.0 [00:00<00:00, 1978523.42it/s]


In [6]:
# display the tensor summary (format, shape, number of non-zeros, density)
X

0,1
Format,coo
Data Type,float64
Shape,"(7054, 940, 29)"
nnz,51119
Density,0.0002658404372848715
Read-only,True
Size,1.6M
Storage ratio,0.00


# Test Validity 

In [7]:
# Mirror the values in `settings` so the checks below use the same configuration as the tensor build.
author_ids_col, paper_id_col, references_col, time_col = settings['target_columns'][:4]

# Positions of the three tensor dimensions.
# NOTE: `references_idx` is the position of the third dimension; it is the value
# passed as `time_idx` to map_from_tensor() further down.
author_ids_idx, paper_id_idx, references_idx = settings['dimension_order'][:3]

# separators used inside the author-id and reference strings
split_authors_with = settings['split_authors_with']
split_references_with = settings['split_references_with']

### Test for valid non-zero entry positions in the tensor

To test this property of the tensor, we can create a map of all authors and the papers they cite using the DataFrame. <br>
Then we do the same for the tensor. If the two maps match, then the tensor correctly has entries for all authors citing some paper at some time. <br>
**NOTE:** This check will not validate the actual values (# of times the citation has been made).

#### Form map from DataFrame

In [8]:
from collections import defaultdict

def map_from_dataframe(df, author_ids='author_ids', paper_id='eid', references='references', time='year', split_authors_with=';', split_references_with=';'):
    """
    Build an author -> year -> {cited paper ids} map directly from the DataFrame.

    Only citations of papers that are themselves rows of ``df`` are kept.
    Authors of rows without any such citation get no entry for that row.

    Parameters
    ----------
    df : pd.DataFrame
        One row per paper.
    author_ids : str
        Column holding the paper's author ids joined by ``split_authors_with``.
    paper_id : str
        Column holding the paper's own id.
    references : str
        Column holding the cited paper ids joined by ``split_references_with``.
        Values must be strings (use "" for papers without references).
    time : str
        Column holding the publication year; values are converted with ``int()``.
    split_authors_with : str
        Separator between author ids.
    split_references_with : str
        Separator between cited paper ids.

    Returns
    -------
    dict
        ``{author_id: {year: set_of_cited_paper_ids}}`` made of plain dicts.
    """
    # paper ids found in the dataset (to exclude cited paper ids outside of the dataset)
    known_papers = set(df[paper_id].unique())

    # nested mapping that creates the inner dict / set on first access
    df_map = defaultdict(lambda: defaultdict(set))

    rows = zip(df[author_ids].to_list(), df[references].to_list(), df[time].to_list())
    for authors_str, references_str, year in rows:
        year = int(year)

        # the in-dataset citations are the same for every author of the paper,
        # so filter them once per row instead of once per author
        cited = {ref for ref in references_str.split(split_references_with) if ref in known_papers}
        if not cited:
            # never touch df_map here: indexing the defaultdict would create empty entries
            continue

        for author_id in authors_str.split(split_authors_with):
            df_map[author_id][year].update(cited)

    # convert to dict of dicts on return
    return {author_id: dict(per_year) for author_id, per_year in df_map.items()}

In [9]:
# build the reference map with the same columns / separators that were given to the tensor builder
df_map = map_from_dataframe(
    df,
    author_ids=author_ids_col,
    paper_id=paper_id_col,
    references=references_col,
    time=time_col,
    split_authors_with=split_authors_with,
    split_references_with=split_references_with,
)
dict(list(df_map.items())[:1])  # example of single entry in the map

{'3df61b32-6781-11ee-b983-4ab2673ea3f0': {2016: {'3cbe02ac-6781-11ee-b983-4ab2673ea3f0',
   '3cbe2674-6781-11ee-b983-4ab2673ea3f0',
   '3cbe2bec-6781-11ee-b983-4ab2673ea3f0',
   '3cbe2f98-6781-11ee-b983-4ab2673ea3f0',
   '3cbe41c2-6781-11ee-b983-4ab2673ea3f0',
   '3cbe5342-6781-11ee-b983-4ab2673ea3f0',
   '3cbe6206-6781-11ee-b983-4ab2673ea3f0',
   '3cbe6d64-6781-11ee-b983-4ab2673ea3f0',
   '3cbe6dd2-6781-11ee-b983-4ab2673ea3f0',
   '3cbe717e-6781-11ee-b983-4ab2673ea3f0'}}}

In [10]:
len(df_map)  # number of authors that have cited another paper in this dataset

6606

#### Form map from tensor

In [11]:
from collections import defaultdict

def map_from_tensor(X, author_ids, paper_ids, time, author_index=0, paper_index=1, time_idx=2):
    """
    Build an author -> year -> {cited paper ids} map from the non-zero entries of X.

    Parameters
    ----------
    X : object exposing ``nonzero()``
        The tensor; ``X.nonzero()`` must return one coordinate sequence per dimension.
    author_ids, paper_ids, time : sequence
        Labels looked up by position along the author, paper and time dimensions.
    author_index, paper_index, time_idx : int
        Which dimension of ``X`` holds authors, papers and time respectively.

    Returns
    -------
    dict
        ``{author_id: {year: set_of_paper_ids}}`` made of plain dicts.
    """
    # one coordinate sequence per tensor dimension, aligned entry by entry
    coords = X.nonzero()

    # nested mapping that creates the inner dict / set on first access
    X_map = defaultdict(lambda: defaultdict(set))

    entries = zip(coords[author_index], coords[paper_index], coords[time_idx])
    for author_pos, paper_pos, time_pos in entries:
        # translate the positions of this non-zero entry back into labels
        X_map[author_ids[author_pos]][time[time_pos]].add(paper_ids[paper_pos])

    # hand back plain dicts rather than defaultdicts
    return {author_id: dict(per_year) for author_id, per_year in X_map.items()}

In [12]:
# NOTE: `references_idx` holds the position of the tensor's third dimension,
# which map_from_tensor() reads as the time dimension
X_map = map_from_tensor(
    X,
    author_ids,
    paper_ids,
    years,
    author_index=author_ids_idx,
    paper_index=paper_id_idx,
    time_idx=references_idx,
)
dict(list(X_map.items())[:1])  # example of single entry in the map

{'3df61b32-6781-11ee-b983-4ab2673ea3f0': {np.int64(2016): {'3cbe02ac-6781-11ee-b983-4ab2673ea3f0',
   '3cbe2674-6781-11ee-b983-4ab2673ea3f0',
   '3cbe2bec-6781-11ee-b983-4ab2673ea3f0',
   '3cbe2f98-6781-11ee-b983-4ab2673ea3f0',
   '3cbe41c2-6781-11ee-b983-4ab2673ea3f0',
   '3cbe5342-6781-11ee-b983-4ab2673ea3f0',
   '3cbe6206-6781-11ee-b983-4ab2673ea3f0',
   '3cbe6d64-6781-11ee-b983-4ab2673ea3f0',
   '3cbe6dd2-6781-11ee-b983-4ab2673ea3f0',
   '3cbe717e-6781-11ee-b983-4ab2673ea3f0'}}}

In [13]:
len(X_map)  # number of authors with at least one non-zero entry in the tensor

6606

#### Compare

In [14]:
# make the notebook fail loudly (e.g. under Restart & Run All) if the two maps ever disagree
maps_match = df_map == X_map
assert maps_match, "author -> year -> cited-paper maps built from the DataFrame and from the tensor differ"
maps_match

True

### Test sum to check if counts are correct

#### DataFrame

In [15]:
def sum_from_dataframe(df, author_ids='author_ids', paper_id='eid', references='references', time='year', split_authors_with=';', split_references_with=';'):
    """
    Count the (author, in-dataset citation) pairs in the DataFrame.

    For every row, each author of the paper is paired with every entry of the
    reference list that is the id of a paper in ``df``. Repeated references
    within one row are counted every time they occur.

    Parameters
    ----------
    df : pd.DataFrame
        One row per paper.
    author_ids : str
        Column holding the paper's author ids joined by ``split_authors_with``.
    paper_id : str
        Column holding the paper's own id.
    references : str
        Column holding the cited paper ids joined by ``split_references_with``.
        Values must be strings (use "" for papers without references).
    time : str
        Accepted so the signature matches map_from_dataframe(); the count does
        not depend on it.
    split_authors_with : str
        Separator between author ids.
    split_references_with : str
        Separator between cited paper ids.

    Returns
    -------
    int
        Total number of (author, in-dataset citation) pairs.
    """
    # paper ids found in the dataset (to exclude cited paper ids outside of the dataset)
    known_papers = set(df[paper_id].unique())

    total = 0
    for authors_str, references_str in zip(df[author_ids].to_list(), df[references].to_list()):
        n_authors = len(authors_str.split(split_authors_with))
        n_cited = sum(1 for ref in references_str.split(split_references_with) if ref in known_papers)
        # every author of the paper is credited with every in-dataset citation
        total += n_authors * n_cited
    return total

In [16]:
# expected total, counted straight from the DataFrame
sum_from_dataframe(
    df,
    author_ids=author_ids_col,
    paper_id=paper_id_col,
    references=references_col,
    time=time_col,
    split_authors_with=split_authors_with,
    split_references_with=split_references_with,
)

51148

#### tensor

In [17]:
# the sum of all tensor entries must equal the count taken directly from the DataFrame
tensor_total = np.sum(X)
expected_total = sum_from_dataframe(df, author_ids_col, paper_id_col, references_col, time_col, split_authors_with, split_references_with)
assert tensor_total == expected_total, f"tensor sums to {tensor_total}, but the DataFrame count is {expected_total}"
tensor_total

np.float64(51148.0)