# Load Dataset

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

from TELF.pre_processing import Beaver

In [2]:
# Build the relative path to the sample CSV first, then read it.
sample_csv_path = os.path.join('..', '..', 'data', 'sample.csv')
df = pd.read_csv(sample_csv_path)
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


### Check how many total author publications are in the dataset
The participation tensor built below should contain exactly this many non-zero values (one per author–paper pair).

In [3]:
# Tally how many papers each author id appears on; author ids for a paper
# are stored as one ';'-separated string in the author_ids column.
author_pub_counts = defaultdict(int)
for joined_ids in df['author_ids'].tolist():
    for single_id in joined_ids.split(';'):
        author_pub_counts[single_id] = author_pub_counts[single_id] + 1

# Total number of (author, paper) pairs in the dataset.
sum(author_pub_counts.values())

9917

# Build Participation Tensor

In [4]:
# Create a Beaver instance with its default constructor arguments; it is used
# below to build the participation tensor.
beaver = Beaver()

In [5]:
# Keyword arguments forwarded unchanged to Beaver.participation_tensor.
settings = dict(
    dataset=df,
    # Columns used to build the tensor: authors, papers, years.
    target_columns=("author_ids", "eid", "year"),
    # NOTE(review): presumably maps target columns onto tensor axes; the
    # helper further down reads it that way -- confirm against Beaver docs.
    dimension_order=[0, 1, 2],
    # Author ids inside one row are joined with this separator.
    split_authors_with=";",
    save_path=None,
    verbose=True,
    n_jobs=1,
    n_nodes=1,
)

# The call returns the tensor together with the labels for each of its axes.
X, author_ids, paper_ids, years = beaver.participation_tensor(**settings)

preparing dictionaries for communication


100%|██████████| 1/1 [00:00<00:00, 230.25it/s]
100%|██████████| 9917/9917.0 [00:00<00:00, 1634891.63it/s]


In [6]:
# Display the tensor's summary (format, shape, nnz, density) as the cell output.
X

0,1
Format,coo
Data Type,float64
Shape,"(7054, 940, 29)"
nnz,9917
Density,5.15725975968636e-05
Read-only,True
Size,309.9K
Storage ratio,0.00


In [7]:
def inz_paper_author(X, i, author_ids, paper_ids, dimension_order=(0, 1, 2)):
    """Return the (author, paper) labels of the i-th non-zero entry in X.

    Parameters
    ----------
    X : object exposing ``nonzero()``
        Tensor whose ``nonzero()`` returns one index array per axis.
    i : int
        Position of the entry within the index arrays from ``X.nonzero()``.
    author_ids : indexable
        Labels for the author axis, looked up by the entry's author index.
    paper_ids : indexable
        Labels for the paper axis, looked up by the entry's paper index.
    dimension_order : sequence of int, optional
        The position of ``0`` in this sequence is used as the author axis of
        X and the position of ``1`` as the paper axis. Any sequence works
        (list, tuple, numpy array). Defaults to ``(0, 1, 2)``.

    Returns
    -------
    tuple
        ``(author, paper)`` -- the author label first, the paper label second.

    Raises
    ------
    ValueError
        If ``dimension_order`` does not contain both 0 and 1.
    IndexError
        If ``i`` is outside the range of non-zero entries.
    """
    # Copy into a list so tuples and numpy arrays (which lack a usable
    # ``.index`` method) are accepted, and the default stays immutable.
    order = list(dimension_order)
    author_axis = order.index(0)
    paper_axis = order.index(1)

    nz = X.nonzero()
    i_author = nz[author_axis][i]
    i_paper = nz[paper_axis][i]
    return (author_ids[i_author], paper_ids[i_paper])

In [8]:
# Look up the labels of the first non-zero entry; the helper returns the
# author first and the paper second.
first_entry = inz_paper_author(
    X,
    0,
    author_ids,
    paper_ids,
    dimension_order=settings['dimension_order'],
)
author_id, eid = first_entry
(eid, author_id)

('3cbdf82a-6781-11ee-b983-4ab2673ea3f0',
 '3df61b32-6781-11ee-b983-4ab2673ea3f0')

### Paper matching eid

In [9]:
# Rows of the dataset whose eid equals the one taken from the tensor entry.
df[df['eid'] == eid]

Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract
0,3cbdf82a-6781-11ee-b983-4ab2673ea3f0,Paper Title,2016,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name,3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-...,3cbe2bec-6781-11ee-b983-4ab2673ea3f0;3cbe6d64-...,supervisory control acquisition system often s...


### Papers published by author_id

In [10]:
# Select papers whose ';'-separated author list contains author_id as a whole
# id. An exact membership test avoids the regex/substring matching of
# str.contains, which would also match ids that merely contain author_id.
author_mask = [str(author_id) in joined_ids.split(';') for joined_ids in df.author_ids]
df.loc[author_mask]

Unnamed: 0,eid,title,year,abstract,authors,author_ids,references,clean_abstract
0,3cbdf82a-6781-11ee-b983-4ab2673ea3f0,Paper Title,2016,Supervisory Control and Data Acquisition (SCAD...,Name;Name;Name;Name;Name,3df61b32-6781-11ee-b983-4ab2673ea3f0;3df61c18-...,3cbe2bec-6781-11ee-b983-4ab2673ea3f0;3cbe6d64-...,supervisory control acquisition system often s...


In [11]:
# Check that the entry in X is valid: the paper with this eid must list
# author_id among its authors. The ids are split and compared exactly, since
# str.contains does a regex substring match and can accept a different id.
papers_with_eid = df.loc[df.eid == eid]
assert any(
    str(author_id) in joined_ids.split(';')
    for joined_ids in papers_with_eid.author_ids
), 'Invalid entry in X'

### Repeat the check for every non-zero entry in X

In [12]:
dimension_order = settings['dimension_order']
separator = settings['split_authors_with']

# Collect every (author id, eid) pair in the dataframe once. Each tensor entry
# is then validated by exact set membership, instead of a regex substring scan
# of the whole dataframe per entry (slower, and it can match a different id).
valid_pairs = {
    (single_id, paper_eid)
    for paper_eid, joined_ids in zip(df.eid, df.author_ids)
    for single_id in joined_ids.split(separator)
}

for i in tqdm(range(X.nnz)):
    author_id, eid = inz_paper_author(X, i, author_ids, paper_ids, dimension_order)
    assert (str(author_id), eid) in valid_pairs, f'Invalid entry #{i} in X'

100%|██████████| 9917/9917 [00:05<00:00, 1974.08it/s]
