# Load Dataset

In [1]:
import pandas as pd

# Load the sample corpus from a path relative to this notebook's directory.
df = pd.read_csv(filepath_or_buffer="../../data/sample.csv")
# Print the per-column dtype / non-null summary (info() prints; it returns None).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


# Build Document-Word (TF-IDF) Matrix

In [2]:
from TELF.pre_processing import Beaver

# Create one Beaver instance with its default constructor arguments; the
# matrix-building cells below call documents_words / cooccurrence_matrix on it.
beaver = Beaver()

In [3]:
# Keyword arguments for Beaver.documents_words, collected in one mapping so the
# configuration can be inspected before the call.
#   dataset / target_column: the DataFrame and the text column to vectorize.
#   options: min_df / max_df thresholds — presumably forwarded to the underlying
#            vectorizer (scikit-learn style naming); TODO confirm in Beaver docs.
#   matrix_type: "tfidf" selects the weighting scheme.
#   save_path: None — presumably nothing is written to disk; confirm.
settings = dict(
    dataset=df,
    target_column="clean_abstract",
    options=dict(min_df=5, max_df=0.5),
    matrix_type="tfidf",
    save_path=None,
)

# Returns the matrix and the vocabulary that labels its word axis.
X, vocabulary = beaver.documents_words(**settings)

In [4]:
# Shape of the matrix returned by documents_words; the recorded output reads as
# (rows of df, length of vocabulary).
X.shape

(940, 467)

In [5]:
# Number of terms in the vocabulary returned by documents_words.
len(vocabulary)

467

In [6]:
# Peek at the first ten vocabulary terms instead of dumping the whole array.
vocabulary[:10]

array(['128pb', '2018', '32x', 'ability', 'abstain', 'accelerate',
       'accuracy', 'accurate', 'acquisition', 'activity'], dtype=object)

# Co-Occurrence / SPPMI

In [7]:
# Keyword arguments for Beaver.cooccurrence_matrix.
# NOTE: this rebinds `settings`, replacing the documents_words configuration
# defined in the earlier cell.
#   cooccurrence_settings: two parallel jobs, a window of 100, and the
#       vocabulary produced by documents_words (so the matrix axes follow it).
#   sppmi_settings: left empty — presumably the library defaults apply; confirm.
#   save_path: None — presumably nothing is written to disk; confirm.
settings = dict(
    dataset=df,
    target_column="clean_abstract",
    cooccurrence_settings=dict(
        n_jobs=2,
        window_size=100,
        vocabulary=vocabulary,
    ),
    sppmi_settings=dict(),
    save_path=None,
)

# Returns the co-occurrence matrix and the SPPMI matrix, in that order.
CO_OCCURRENCE, SPPMI = beaver.cooccurrence_matrix(**settings)

[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.
100%|██████████| 470/470 [00:01<00:00, 267.04it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.5s finished


  0%|                                                     | 0/2 [00:00<?, ?it/s]
100%|█████████████████████████████████| 25933/25933 [00:00<00:00, 832702.15it/s]
  0%|                                                     | 0/1 [00:00<?, ?it/s]
100%|████████████████████████████████| 25933/25933 [00:00<00:00, 1016578.84it/s]
0it [00:00, ?it/s]

Building sparse matrix from COO matrix...





In [8]:
# Show the sparse-matrix summary (shape, dtype, stored elements, storage format).
CO_OCCURRENCE

<467x467 sparse matrix of type '<class 'numpy.float32'>'
	with 51721 stored elements in Compressed Sparse Row format>

In [9]:
# Explicitly stored values of the sparse matrix (numpy elides the middle).
CO_OCCURRENCE.data

array([101., 101., 303., ...,  90.,  90.,  90.], dtype=float32)

In [10]:
# Show the sparse-matrix summary for the SPPMI result.
# NOTE(review): the recorded output reports 0 stored elements, i.e. an empty
# matrix, while CO_OCCURRENCE has 51721 — confirm whether this is expected for
# window_size=100 with empty sppmi_settings before using SPPMI downstream.
SPPMI

<467x467 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>