# Load Dataset

In [1]:
import pandas as pd

# Read the sample corpus (path is relative to this notebook) into a DataFrame
# and print its column / non-null / dtype summary. `df` is the dataset handed
# to the Beaver calls in the cells below.
df = pd.read_csv("../../data/sample.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


# Build Matrix

In [2]:
from TELF.pre_processing import Beaver

# Create a Beaver instance with default arguments; the same instance is reused
# for the vocabulary and documents-words cells below.
beaver = Beaver()

First, let's get the vocabulary for our documents-words matrix. **This step is optional!**

In [3]:
settings = {
    "dataset":df,
    "target_column":"clean_abstract",
    "min_df":10,
    "max_df":0.5,
}

%time vocabulary = beaver.get_vocabulary(**settings)
len(vocabulary)

CPU times: user 34.5 ms, sys: 2.2 ms, total: 36.7 ms
Wall time: 35.6 ms


467

In [4]:
# Keyword arguments for building the documents-words matrix from the
# `clean_abstract` column, restricted to the `vocabulary` computed above.
# NOTE(review): min_df is 5 here but 10 was used when building `vocabulary`;
# confirm which value takes effect when a fixed vocabulary is supplied.
# NOTE(review): presumably the `highlighting` terms are up-weighted by
# `weights` and the result is written under `save_path` -- confirm in TELF docs.
settings = dict(
    dataset=df,
    target_column="clean_abstract",
    options=dict(min_df=5, max_df=0.5, vocabulary=vocabulary),
    matrix_type="tfidf",
    highlighting=["aberration", "ability", "ablation", "ablator", "able"],
    weights=2,
    save_path="./",
)

beaver.documents_words(**settings)



(<940x471 sparse matrix of type '<class 'numpy.float32'>'
 	with 72655 stored elements in Compressed Sparse Row format>,
 array(['128pb', '2018', '32x', 'aberration', 'ability', 'ablation',
        'ablator', 'able', 'abstain', 'accelerate', 'accuracy', 'accurate',
        'acquisition', 'activity', 'addition', 'address', 'adoption',
        'aggregate', 'aid', 'alamos', 'allow', 'alternate', 'analogous',
        'analysis', 'analyst', 'analytic', 'anomaly', 'apply', 'approach',
        'approximately', 'archive', 'art', 'arxiv', 'associate',
        'asynchronously', 'attack', 'author', 'automatic', 'available',
        'base', 'baseline', 'batch', 'behavior', 'belong', 'benchmark',
        'benign', 'bias', 'block', 'bottleneck', 'break', 'bronze',
        'build', 'bulk', 'call', 'cancer', 'canonical', 'capability',
        'catalog', 'category', 'cause', 'central', 'certain', 'challenge',
        'characterization', 'citation', 'class', 'classification',
        'classify', 'client

In [6]:
import scipy.sparse as ss

# Load the saved matrix from the current directory (presumably the file written
# by `beaver.documents_words` via save_path="./" -- confirm). It loads as a CSR
# (Compressed Sparse Row) matrix, not COO, as the displayed repr shows.
X_csr_sparse = ss.load_npz("documents_words.npz")
X_csr_sparse

<940x470 sparse matrix of type '<class 'numpy.float32'>'
	with 72279 stored elements in Compressed Sparse Row format>

In [7]:
def chop_chop_rows(X, n=1):
    """Split ``X`` row-wise into ``n`` consecutive chunks.

    Every chunk except the last holds ``X.shape[0] // n`` rows; the last chunk
    additionally absorbs the remainder, so concatenating the chunks in order
    reproduces ``X``. When ``n`` exceeds the number of rows the leading chunks
    are empty and the last chunk holds every row.

    Parameters
    ----------
    X : object exposing ``shape[0]`` and row slicing ``X[start:end]``
        For example a NumPy array or a SciPy CSR matrix.
    n : int, default 1
        Number of chunks to produce. Must be at least 1.

    Returns
    -------
    list
        ``n`` row slices of ``X``, in order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (n == 0)
    # or a silently empty result (n < 0).
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    m_rows = X.shape[0]
    # Floor division keeps the arithmetic in exact integers (no float round-trip).
    chunk_size = m_rows // n

    # Start offset of every chunk, followed by the end of the final chunk.
    bounds = [idx * chunk_size for idx in range(n)] + [m_rows]
    return [X[start:end] for start, end in zip(bounds, bounds[1:])]

In [8]:
# Split the loaded matrix into 4 row-wise chunks and display them.
chop_chop_rows(X_csr_sparse, n=4)

[<235x470 sparse matrix of type '<class 'numpy.float32'>'
 	with 17780 stored elements in Compressed Sparse Row format>,
 <235x470 sparse matrix of type '<class 'numpy.float32'>'
 	with 18674 stored elements in Compressed Sparse Row format>,
 <235x470 sparse matrix of type '<class 'numpy.float32'>'
 	with 17642 stored elements in Compressed Sparse Row format>,
 <235x470 sparse matrix of type '<class 'numpy.float32'>'
 	with 18183 stored elements in Compressed Sparse Row format>]