# Load Dataset

In [1]:
import pandas as pd
import os 
# Read the sample CSV; the path is relative to the current working directory
# (two levels up, then into data/).
df = pd.read_csv(os.path.join("..", "..", "data", "sample.csv"))
# Print the column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


# Build Matrix

In [2]:
from TELF.pre_processing import Beaver

# Create a Beaver instance with its default constructor arguments; the cells
# below call its get_vocabulary() and documents_words() methods.
beaver = Beaver()

First, let's extract the vocabulary from the documents; it will be used to build the documents-words matrix. **This step is optional!**

In [3]:
settings = {
    "dataset":df,
    "target_column":"clean_abstract",
    "min_df":10,
    "max_df":0.5,
}

%time vocabulary = beaver.get_vocabulary(**settings)
len(vocabulary)

CPU times: user 31.1 ms, sys: 2.51 ms, total: 33.6 ms
Wall time: 33.5 ms


467

In [4]:
# Keyword arguments forwarded to Beaver.documents_words().
# NOTE(review): min_df is 5 here but 10 in the get_vocabulary() cell. If the
# options are handed to a scikit-learn vectorizer, min_df / max_df are ignored
# whenever an explicit vocabulary is supplied — confirm that this mismatch is
# harmless.
settings = dict(
    dataset=df,
    target_column="clean_abstract",
    options=dict(min_df=5, max_df=0.5, vocabulary=vocabulary),
    matrix_type="tfidf",
    highlighting=['aberration', 'ability', 'ablation', 'ablator', 'able'],
    weights=2,
    # Output directory; documents_words.npz is read back from here later on.
    save_path=os.path.join(".", "results"),
)

# Rebinds `vocabulary` to the vocabulary returned together with the matrix, so
# the name no longer refers to the get_vocabulary() result after this cell.
X, vocabulary = beaver.documents_words(**settings)



In [5]:
# Bare expression: display the matrix repr (format, dtype, stored elements, shape).
X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 72655 stored elements and shape (940, 471)>

In [6]:
import scipy.sparse as ss

# Reload documents_words.npz from ./results (the save_path used above).
# load_npz returns the matrix in the sparse format it was saved in; as the
# repr below shows, that is Compressed Sparse Row (CSR), not COO.
X_csr_sparse = ss.load_npz(os.path.join(".", "results", "documents_words.npz"))
X_csr_sparse

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 72655 stored elements and shape (940, 471)>

In [7]:
def chop_chop_rows(X, n=1):
    """Split ``X`` row-wise into ``n`` consecutive chunks.

    The first ``n - 1`` chunks each hold ``X.shape[0] // n`` rows; the last
    chunk additionally absorbs the remainder, so no row is dropped. When
    ``n`` exceeds the number of rows, the leading chunks are empty and the
    last chunk holds every row.

    Parameters
    ----------
    X : array-like
        Any object exposing ``shape[0]`` and supporting row slicing
        (``X[start:end]``), e.g. a NumPy array or a SciPy CSR matrix.
    n : int, default 1
        Number of chunks to produce. Must be at least 1.

    Returns
    -------
    list
        ``n`` row slices of ``X``, in original row order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # Fail loudly instead of raising an opaque ZeroDivisionError (n == 0) or
    # silently returning an empty list (n < 0).
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    m_rows = X.shape[0]
    # Floor division keeps the arithmetic in exact integers; int(m_rows / n)
    # goes through a float and can lose precision for very large row counts.
    chunk_size = m_rows // n

    # Chunk start offsets, followed by m_rows as the final boundary so the
    # last chunk picks up any remainder rows.
    bounds = [idx * chunk_size for idx in range(n)] + [m_rows]
    return [X[start:end] for start, end in zip(bounds[:-1], bounds[1:])]

In [8]:
# Split the reloaded matrix into 4 row chunks; its 940 rows divide evenly,
# so every chunk has 235 rows.
chop_chop_rows(X_csr_sparse, n=4)

[<Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 17870 stored elements and shape (235, 471)>,
 <Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 18784 stored elements and shape (235, 471)>,
 <Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 17738 stored elements and shape (235, 471)>,
 <Compressed Sparse Row sparse matrix of dtype 'float32'
 	with 18263 stored elements and shape (235, 471)>]