## Импорт

In [1]:
# !pip install line_profiler
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, vstack
import line_profiler

## Исходные настройки

In [2]:
# Fix the RNG seed so that "Restart Kernel -> Run All" profiles the same data every time.
seed = 42
np.random.seed(seed)

sessions_cnt = 140000   # number of random site ids generated below (length of `sites`)
session_length = 10     # how many consecutive visits are counted into one window (matrix row)
window_size = 7         # step between the starts of two consecutive windows
total_cnt = 30000       # largest site id; ids are drawn uniformly from 1..total_cnt inclusive
sites = pd.Series(np.random.randint(1, total_cnt + 1, sessions_cnt))

## Варианты реализации функций

### 1. Встроенными методами numpy
1.1 Берём срез, передаём его в np.bincount и сразу преобразуем результат в строку CSR-матрицы, после чего присоединяем её (vstack) к уже накопленной CSR-матрице.

In [3]:
def test_spsr_1(lst, session_length=10, window_size=10, total_cnt=11):
    X = None
    i = 0
    cnt = 0
    while i  < len(lst):
        j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
        arr = csr_matrix(np.bincount(lst[i: j], minlength=total_cnt + 1)[1:])
        X = vstack([X, arr]) if X is not None else arr   
        i += window_size
    return X

In [4]:
# Line-by-line timing of test_spsr_1 on the notebook-level `sites` data.
# Passing the function to the constructor registers it for profiling.
l = line_profiler.LineProfiler(test_spsr_1)
l.run(
    "test_spsr_1(sites, session_length=session_length, window_size=window_size, total_cnt=total_cnt)"
)
l.print_stats()

Timer unit: 3.11021e-07 s

Total time: 18.9843 s
File: <ipython-input-3-07959f2aa770>
Function: test_spsr_1 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def test_spsr_1(lst, session_length=10, window_size=10, total_cnt=11):
     2         1            4      4.0      0.0      X = None
     3         1            3      3.0      0.0      i = 0
     4         1            1      1.0      0.0      cnt = 0
     5     20001       377433     18.9      0.6      while i  < len(lst):
     6     20000       212120     10.6      0.3          j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
     7     20000     37811336   1890.6     61.9          arr = csr_matrix(np.bincount(lst[i: j], minlength=total_cnt + 1)[1:])
     8     20000     22567185   1128.4     37.0          X = vstack([X, arr]) if X is not None else arr   
     9     20000        70428      3.5      0.1          i += window_siz

### 2. Вручную

2.1 Вручную готовим массивы для CSR с помощью np.unique, используем для хранения промежуточных значений np.array

In [5]:
def test_spsr_2(lst, session_length=10, window_size=10, total_cnt=11):
    X = None
    i = 0
    k = 0
    data = np.array([])
    indices = np.array([])
    indptr = np.array([0])
    while i  < len(lst):
        j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
        idx, cnt = np.unique(lst[i: j], return_counts=True)
        data = np.append(data, cnt)
        indices = np.append(indices, idx)
        indptr = np.append(indptr, indptr[-1] + len(idx)) 
        i += window_size
        k += 1
    arr = csr_matrix((data, indices, indptr), shape=(k, total_cnt + 1))
    X = vstack([X, arr]) if X is not None else arr 
    return X

In [6]:
# Line-by-line timing of test_spsr_2 on the notebook-level `sites` data.
# Passing the function to the constructor registers it for profiling.
l = line_profiler.LineProfiler(test_spsr_2)
l.run(
    "test_spsr_2(sites, session_length=session_length, window_size=window_size, total_cnt=total_cnt)"
)
l.print_stats()

Timer unit: 3.11021e-07 s

Total time: 10.8846 s
File: <ipython-input-5-5097cbf6c2ee>
Function: test_spsr_2 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def test_spsr_2(lst, session_length=10, window_size=10, total_cnt=11):
     2         1            5      5.0      0.0      X = None
     3         1            2      2.0      0.0      i = 0
     4         1            1      1.0      0.0      k = 0
     5         1           39     39.0      0.0      data = np.array([])
     6         1            5      5.0      0.0      indices = np.array([])
     7         1            9      9.0      0.0      indptr = np.array([0])
     8     20001       339866     17.0      1.0      while i  < len(lst):
     9     20000       202216     10.1      0.6          j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
    10     20000     10423287    521.2     29.8          idx, cnt = np.unique(lst[i

2.2 Вручную готовим массивы для CSR с помощью np.unique, используем для хранения промежуточных значений list

In [7]:
def test_spsr_3(lst, session_length=10, window_size=10, total_cnt=11):
    X = None
    i = 0
    k = 0
    data = []
    indices = []
    indptr = [0]
    while i  < len(lst):
        j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
        idx, cnt = np.unique(lst[i: j], return_counts=True)
        data.extend(cnt)
        indices.extend(idx)
        indptr.append(indptr[-1] + len(idx)) 
        i += window_size
        k += 1
    arr = csr_matrix((data, indices, indptr), shape=(k, total_cnt + 1))
    X = vstack([X, arr]) if X is not None else arr 
    return X

In [8]:
# Line-by-line timing of test_spsr_3 on the notebook-level `sites` data.
# Passing the function to the constructor registers it for profiling.
l = line_profiler.LineProfiler(test_spsr_3)
l.run(
    "test_spsr_3(sites, session_length=session_length, window_size=window_size, total_cnt=total_cnt)"
)
l.print_stats()

Timer unit: 3.11021e-07 s

Total time: 3.30896 s
File: <ipython-input-7-9e780d4b4470>
Function: test_spsr_3 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def test_spsr_3(lst, session_length=10, window_size=10, total_cnt=11):
     2         1            4      4.0      0.0      X = None
     3         1            2      2.0      0.0      i = 0
     4         1            2      2.0      0.0      k = 0
     5         1            2      2.0      0.0      data = []
     6         1            2      2.0      0.0      indices = []
     7         1            2      2.0      0.0      indptr = [0]
     8     20001       267787     13.4      2.5      while i  < len(lst):
     9     20000       187440      9.4      1.8          j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
    10     20000      9553943    477.7     89.8          idx, cnt = np.unique(lst[i: j], return_counts=True)
    

2.3 Вручную готовим массивы для CSR с помощью pd.Series.value_counts, используем для хранения промежуточных значений list

In [9]:
def test_spsr_4(lst, session_length=10, window_size=10,total_cnt=11):
    X = None
    i = 0
    k = 0
    data = []
    indices = []
    indptr = [0]
    while i  < len(lst):
        j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
        vc = lst[i: j].value_counts()
        idx = vc.index
        cnt = vc.values
        data.extend(cnt)
        indices.extend(idx)
        indptr.append(indptr[-1] + len(idx)) 
        i += window_size
        k += 1
    arr = csr_matrix((data, indices, indptr), shape=(k, total_cnt + 1))
    X = vstack([X, arr]) if X is not None else arr 
    return X

In [10]:
# Line-by-line timing of test_spsr_4 on the notebook-level `sites` data.
# Passing the function to the constructor registers it for profiling.
l = line_profiler.LineProfiler(test_spsr_4)
l.run(
    "test_spsr_4(sites, session_length=session_length, window_size=window_size, total_cnt=total_cnt)"
)
l.print_stats()

Timer unit: 3.11021e-07 s

Total time: 13.845 s
File: <ipython-input-9-ec4a8a95350c>
Function: test_spsr_4 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def test_spsr_4(lst, session_length=10, window_size=10,total_cnt=11):
     2         1            6      6.0      0.0      X = None
     3         1            2      2.0      0.0      i = 0
     4         1            1      1.0      0.0      k = 0
     5         1            2      2.0      0.0      data = []
     6         1            2      2.0      0.0      indices = []
     7         1            2      2.0      0.0      indptr = [0]
     8     20001       285397     14.3      0.6      while i  < len(lst):
     9     20000       198833      9.9      0.4          j = (i + session_length) if i  + session_length  <= len(sites) else len(sites)
    10     20000     42818466   2140.9     96.2          vc = lst[i: j].value_counts()
    11     20000       10513