In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Classroom/Школа-ИТ DS 1, весна 2021, группа В. Иванова

/content/drive/MyDrive/Classroom/Школа-ИТ DS 1, весна 2021, группа В. Иванова


In [4]:
ls -la

total 71420
-rw------- 1 root root    50924 Apr 18 11:56 'HW2: Profile & optimise PairwiseCounter.ipynb'
-rw------- 1 root root    57695 Apr 18 15:51 'HW2: Profile & optimise PairwiseCounter_optimaze.ipynb'
-rw------- 1 root root    44663 Apr 18 15:53 'HW2: Profile & optimise PairwiseCounter_profile.ipynb'
-r-------- 1 root root 72975044 Apr  1 17:30  product_pairwise_counter.txt
drwx------ 2 root root     4096 Apr 18 09:24  [0m[01;34m__pycache__[0m/
-r-------- 1 root root      151 Mar 25 20:14 'Кирилл Ионкин - Предложения о работе с ODS.gdoc'


In [7]:
!pip install line_profiler
%load_ext line_profiler
%load_ext autoreload
%autoreload 4

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
!pip install memory_profiler
%load_ext memory_profiler

Collecting memory_profiler
  Downloading https://files.pythonhosted.org/packages/8f/fd/d92b3295657f8837e0177e7b48b32d6651436f0293af42b76d134c3bb489/memory_profiler-0.58.0.tar.gz
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-cp37-none-any.whl size=30180 sha256=d17a350dd6ae3e8ee002664790eaee4c031f5dc5aa0afb0be1470e638716de76
  Stored in directory: /root/.cache/pip/wheels/02/e4/0b/aaab481fc5dd2a4ea59e78bc7231bb6aae7635ca7ee79f8ae5
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0


Далее представлен процесс профилирования кода(без изменений) для понимания того, где именно в коде "слабости".

Блоки кода разбиты логически на файлы .py:
    
1) pairwise_counter.py

2) load_data_and_create_list_of_product_ids.py

3) find_and_write_most_to_ocurring_product_in_file.py

In [9]:
%%writefile pairwise_counter.py
from typing import Tuple, Iterable, Any, Dict, Optional, NamedTuple
 
import numpy as np
 
from scipy import sparse
 
 
# need for log(0) where pair_count = 0
# do not affect results
EPS = 1e-100
 
 
class Stats(NamedTuple):
    pair_count: float
    count_1: float
    count_2: float
    total: float
 
 
class PairwiseCounter:
 
    def __init__(
            self,
            counts_matrix: sparse.csr_matrix,
            index_mapper: Dict[Any, int],
            total_key: Any,
    ):
        """
        Class for calculating some pair statistics.
        :param counts_matrix: sparse matrix of pairs
        :param index_mapper: dict from key to index in matrix
        :param total_key: key to count size of the data by line
        (total_key, total_key, value)
        """
        self.counts_matrix = counts_matrix
        self.index_mapper = index_mapper
        self.total_key = total_key
        total_index = index_mapper[total_key]
        self.total = self.counts_matrix[total_index, total_index]

    @profile
    def get_stats(self, key_1: Any, key_2: Any) -> Optional[Stats]:
        index_1 = self.index_mapper.get(key_1)
        index_2 = self.index_mapper.get(key_2)
 
        if index_1 is None or index_2 is None:
            return None
 
        pair_count = self.counts_matrix[index_1, index_2]
        count_1 = self.counts_matrix[index_1, index_1]
        count_2 = self.counts_matrix[index_2, index_2]
 
        if not count_1 or not count_2:
            return None
 
        return Stats(
            pair_count=float(pair_count),
            count_1=float(count_1),
            count_2=float(count_2),
            total=float(self.total),
        )

    @profile
    def calculate_pmi(self, key_1: Any, key_2: Any) -> Optional[float]:
        """
        Calculates by formula: PMI
        PMI = log(p(x,y)/(p(x)p(y)))
        :param key_1: key 1
        :param key_2: key 2
        :return: weighted PMI
        """
 
        stats = self.get_stats(key_1, key_2)
        if stats is None:
            return None
        return (
            np.log(stats.pair_count + EPS)
            + np.log(stats.total)
            - np.log(stats.count_1)
            - np.log(stats.count_2)
        )
 
    def to_dict(self) -> Dict[str, Any]:
        counts_matrix_dict = dict(
            data=self.counts_matrix.data.tolist(),
            indices=self.counts_matrix.indices.tolist(),
            indptr=self.counts_matrix.indptr.tolist(),
            shape=self.counts_matrix.shape,
        )
        return dict(
            counts_matrix=counts_matrix_dict,
            index_mapper=self.index_mapper,
            total_key=self.total_key,
        )
 
    @staticmethod
    def from_dict(params_dict: Dict[str, Any]):
        counts_matrix = sparse.csr_matrix(
            (
                params_dict['counts_matrix']['data'],
                params_dict['counts_matrix']['indices'],
                params_dict['counts_matrix']['indptr'],
            ),
            shape=params_dict['counts_matrix']['shape'],
        )
        return PairwiseCounter(
            counts_matrix=counts_matrix,
            index_mapper=params_dict['index_mapper'],
            total_key=params_dict['total_key'],
        )

Overwriting pairwise_counter.py


In [10]:
%%writefile load_data_and_create_list_of_product_ids.py

import collections
import json
import typing as tp

from tqdm.auto import tqdm

from pairwise_counter import PairwiseCounter


@profile
def load_data():
    with open('product_pairwise_counter.txt', 'r', encoding='utf8') as infile:
        pairwise_counter = PairwiseCounter.from_dict(json.load(infile))
    return pairwise_counter


@profile
def create_list_of_product_ids(pairwise_counter):
    product_ids = [
        product_id 
        for product_id in pairwise_counter.index_mapper.keys() 
        if product_id != pairwise_counter.total_key
    ]
    return product_ids


if __name__ == "__main__":
    pairwise_counter = load_data()
    product_ids = create_list_of_product_ids(pairwise_counter)

Writing load_data_and_create_list_of_product_ids.py


In [15]:
%%writefile find_and_write_most_to_ocurring_product_in_file.py

import collections
import json
import typing as tp

from tqdm.auto import tqdm

from load_data_and_create_list_of_product_ids import load_data, create_list_of_product_ids
from pairwise_counter import PairwiseCounter


@profile
def find_and_write_most_to_ocurring_product_in_file(product_ids, pairwise_counter):
    MAX_TOP_CANDIDATES: int = 10
    most_co_occurring_products: tp.Dict[str, tp.List[str]] = dict()
    
    for key_1 in tqdm(product_ids[:100], desc='outer loop'):
        candidates: tp.List[tp.Tuple[str, float]] = []
        for key_2 in product_ids[:100]: 
            if key_1 == key_2:
                continue
    
            pmi = pairwise_counter.calculate_pmi(key_1, key_2)
            if pmi is None:
                continue
    
            candidates.append((key_2, pmi))
    
        top_candidates = sorted(
            candidates, 
            key=lambda p: p[1], 
            reverse=True
        )[:MAX_TOP_CANDIDATES]
        most_co_occurring_products[key_1] = [
            product_id
            for product_id, pmi in top_candidates
        ]    

    with open('most_co_occurring_products.txt', 'w') as outfile:
        json.dump(most_co_occurring_products, outfile)


if __name__ == "__main__":
    pairwise_counter = load_data()
    product_ids = create_list_of_product_ids(pairwise_counter)
    find_and_write_most_to_ocurring_product_in_file(product_ids, pairwise_counter)

Overwriting find_and_write_most_to_ocurring_product_in_file.py


In [16]:
!kernprof -l -v find_and_write_most_to_ocurring_product_in_file.py

outer loop: 100% 100/100 [00:02<00:00, 48.99it/s]
Wrote profile results to find_and_write_most_to_ocurring_product_in_file.py.lprof
Timer unit: 1e-06 s

Total time: 3.59084 s
File: /content/drive/My Drive/Classroom/Школа-ИТ DS 1, весна 2021, группа В. Иванова/load_data_and_create_list_of_product_ids.py
Function: load_data at line 11

Line #      Hits         Time  Per Hit   % Time  Line Contents
    11                                           @profile
    12                                           def load_data():
    13         1       5798.0   5798.0      0.2      with open('product_pairwise_counter.txt', 'r', encoding='utf8') as infile:
    14         1    3585042.0 3585042.0     99.8          pairwise_counter = PairwiseCounter.from_dict(json.load(infile))
    15         1          3.0      3.0      0.0      return pairwise_counter

Total time: 0.003827 s
File: /content/drive/My Drive/Classroom/Школа-ИТ DS 1, весна 2021, группа В. Иванова/load_data_and_create_list_of_product_ids.

In [17]:
!python -m memory_profiler find_and_write_most_to_ocurring_product_in_file.py

outer loop: 100% 100/100 [00:19<00:00,  5.04it/s]
Filename: /content/drive/My Drive/Classroom/Школа-ИТ DS 1, весна 2021, группа В. Иванова/pairwise_counter.py

Line #    Mem usage    Increment  Occurences   Line Contents
    41  318.258 MiB 3150752.344 MiB        9900       @profile
    42                                             def get_stats(self, key_1: Any, key_2: Any) -> Optional[Stats]:
    43  318.258 MiB    0.000 MiB        9900           index_1 = self.index_mapper.get(key_1)
    44  318.258 MiB    0.000 MiB        9900           index_2 = self.index_mapper.get(key_2)
    45                                          
    46  318.258 MiB    0.000 MiB        9900           if index_1 is None or index_2 is None:
    47                                                     return None
    48                                          
    49  318.258 MiB    0.000 MiB        9900           pair_count = self.counts_matrix[index_1, index_2]
    50  318.258 MiB    0.000 MiB        9900 

# Выводы(после профилирования полученного кода):

## line_profiler 

1) load_data(Total time: 3.59084 s) - так как не имеем права изменять методы from_dict, to_dict, то и не будем никак пытаться ускорить код данной функции.

2) create_list_of_product_ids(Total time: 0.003827s) - используется list comprehension, которые создаются довольно быстро и пытаться его улучшить не вижу смысла(хотя можно и попробовать, но есть проблемы и посерьезнее, чем даная функция. К тому же время исполнения не вилико, по сравнению с другими ф-ями.).

3) find_and_write_most_to_ocurring_product_in_file(Total time: 2.00509 s) - очевидная проблематика долгой работы данной функции кроется в ф-ии -> calculate_pmi.

4) calculate_pmi(Total time: 1.87516 s) - аналогично очевидная проблематика уже кроется в ф-ии get_stats

5) get_stats(Total time: 1.68843 s) - как видно из профайлера, проблема связана с тем, что очень много времени тратиться при обращении к конкретным элементам разреженной матрицы, поэтому это пока первостепенная проблема, с которой мы попытаемся разобраться.

## memory_profiler
1) Основные затраты памяти на создание обьекта PairwiseCounter.(318.324 Mi)