### Лекция 8. Техники оптимизации на примерах.

План: закидывать примеры в quick-bench.com и объяснять, что происходит.

<br />

##### quick-bench.com

Открыть, показать интерфейс, показать, как пользоваться, рассказать про google benchmark и как он встроен в quick-bench.com, методику измерения:

http://quick-bench.com

https://github.com/google/benchmark

In [None]:
// Benchmark: constructs a std::string from the literal "hello" on every iteration.
static void StringCreation(benchmark::State& state) {
  // Code inside this loop is measured repeatedly
  for (auto _ : state) {
    std::string created_string("hello");
    // Make sure the variable is not optimized away by compiler
    benchmark::DoNotOptimize(created_string);
  }
}
// Register the function as a benchmark
BENCHMARK(StringCreation);

// Benchmark: copy-constructs a std::string from an existing one on every iteration.
static void StringCopy(benchmark::State& state) {
  // Code before the loop is not measured
  std::string x = "hello";
  for (auto _ : state) {
    std::string copy(x);
    // Make sure the variable is not optimized away by compiler
    benchmark::DoNotOptimize(copy);
  }
}
// Register the function as a benchmark
BENCHMARK(StringCopy);

<br />

##### std::vector::push_back

In [None]:
#include <vector>

// Number of elements each filler in this cell appends.
static const int n = 50000;

// Appends 0..n-1 to an initially empty vector without reserving capacity,
// so the vector has to grow as elements are pushed.
static std::vector<int> fill_vec()
{
    std::vector<int> rv;
    for (int i = 0; i < n; ++i)
        rv.push_back(i);
    return rv;
}

// Same as fill_vec(), but requests capacity for n elements before the loop.
static std::vector<int> fill_vec_with_reserve()
{
    std::vector<int> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv.push_back(i);
    return rv;
}

// Benchmark driver: times one fill_vec() call per iteration.
// DoNotOptimize keeps the returned vector from being optimized away.
static void BMFillVec(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec());
}
BENCHMARK(BMFillVec);

// Benchmark driver: times one fill_vec_with_reserve() call per iteration.
static void BMFillVecWithReserve(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec_with_reserve());
}
BENCHMARK(BMFillVecWithReserve);

<br />

##### Два разных способа заполнить std::vector

Пример 1:

In [None]:
#include <vector>

// Number of ints each filler in this cell produces.
static const int n = 50000;

// Creates a vector of n zeros, then overwrites element i with i.
// Every element is therefore written twice: once at construction, once in the loop.
static std::vector<int> fill_vec_with_assign()
{
    std::vector<int> rv(n, 0);
    for (int i = 0; i < n; ++i)
        rv[i] = i;
    return rv;
}

// Reserves capacity for n elements, then appends 0..n-1 with push_back.
static std::vector<int> fill_vec_with_reserve()
{
    std::vector<int> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv.push_back(i);
    return rv;
}

// Benchmark driver: times one fill_vec_with_assign() call per iteration.
static void BMFillVecWithAssign(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec_with_assign());
}
BENCHMARK(BMFillVecWithAssign);

// Benchmark driver: times one fill_vec_with_reserve() call per iteration.
static void BMFillVecWithReserve(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec_with_reserve());
}
BENCHMARK(BMFillVecWithReserve);

Пример 2:

In [None]:
#include <string>
#include <vector>

// Number of strings each filler in this cell produces.
static const int n = 50000;

// Creates n default-constructed strings, then assigns the same 19-character
// literal to each one.
static std::vector<std::string> fill_vec_with_assign()
{
    std::vector<std::string> rv(n);
    for (int i = 0; i < n; ++i)
        rv[i] = "1234567890123456790";
    return rv;
}

// Reserves capacity for n strings, then constructs each one in place from the
// same 19-character literal via emplace_back.
static std::vector<std::string> fill_vec_with_reserve()
{
    std::vector<std::string> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv.emplace_back("1234567890123456790");
    return rv;
}

// Benchmark driver: times one fill_vec_with_assign() call per iteration.
static void BMFillVecWithAssign(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec_with_assign());
}
BENCHMARK(BMFillVecWithAssign);

// Benchmark driver: times one fill_vec_with_reserve() call per iteration.
static void BMFillVecWithReserve(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec_with_reserve());
}
BENCHMARK(BMFillVecWithReserve);

<br />

##### где push_back быстрее?

In [None]:
#include <deque>
#include <list>
#include <vector>

// Number of elements pushed into each container in this cell.
static const int n = 10000;

// Appends 0..n-1 to a std::list with push_back.
static std::list<int> fill_list()
{
    std::list<int> rv;
    for (int i = 0; i < n; ++i)
        rv.push_back(i);
    return rv;
}

// Appends 0..n-1 to a std::vector with push_back (no reserve).
static std::vector<int> fill_vec()
{
    std::vector<int> rv;
    for (int i = 0; i < n; ++i)
        rv.push_back(i);
    return rv;
}

// Appends 0..n-1 to a std::deque with push_back.
static std::deque<int> fill_deq()
{
    std::deque<int> rv;
    for (int i = 0; i < n; ++i)
        rv.push_back(i);
    return rv;
}

// Benchmark driver: times one fill_list() call per iteration.
static void BMFillList(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_list());
}
BENCHMARK(BMFillList);

// Benchmark driver: times one fill_vec() call per iteration.
static void BMFillVec(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_vec());
}
BENCHMARK(BMFillVec);

// Benchmark driver: times one fill_deq() call per iteration.
static void BMFillDeque(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_deq());
}
BENCHMARK(BMFillDeque);

<br />

##### Инициализация отображения int -> int

In [None]:
#include <map>
#include <unordered_map>

// Number of key/value pairs inserted into each map in this cell.
static const int n = 50000;

// Inserts the pairs i -> i for i in [0, n) into a std::map via operator[].
static std::map<int, int> fill_map()
{
    std::map<int, int> rv;
    for (int i = 0; i < n; ++i)
        rv[i] = i;
    return rv;
}

// Inserts the pairs i -> i for i in [0, n) into a std::unordered_map via
// operator[], without reserving buckets first.
static std::unordered_map<int, int> fill_unordered_map()
{
    std::unordered_map<int, int> rv;
    for (int i = 0; i < n; ++i)
        rv[i] = i;
    return rv;
}

// Same as fill_unordered_map(), but calls reserve(n) before inserting.
static std::unordered_map<int, int> fill_unordered_map_reserve()
{
    std::unordered_map<int, int> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv[i] = i;
    return rv;
}

// Benchmark driver: times one fill_map() call per iteration.
static void BMFillMap(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_map());
}
BENCHMARK(BMFillMap);

// Benchmark driver: times one fill_unordered_map() call per iteration.
static void BMFillUnorderedMap(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_unordered_map());
}
BENCHMARK(BMFillUnorderedMap);

// Benchmark driver: times one fill_unordered_map_reserve() call per iteration.
static void BMFillUnorderedMapReserve(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_unordered_map_reserve());
}
BENCHMARK(BMFillUnorderedMapReserve);

<br />

##### Поиск элемента в отображении

In [None]:
#include <map>
#include <unordered_map>

// Number of keys stored in each map and looked up per benchmark iteration.
static const int n = 50000;

// Setup helper: builds the ordered map {0 -> 0, ..., n-1 -> n-1} that the
// lookup benchmark searches. Called outside the timed loop.
static std::map<int, int> fill_map()
{
    std::map<int, int> result;
    int key = 0;
    while (key < n)
    {
        // Keys are unique, so emplace always inserts.
        result.emplace(key, key);
        ++key;
    }
    return result;
}

// Setup helper: builds the hash map {0 -> 0, ..., n-1 -> n-1} that the lookup
// benchmark searches. Called outside the timed loop.
static std::unordered_map<int, int> fill_unordered_map()
{
    std::unordered_map<int, int> result;
    result.reserve(n);
    int key = 0;
    while (key < n)
    {
        // Keys are unique, so emplace always inserts.
        result.emplace(key, key);
        ++key;
    }
    return result;
}


// Benchmark driver: each iteration looks up every key from n-1 down to 0 in a
// std::map. The map is built once, before the timed loop.
static void BMAccessMap(benchmark::State& state) {
    const auto x = fill_map();
    for (auto _ : state) {
        for (int i = n - 1; i >= 0; --i)
            benchmark::DoNotOptimize(x.find(i));
    }
}
BENCHMARK(BMAccessMap);

// Benchmark driver: each iteration looks up every key from n-1 down to 0 in a
// std::unordered_map. The map is built once, before the timed loop.
static void BMAccessUnorderedMap(benchmark::State& state) {
    const auto x = fill_unordered_map();
    for (auto _ : state) {
        for (int i = n - 1; i >= 0; --i)
            benchmark::DoNotOptimize(x.find(i));
    }
}
BENCHMARK(BMAccessUnorderedMap);

Повторить тот же самый пример с n = 10

А потом вот этот пример:

In [None]:
#include <algorithm>
#include <cstdlib>
#include <map>
#include <unordered_map>

// Number of keys stored in each container and looked up per benchmark iteration.
static const int n = 20;

// Setup helper: builds the ordered map {0 -> 0, ..., n-1 -> n-1}.
// Called outside the timed loop.
static std::map<int, int> fill_map()
{
    std::map<int, int> result;
    int key = 0;
    while (key < n)
    {
        // Keys are unique, so emplace always inserts.
        result.emplace(key, key);
        ++key;
    }
    return result;
}

// Setup helper: builds the hash map {0 -> 0, ..., n-1 -> n-1}.
// Called outside the timed loop.
static std::unordered_map<int, int> fill_unordered_map()
{
    std::unordered_map<int, int> result;
    result.reserve(n);
    int key = 0;
    while (key < n)
    {
        // Keys are unique, so emplace always inserts.
        result.emplace(key, key);
        ++key;
    }
    return result;
}

// Setup helper: builds a vector of (key, value) pairs (0,0), (1,1), ..., (n-1,n-1),
// sorted by key, used as a flat "map". Called outside the timed loop.
static std::vector<std::pair<int, int>> fill_vector()
{
    std::vector<std::pair<int, int>> entries;
    entries.reserve(n);
    for (int key = 0; key < n; ++key)
        entries.emplace_back(key, key);
    return entries;
}

// Linear search in a vector used as a map.
// Returns a pointer to the value of the first pair whose key equals `key`,
// or nullptr when no pair matches. noinline keeps the call visible to the benchmark.
static __attribute__((noinline))
const int* find_value(const std::vector<std::pair<int, int>>& map_as_vector, const int key)
{
    const auto last = map_as_vector.end();
    for (auto it = map_as_vector.begin(); it != last; ++it)
    {
        if (it->first == key)
            return &it->second;
    }
    return nullptr;
}

// Comparator for std::bsearch over an array of std::pair<int, int>.
// Orders two pairs by their `first` member only.
// Returns a negative value, zero, or a positive value when *a is less than,
// equal to, or greater than *b respectively.
static int compare_int_pairs_by_first(const void * a, const void * b)
{
  const auto* const l = static_cast<const std::pair<int, int>*>(a);
  const auto* const r = static_cast<const std::pair<int, int>*>(b);
  // Compare instead of subtracting: `l->first - r->first` overflows (undefined
  // behaviour, wrong sign) when the keys are far apart, e.g. INT_MIN vs INT_MAX.
  return (l->first > r->first) - (l->first < r->first);
}

// Binary search in a vector used as a map, via the C library's std::bsearch.
// Returns a pointer to the value of a pair whose key equals `key`, or nullptr
// when bsearch finds none. bsearch requires the vector to be ordered by key
// according to compare_int_pairs_by_first.
static __attribute__((noinline))
const int* bin_find_value(const std::vector<std::pair<int, int>>& map_as_vector, const int key)
{
    // Only `first` takes part in the comparison; the 0 is a placeholder value.
    const std::pair<int, int> search_target(key, 0);
    auto* pItem = (const std::pair<int, int>*) std::bsearch(
        &search_target,
        map_as_vector.data(),
        map_as_vector.size(),
        sizeof(std::pair<int, int>),
        compare_int_pairs_by_first);
    if (!pItem)
        return nullptr;

    return &(pItem->second);
}

// Benchmark driver: each iteration looks up every key from n-1 down to 0 in a
// std::map built before the timed loop.
static void BMAccessMap(benchmark::State& state) {
    const auto x = fill_map();
    for (auto _ : state) {
        for (int i = n - 1; i >= 0; --i)
            benchmark::DoNotOptimize(x.find(i));
    }
}
BENCHMARK(BMAccessMap);

// Benchmark driver: each iteration looks up every key from n-1 down to 0 in a
// std::unordered_map built before the timed loop.
static void BMAccessUnorderedMap(benchmark::State& state) {
    const auto x = fill_unordered_map();
    for (auto _ : state) {
        for (int i = n - 1; i >= 0; --i)
            benchmark::DoNotOptimize(x.find(i));
    }
}
BENCHMARK(BMAccessUnorderedMap);

// Benchmark driver: each iteration looks up every key from n-1 down to 0 with
// the linear find_value() over a vector of pairs built before the timed loop.
static void BMAccessVector(benchmark::State& state) {
    const auto x = fill_vector();
    for (auto _ : state) {
        for (int i = n - 1; i >= 0; --i)
            benchmark::DoNotOptimize(find_value(x, i));
    }
}
BENCHMARK(BMAccessVector);

// Benchmark driver: each iteration looks up every key from n-1 down to 0 with
// bin_find_value() over a vector of pairs built before the timed loop.
static void BMAccessVectorBinSearch(benchmark::State& state) {
    const auto x = fill_vector();
    for (auto _ : state) {
        for (int i = n - 1; i >= 0; --i)
            benchmark::DoNotOptimize(bin_find_value(x, i));
    }
}
BENCHMARK(BMAccessVectorBinSearch);

Показать его на libstdc++ и на libc++; затем установить n значительно выше и снова показать на libstdc++ и libc++

<br />

##### Стоимость умных указателей

In [None]:
#include <memory>
#include <vector>

// Number of smart pointers each filler in this cell creates.
static const int n = 50000;

// Creates n shared_ptr<int> by wrapping a separately `new`-ed int each time.
// The explicit `new` is what distinguishes this variant from fill_make_shared().
static std::vector<std::shared_ptr<int>> fill_shared_ptr()
{
    std::vector<std::shared_ptr<int>> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv.push_back(std::shared_ptr<int>(new int(i)));
    return rv;
}

// Creates n shared_ptr<int> through std::make_shared.
static std::vector<std::shared_ptr<int>> fill_make_shared()
{
    std::vector<std::shared_ptr<int>> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv.push_back(std::make_shared<int>(i));
    return rv;
}

// Creates n unique_ptr<int> through std::make_unique.
static std::vector<std::unique_ptr<int>> fill_make_unique()
{
    std::vector<std::unique_ptr<int>> rv;
    rv.reserve(n);
    for (int i = 0; i < n; ++i)
        rv.push_back(std::make_unique<int>(i));
    return rv;
}

// Benchmark driver: times one fill_shared_ptr() call per iteration.
// DoNotOptimize keeps the returned vector from being optimized away.
static void BMFillSharedPtr(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_shared_ptr());
}
BENCHMARK(BMFillSharedPtr);

// Benchmark driver: times one fill_make_shared() call per iteration.
// DoNotOptimize keeps the returned vector from being optimized away.
static void BMFillMakeShared(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_make_shared());
}
BENCHMARK(BMFillMakeShared);

// Benchmark driver: times one fill_make_unique() call per iteration.
// DoNotOptimize keeps the returned vector from being optimized away.
static void BMFillVecMakeUnique(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(fill_make_unique());
}
BENCHMARK(BMFillVecMakeUnique);

<br />

##### Работа со строками на примере join

In [None]:
#include <string>
#include <vector>

// Input lines concatenated by every join variant in this cell.
static const std::vector<std::string> strings {
    "Yellow Submarine\n",
    "\n",
    "In the town where I was born\n",
    "Lived a man who sailed to sea\n",
    "And he told us of his life\n",
    "In the land of submarines\n",
    "So we sailed up to the sun\n",
    "Till we found the sea of green\n",
    "And we lived beneath the waves\n",
    "In our yellow submarine\n"
};

// Join variant 1: takes each line BY VALUE (a copy per element) and rebuilds
// the result with `rv = rv + line`. The by-value loop variable is the only
// difference from join2().
static std::string join1()
{
    std::string rv;
    for (const auto line: strings)
        rv = rv + line;
    return rv;
}

// Join variant 2: takes each line by const reference, but still rebuilds the
// result with `rv = rv + line`, which creates a temporary string each step.
static std::string join2()
{
    std::string rv;
    for (const auto& line: strings)
        rv = rv + line;
    return rv;
}

// Join variant 3: appends each line in place with operator+=.
static std::string join3()
{
    std::string rv;
    for (const auto& line: strings)
        rv += line;
    return rv;
}

// Join variant 4: first sums the lengths of all lines, reserves that much
// capacity, then appends each line in place with operator+=.
static std::string join4()
{
    size_t total_size = 0;
    for (const auto& line: strings)
        total_size += line.size();

    std::string rv;
    rv.reserve(total_size);
    for (const auto& line: strings)
        rv += line;
    return rv;
}

// Benchmark driver: times one join1() call per iteration.
static void BMJoin1(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(join1());
}
BENCHMARK(BMJoin1);

// Benchmark driver: times one join2() call per iteration.
static void BMJoin2(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(join2());
}
BENCHMARK(BMJoin2);

// Benchmark driver: times one join3() call per iteration.
static void BMJoin3(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(join3());
}
BENCHMARK(BMJoin3);

// Benchmark driver: times one join4() call per iteration.
static void BMJoin4(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(join4());
}
BENCHMARK(BMJoin4);

<br />

##### конвертация числа в строку

In [None]:
#include <charconv>
#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>

// Value converted to a string by every benchmark in this cell (11 decimal digits).
const uint64_t x = 12345678900ul;

// Conversion variant 1: formats `val` through a std::ostringstream.
// noinline keeps the call visible to the benchmark.
static __attribute__((noinline)) std::string to_string1(uint64_t val)
{
    std::ostringstream oss;
    oss << val;
    return oss.str();
}

// Conversion variant 2: delegates to std::to_string.
static __attribute__((noinline)) std::string to_string2(uint64_t val)
{
    return std::to_string(val);  // C++11
}

// Conversion variant 3: formats `val` with C's sprintf into a stack buffer and
// copies the produced characters into a std::string.
// noinline keeps the call visible to the benchmark.
static __attribute__((noinline)) std::string to_string3(uint64_t val)
{
    // The buffer must be writable (it is sprintf's destination); 128 bytes is
    // far more than the 20 digits + terminator a 64-bit value can need.
    char buf[128];
    // Cast + %llu keeps the format specifier correct on platforms where
    // uint64_t is not `unsigned long`.
    const int n = std::sprintf(buf, "%llu", static_cast<unsigned long long>(val));  // C
    return {buf, buf + n};
}

// Conversion variant 4: formats `val` with std::to_chars into a stack buffer
// and copies the produced characters into a std::string.
// noinline keeps the call visible to the benchmark.
static __attribute__((noinline)) std::string to_string4(uint64_t val)
{
    // The buffer must be writable (to_chars writes into it); 128 bytes is far
    // more than the 20 digits a 64-bit value can need, so to_chars cannot fail here.
    char buf[128];
    const auto res = std::to_chars(buf, buf + sizeof(buf), val); // C++17
    // res.ptr points one past the last character written.
    return {buf, res.ptr};
}

// Benchmark driver: times one to_string1(x) call (ostringstream) per iteration.
static void BMToStringStream(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(to_string1(x));
}
BENCHMARK(BMToStringStream);

// Benchmark driver: times one to_string2(x) call (std::to_string) per iteration.
static void BMToString(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(to_string2(x));
}
BENCHMARK(BMToString);

// Benchmark driver: times one to_string3(x) call (sprintf) per iteration.
static void BMToStringSprintf(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(to_string3(x));
}
BENCHMARK(BMToStringSprintf);

// Benchmark driver: times one to_string4(x) call (std::to_chars) per iteration.
static void BMToStringToChars(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(to_string4(x));
}
BENCHMARK(BMToStringToChars);

<br />

##### SSO

In [None]:
#include <string>

// Returns a std::string built from a 14-character literal.
static std::string make_string_14()
{
    return "12345678901234";
}

// Returns a std::string built from a 15-character literal.
static std::string make_string_15()
{
    return "123456789012345";
}

// Returns a std::string built from a 16-character literal.
static std::string make_string_16()
{
    return "1234567890123456";
}

// Returns a std::string built from a 17-character literal.
static std::string make_string_17()
{
    return "12345678901234567";
}

// Benchmark driver: times one make_string_14() call per iteration.
static void BMMakeString14(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(make_string_14());
}
BENCHMARK(BMMakeString14);

// Benchmark driver: times one make_string_15() call per iteration.
static void BMMakeString15(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(make_string_15());
}
BENCHMARK(BMMakeString15);

// Benchmark driver: times one make_string_16() call per iteration.
static void BMMakeString16(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(make_string_16());
}
BENCHMARK(BMMakeString16);

// Benchmark driver: times one make_string_17() call per iteration.
static void BMMakeString17(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(make_string_17());
}
BENCHMARK(BMMakeString17);

<br />

##### allocations caching

In [None]:
#include <string>
#include <vector>

// Input table: 12 rows of 5 doubles (the same 4 rows repeated three times).
// Each row is turned into one string by run() / run_alloc_caching().
const std::vector<std::vector<double>> data = {
    {3.17744692, 5.92620571, 8.25519352, 5.38791914, 0.8765440},
    {2.08272032, 8.32814122, 8.80346836, 0.03075542, 7.5883834},
    {1.36476568, 4.29295626, 8.35784302, 0.42031432, 1.7609703},
    {9.96206844, 1.32851998, 7.53035291, 7.55081274, 2.8070660},
    {3.17744692, 5.92620571, 8.25519352, 5.38791914, 0.8765440},
    {2.08272032, 8.32814122, 8.80346836, 0.03075542, 7.5883834},
    {1.36476568, 4.29295626, 8.35784302, 0.42031432, 1.7609703},
    {9.96206844, 1.32851998, 7.53035291, 7.55081274, 2.8070660},
    {3.17744692, 5.92620571, 8.25519352, 5.38791914, 0.8765440},
    {2.08272032, 8.32814122, 8.80346836, 0.03075542, 7.5883834},
    {1.36476568, 4.29295626, 8.35784302, 0.42031432, 1.7609703},
    {9.96206844, 1.32851998, 7.53035291, 7.55081274, 2.8070660},
};

// Baseline: for every row of `data`, concatenates std::to_string of each value
// into a string that is freshly constructed per row, then copies it into the result.
static std::vector<std::string> run()
{
    std::vector<std::string> rv;
    rv.reserve(data.size());
    for (const auto& row: data)
    {
        // A new, empty string for every row.
        std::string str;
        for (double v : row)
            str += std::to_string(v);
        rv.push_back(str);
    }
    return rv;
}

// Same output as run(), but the scratch string lives outside the row loop and
// is clear()-ed per row instead of being reconstructed, so its buffer can be reused.
static std::vector<std::string> run_alloc_caching()
{
    std::vector<std::string> rv;
    rv.reserve(data.size());
    // Scratch buffer shared by all rows.
    std::string str;
    for (const auto& row: data)
    {
        str.clear();
        for (double v : row)
            str += std::to_string(v);
        rv.push_back(str);
    }
    return rv;
}

// Benchmark driver: times one run() call per iteration.
static void BMRun(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(run());
}
BENCHMARK(BMRun);

// Benchmark driver: times one run_alloc_caching() call per iteration.
static void BMRunAllocCaching(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(run_alloc_caching());
}
BENCHMARK(BMRunAllocCaching);

<br />

##### return value and output parameter

In [None]:
// Length of the array summed by both variants.
const int N = 1000;
// Input array: the first five elements are 1..5, the remaining ones are zero-initialized.
int arr[N] = {1, 2, 3, 4, 5};

// Sums the first n elements of x, accumulating in a local variable and
// returning the total by value. noinline keeps the call visible to the benchmark.
static __attribute__((noinline)) int sum1(int *x, int n)
{
  int rv = 0;
  for (int i = 0; i < n; ++i)
    rv += x[i];
  return rv;
}

// Sums the first n elements of x, accumulating directly through the output
// pointer rv on every step. Writing through *rv (instead of a local) is the
// deliberate difference from sum1().
static __attribute__((noinline)) void sum2(int *x, int n, int *rv)
{
  *rv = 0;
  for (int i = 0; i < n; ++i)
    *rv += x[i];
}

// Benchmark driver: times one sum1(arr, N) call per iteration.
static void BM_sum1(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(sum1(arr, N));
}
BENCHMARK(BM_sum1);

// Benchmark driver: times one sum2(arr, N, &s) call per iteration.
static void BM_sum2(benchmark::State& state) {
    int s = 0;
    for (auto _ : state) {
        sum2(arr, N, &s);
        // Make the result observable, as BM_sum1 does: otherwise `s` is a dead
        // store and the optimizer is free to drop the work being measured.
        benchmark::DoNotOptimize(s);
    }
}
BENCHMARK(BM_sum2);

Закинуть это дело на godbolt.org и показать, почему так

<br />

##### Алгоритмы, алгоритмы

In [None]:
#include <algorithm>
#include <vector>

// Search input: 2000 elements, all equal to 1.
static const std::vector<int> v(2000, 1);

// Returns true when `value` occurs in `v`, using std::find.
// noinline keeps the call visible to the benchmark.
static __attribute__((noinline))
bool contains_algo(const std::vector<int>& v, const int value)
{
  return std::find(begin(v), end(v), value) != end(v);
}

// Returns true when `value` occurs in `v`, using a hand-written linear scan
// that stops at the first match. noinline keeps the call visible to the benchmark.
static __attribute__((noinline))
bool contains_naive(const std::vector<int>& v, const int value)
{
  const auto last = v.end();
  for (auto it = v.begin(); it != last; ++it)
  {
    const int element = *it;
    if (element == value)
      return true;
  }
  return false;
}

// Benchmark driver: times contains_algo(v, 2) per iteration. 2 is not present
// in v, so the whole vector is scanned.
static void BM_algo(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(contains_algo(v, 2));
}
BENCHMARK(BM_algo);

// Benchmark driver: times contains_naive(v, 2) per iteration. 2 is not present
// in v, so the whole vector is scanned.
static void BM_naive(benchmark::State& state) {
    for (auto _ : state)
        benchmark::DoNotOptimize(contains_naive(v, 2));
}
BENCHMARK(BM_naive);

Сначала показать результат на libc++ LLVM, потом на libstdc++ GNU. Объяснить, почему так:
    
https://bugs.llvm.org/show_bug.cgi?id=19708

<br />

##### Специфика однобайтовых типов

In [None]:
#include <cstdint>
#include <vector>

// Base element count: the 32-bit benchmarks use N elements, the 8-bit ones N * 4.
const int N = 2000;

// Increment variant 1: index loop whose condition calls v.size() on every iteration.
template<typename T>
void inc_v1(std::vector<T>& v) {
  for (int i = 0; i < v.size(); ++i)
    ++v[i];
}

// Increment variant 2: index loop with v.size() read once into a local `count`.
template<typename T>
void inc_v2(std::vector<T>& v) {
  for (int i = 0, count = v.size(); i < count; ++i)
    ++v[i];
}

// Increment variant 3: explicit iterator loop with end() read once up front.
template<typename T>
void inc_v3(std::vector<T>& v) {
  for (auto i = v.begin(), e = v.end(); i != e; ++i)
    ++(*i);
}

// Increment variant 4: range-based for loop over the elements by reference.
template<typename T>
void inc_v4(std::vector<T>& v) {
  for (auto& i : v)
    ++i;
}

// Benchmark driver: inc_v1 over N * 4 one-byte elements. The vector is created
// once before the timed loop and incremented in place on every iteration.
static void BM_inc8_v1(benchmark::State& state) {
  std::vector<std::uint8_t> v(N* 4);
  for (auto _ : state) {
    inc_v1(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc8_v1);

// Benchmark driver: inc_v2 over N * 4 one-byte elements, incremented in place
// on every iteration.
static void BM_inc8_v2(benchmark::State& state) {
  std::vector<std::uint8_t> v(N * 4);
  for (auto _ : state) {
    inc_v2(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc8_v2);

// Benchmark driver: inc_v3 over N * 4 one-byte elements, incremented in place
// on every iteration.
static void BM_inc8_v3(benchmark::State& state) {
  std::vector<std::uint8_t> v(N * 4);
  for (auto _ : state) {
    inc_v3(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc8_v3);

// Benchmark driver: inc_v4 over N * 4 one-byte elements, incremented in place
// on every iteration.
static void BM_inc8_v4(benchmark::State& state) {
  std::vector<std::uint8_t> v(N * 4);
  for (auto _ : state) {
    inc_v4(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc8_v4);

// Benchmark driver: inc_v1 over N four-byte elements, incremented in place on
// every iteration.
static void BM_inc32_v1(benchmark::State& state) {
  std::vector<std::uint32_t> v(N);
  for (auto _ : state) {
    inc_v1(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc32_v1);

// Benchmark driver: inc_v3 over N four-byte elements, incremented in place on
// every iteration.
static void BM_inc32_v3(benchmark::State& state) {
  std::vector<std::uint32_t> v(N);
  for (auto _ : state) {
    inc_v3(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc32_v3);

// Benchmark driver: inc_v4 over N four-byte elements, incremented in place on
// every iteration.
static void BM_inc32_v4(benchmark::State& state) {
  std::vector<std::uint32_t> v(N);
  for (auto _ : state) {
    inc_v4(v);
    benchmark::DoNotOptimize(v);
  }
}
BENCHMARK(BM_inc32_v4);

Показать сначала clang, потом gcc