# Python 🐌

In [1]:
# Python
def count_triples1(N):
    """Count Pythagorean triples (a, b, c) with 1 <= a < b < c <= N.

    Pure-Python brute force: every ordered candidate triple is tested
    against a*a + b*b == c*c.
    """
    stop = N + 1  # exclusive upper bound shared by all three loops
    hits = 0
    for leg_a in range(1, stop):
        for leg_b in range(leg_a + 1, stop):
            for hyp in range(leg_b + 1, stop):
                if leg_a * leg_a + leg_b * leg_b != hyp * hyp:
                    continue
                hits += 1
    return hits

In [2]:
# -r: number of runs
# -n: number of loops (repetitions) per run
# -o: return a timeit result object instead of only printing the timing
t1 = %timeit -o -r 1 -n 1 count_triples1(1000)

27.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Cython 🚀

In [1]:
# Load the Cython IPython extension; it provides the %%cython cell magic.
%load_ext cython

In [2]:
%%cython -f

def count_triples2(N):
    """Count Pythagorean triples (a, b, c) with 1 <= a < b < c <= N.

    Untyped brute force, compiled by Cython without any C type
    declarations: all values remain ordinary Python objects.
    """
    upper = N + 1  # exclusive end of every range below
    n_triples = 0
    for x in range(1, upper):
        for y in range(x + 1, upper):
            for z in range(y + 1, upper):
                if x * x + y * y != z * z:
                    continue
                n_triples += 1
    return n_triples

In [3]:
# Single run, single loop, N = 1000: same settings as the count_triples1 timing.
t2 = %timeit -o -r 1 -n 1 count_triples2(1000)

18.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [4]:
%%cython

def count_triples3(int N):
    """Count Pythagorean triples (a, b, c) with 1 <= a < b < c <= N.

    Statically typed variant: the bound, the loop counters and the result are
    C ``int``s, so Cython can compile the triple loop to C integer arithmetic.

    Raises:
        OverflowError: if ``N`` is large enough that ``a*a + b*b`` would no
            longer fit in a 32-bit C ``int``.
    """
    cdef int found = 0
    cdef int a, b, c
    # The largest sum evaluated is (N-2)**2 + (N-1)**2, which exceeds
    # 2**31 - 1 from N = 32770 on. Signed overflow in C is undefined
    # behaviour and would silently corrupt the count, so refuse such inputs.
    # NOTE(review): assumes C int is 32 bits wide -- confirm for exotic targets.
    if N > 32769:
        raise OverflowError("N must be <= 32769 to avoid C int overflow")
    for a in range(1, N+1):
        for b in range(a+1, N+1):
            for c in range(b+1, N+1):
                if a*a + b*b == c*c:
                    found += 1
    return found

In [5]:
# No -r/-n given: %timeit picks the number of runs and loops itself.
t3 = %timeit -o count_triples3(1000)

26.4 ms ± 497 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
# Speedup of the typed Cython version (t3) relative to pure Python (t1):
print(f"{t1.average / t3.average:0.0f}x Speedup")

1335x Speedup


# Extratuning

Geht da noch mehr?

## Compilerflags und Compilerdirectives

In [12]:
# macOS only: use gcc instead of clang for more aggressive SIMD auto-vectorisation.
# The override is applied on macOS alone; on other platforms the default
# compiler is kept, because a 'gcc-10' executable is not guaranteed to exist there.
import os
import sys

if sys.platform == 'darwin':
    os.environ['CC'] = 'gcc-10'

In [9]:
%%cython -f -c=-O3 -c=-march=native

def count_triples4(int N):
    """Count Pythagorean triples (a, b, c) with 1 <= a < b < c <= N.

    Same typed triple loop as the plain typed variant; only the compiler
    flags given on the cell's ``%%cython`` line differ.

    Raises:
        OverflowError: if ``N`` is large enough that ``a*a + b*b`` would no
            longer fit in a 32-bit C ``int``.
    """
    cdef int found = 0
    cdef int a, b, c
    # The largest sum evaluated is (N-2)**2 + (N-1)**2, which exceeds
    # 2**31 - 1 from N = 32770 on. Signed overflow in C is undefined
    # behaviour and would silently corrupt the count, so refuse such inputs.
    # NOTE(review): assumes C int is 32 bits wide -- confirm for exotic targets.
    if N > 32769:
        raise OverflowError("N must be <= 32769 to avoid C int overflow")
    for a in range(1, N+1):
        for b in range(a+1, N+1):
            for c in range(b+1, N+1):
                if a*a + b*b == c*c:
                    found += 1
    return found

In [10]:
# Same N = 1000 as the count_triples3 timing, for a direct comparison.
t4 = %timeit -o count_triples4(1000)

16.9 ms ± 361 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Larger problem size (N = 10000), single run and loop.
# Note: this rebinds t4, replacing the N = 1000 result from the previous cell.
t4 = %timeit -o -r 1 -n 1 count_triples4(10000)

10.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Parallelisierung

In [13]:
%%cython -f -c=-O3 -c=-march=native -c=-fopenmp --link-args=-fopenmp

from cython.parallel cimport prange

def count_triples5(int N):
    """Count Pythagorean triples (a, b, c) with 1 <= a < b < c <= N in parallel.

    The outer loop over ``a`` is an OpenMP ``prange`` that runs without the
    GIL; the two inner loops stay serial inside each iteration.

    Raises:
        OverflowError: if ``N`` is large enough that ``a*a + b*b`` would no
            longer fit in a 32-bit C ``int``.
    """
    cdef int found = 0
    cdef int a, b, c
    # The largest sum evaluated is (N-2)**2 + (N-1)**2, which exceeds
    # 2**31 - 1 from N = 32770 on. Signed overflow in C is undefined
    # behaviour and would silently corrupt the count, so refuse such inputs.
    # NOTE(review): assumes C int is 32 bits wide -- confirm for exotic targets.
    if N > 32769:
        raise OverflowError("N must be <= 32769 to avoid C int overflow")
    # `found` is only modified with the in-place `+=` inside the prange body,
    # so Cython treats it as a reduction variable; `b` and `c` are assigned
    # inside the body and are therefore private to each thread.
    for a in prange(1, N+1, nogil=True):
        for b in range(a+1, N+1):
            for c in range(b+1, N+1):
                if a*a + b*b == c*c:
                    found += 1
    return found

In [14]:
# Same N = 10000, single-run settings as the serial count_triples4 timing.
t5 = %timeit -o -r 1 -n 1 count_triples5(10000)

3.66 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [23]:
%%cython -f -c=-O3 -c=-march=native -c=-fopenmp --link-args=-fopenmp

from cython.parallel cimport prange

def count_triples6(int N):
    """Count Pythagorean triples (a, b, c) with 1 <= a < b < c <= N in parallel.

    Like the plain ``prange`` variant, but with an explicit static schedule
    and a chunk size of 1 for the outer loop over ``a``.

    Raises:
        OverflowError: if ``N`` is large enough that ``a*a + b*b`` would no
            longer fit in a 32-bit C ``int``.
    """
    cdef int found = 0
    cdef int a, b, c
    # The largest sum evaluated is (N-2)**2 + (N-1)**2, which exceeds
    # 2**31 - 1 from N = 32770 on. Signed overflow in C is undefined
    # behaviour and would silently corrupt the count, so refuse such inputs.
    # NOTE(review): assumes C int is 32 bits wide -- confirm for exotic targets.
    if N > 32769:
        raise OverflowError("N must be <= 32769 to avoid C int overflow")
    # Iterations with small `a` run far more inner-loop steps than those with
    # large `a`. schedule='static' with chunksize=1 hands out consecutive
    # values of `a` to the threads in turn, which spreads that uneven work.
    # `found` is a reduction variable because it is only updated with `+=`.
    for a in prange(1, N+1, nogil=True, schedule='static', chunksize=1):
        for b in range(a+1, N+1):
            for c in range(b+1, N+1):
                if a*a + b*b == c*c:
                    found += 1
    return found

In [24]:
# Same N = 10000, single-run settings as the count_triples5 timing.
t6 = %timeit -o -r 1 -n 1 count_triples6(10000)

1.54 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [29]:
# Approximate speedup over count_triples1 at N = 10000.
# t1 was measured at N = 1000 and t6 at N = 10000; the triple loop does roughly
# N**3 work, so t1 is scaled by 10**3 (= 1e3) to estimate the pure-Python time
# at N = 10000.
print(f"{t1.average * 1e3 / t6.average:.0f}x Speedup")

17624x Speedup


# 😱😱😱