# 选题介绍 - 快速排序

## 小组成员

- 计1504 41416058 徐经纬
- 计1504 41524206 李凯
- 计1504 41524208 李鸿卓
- 计1504 41524201 朱柯佳
- 计1502 61562034 艾万


## 朴素快速排序算法


快速排序是众所周知的基于分治思想的排序算法。朴素实现的最优和平均时间复杂度为$O(n\log n)$，最坏时间复杂度可以被卡到$O(n^2)$

以下针对整型数组来介绍和探究快速排序算法的应用和扩展

主要算法步骤：

- 从数组中选择一个目标值aim，按照下面的方式划分数组：
    
    - 所有小于aim的元素都划分到aim的左侧
    
    - 所有不小于aim的元素都划分到aim的右侧

- 递归对左右两部分分别进行上一步操作，直至整个数组处理完成

### 一个简单的例子

    8 5 7 3 9    - 选择目标值为7

    7 5 8 3 9    - 将目标位置和当前最左边位置互换，令left=2(val=5), right=5(val=9)（位置从1开始计数）

    7 5 8 3 9    - 右移left，左移right至满足交换条件：left=3(val=8), right=4(val=3), 交换两个位置的值

    7 5 3 8 9    - left=4 > right=3, 结束交换，交换right位置和最左边位置

    3 5 7 8 9    - 对[3 5]和[8 9]两部分递归进行处理
    
    3 5 7 8 9    - 完成排序

### 优化

主要算法步骤：

- 从数组中选择一个目标值aim，按照下面的方式划分数组：
    
    - 所有小于aim的元素都划分到aim的左侧
    
    - 所有不小于aim的元素都划分到aim的右侧

- 递归对左右两部分分别进行上一步操作，直至整个数组处理完成

主要优化点：

- [x] 目标位置选取优化：对于固定选取最左边位置，完全逆序数据会被卡到最差复杂度 => 结论：固定目标位置的选取都是不优秀的
- [x] 划分方式优化：对于朴素二分方式，完全相同数据会被卡到最差复杂度 => 结论：对于存在大量相同数据的数据集，朴素二分方式是不优秀的


```cpp
using namespace std;
typedef pair<int, int> P;

/*
 * Three-way partition of a[l..r] around a randomly chosen pivot value.
 *
 * Returns the pair (first, second) such that afterwards
 *   a[l .. first-1]       <  pivot
 *   a[first .. second]    == pivot
 *   a[second+1 .. r]      >  pivot
 * For a range with fewer than two elements the bounds (l, r) are returned
 * unchanged and the array is not touched.
 */
P partition(int a[], int l, int r) {
	if (l >= r) return P(l, r);

	// Pick the pivot uniformly (as far as rand() allows) and park it at a[l].
	const int pivotIndex = l + rand() % (r - l + 1);
	const int aim = a[pivotIndex];
	swap(a[pivotIndex], a[l]);

	int lt = l;    // a[l .. lt-1] holds values smaller than the pivot
	int scan = l;  // a[lt .. scan-1] holds values equal to the pivot
	int gt = r;    // a[gt+1 .. r] holds values greater than the pivot
	while (scan <= gt) {
		const int value = a[scan];
		if (value < aim) {
			swap(a[scan], a[lt]);
			++scan;
			++lt;
		} else if (value > aim) {
			// The element swapped in from the right is still unclassified,
			// so scan is not advanced here.
			swap(a[scan], a[gt]);
			--gt;
		} else {
			++scan;
		}
	}
	return P(lt, gt);
}

void qsort(int a[], int l, int r) {
	if (l >= r) return;
	P ret = partition(a, l, r);
    int left = ret.first, right = ret.second;
	qsort(a, l, left - 1);
	qsort(a, right + 1, r);
}
```

partition 返回 $(left, right)$ 后，数组满足：

- $a[l \ldots left - 1] < aim$
- $a[left \ldots right] = aim$
- $a[right + 1 \ldots r] > aim$

In [2]:
import subprocess
def run(method, workers, testSize, seed, compiled=True):
    """Run one benchmark through ``./run-<method>.sh`` and return its timing.

    Args:
        method: Implementation to launch: 'naive', 'openmp' or 'mpi'.
        workers: Value forwarded with the ``-w`` flag.
        testSize: Value forwarded with the ``-n`` flag.
        seed: Value forwarded with the ``-s`` flag.
        compiled: When false, ``-c`` is appended to the command line
            (presumably asks the script to compile first -- confirm
            against the run-*.sh scripts).

    Returns:
        The script's standard output parsed as a float.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
        subprocess.CalledProcessError: If the script exits with a
            non-zero status.
    """
    supported = ('naive', 'openmp', 'mpi')
    if method not in supported:
        raise ValueError(method)

    # NOTE(review): each flag and its value travel as ONE argv element
    # (e.g. '-w 2'); kept as-is because the scripts are not visible here.
    options = (('w', workers), ('n', testSize), ('s', seed))
    cmd = ['./run-{}.sh'.format(method)]
    cmd.extend('-{} {}'.format(flag, value) for flag, value in options)
    if not compiled:
        cmd.append('-c')

    output = subprocess.check_output(cmd)
    return float(output)

In [4]:
run('naive', 2, 10000000, 4516)

1.500005

In [3]:
run("mpi", 2, 10000000, 4516)

run: method=mpi, workers=2, n=10000000, seed=4516


0.955845

In [4]:
run("openmp", 2, 10000000, 4516)

run: method=openmp, workers=2, n=10000000, seed=4516


1.602312

In [5]:
import math
import random
import plotly.plotly
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

init_notebook_mode(connected=True)

def trace1():
    """Time the naive sort on 5,000,000 elements for seeds -1 through 18.

    Returns:
        A plotly ``Scatter`` with the seeds on the x-axis and the measured
        run times on the y-axis.
    """
    element_count = int(5e6)
    # NOTE(review): the MPI driver in this notebook treats seed 0 (all-equal
    # data) and seed -1 (descending data) specially; presumably the naive
    # runner does the same -- confirm.
    seeds = list(range(-1, 19))
    timings = [run('naive', 1, element_count, s) for s in seeds]
    return go.Scatter(x=seeds, y=timings, mode='lines+markers', name='fixed test size')

def trace2():
    """Time the naive sort with seed 123456 on sizes 1000, 2000, ..., 1000 * 2**16.

    Returns:
        A plotly ``Scatter`` with the test sizes on the x-axis and the
        measured run times on the y-axis.
    """
    fixed_seed = 123456
    sizes = [1000 * 2 ** k for k in range(17)]
    timings = [run('naive', 1, size, fixed_seed) for size in sizes]
    return go.Scatter(x=sizes, y=timings, mode='lines+markers', name='fixed seed')

In [6]:
%%time
trace_naive_testsize = trace1()
trace_naive_seed = trace2()

run: method=naive, workers=1, n=5000000, seed=-1
run: method=naive, workers=1, n=5000000, seed=0
run: method=naive, workers=1, n=5000000, seed=1
run: method=naive, workers=1, n=5000000, seed=2
run: method=naive, workers=1, n=5000000, seed=3
run: method=naive, workers=1, n=5000000, seed=4
run: method=naive, workers=1, n=5000000, seed=5
run: method=naive, workers=1, n=5000000, seed=6
run: method=naive, workers=1, n=5000000, seed=7
run: method=naive, workers=1, n=5000000, seed=8
run: method=naive, workers=1, n=5000000, seed=9
run: method=naive, workers=1, n=5000000, seed=10
run: method=naive, workers=1, n=5000000, seed=11
run: method=naive, workers=1, n=5000000, seed=12
run: method=naive, workers=1, n=5000000, seed=13
run: method=naive, workers=1, n=5000000, seed=14
run: method=naive, workers=1, n=5000000, seed=15
run: method=naive, workers=1, n=5000000, seed=16
run: method=naive, workers=1, n=5000000, seed=17
run: method=naive, workers=1, n=5000000, seed=18
run: method=naive, workers=1, 

In [7]:
# Plot 1: run time of the naive sort per seed, drawn both as a line and as
# translucent bars that reuse the same x/y data.
bar_trance = go.Bar(
    name='bar trance',
    x=trace_naive_testsize.x,
    y=trace_naive_testsize.y,
    opacity=0.6,
    marker={
        'color': 'rgb(158,202,225)',
        'line': {'color': 'rgb(8,48,107)', 'width': 1.5},
    },
)

iplot(dict(
    data=[trace_naive_testsize, bar_trance],
    layout=go.Layout(
        title="plot 1 of quick sort<br>fixed test size: 5M elements",
        xaxis={'title': 'seed'},
        yaxis={'title': 'runtime(s)'},
        showlegend=True,
    ),
))

def nlogn(val):
    """Return ``val * log2(val)``, the reference growth curve for the ratio plot."""
    return val * math.log(val, 2)

# Plot 2: run time against test size, plus (on a secondary right-hand axis)
# the run time divided by n*log2(n).
ratio_trace = go.Scatter(
    name='ratio',
    yaxis='y2',
    x=trace_naive_seed.x,
    y=[
        seconds / nlogn(size)
        for size, seconds in zip(trace_naive_seed.x, trace_naive_seed.y)
    ],
)

iplot(dict(
    data=[trace_naive_seed, ratio_trace],
    layout=go.Layout(
        title="plot 2 of quick sort<br>fixed seed: SEED = 123456",
        xaxis={'title': 'test size'},
        yaxis={'title': 'runtime(s)'},
        yaxis2={'title': 'ratio', 'overlaying': 'y', 'side': 'right'},
        showlegend=True,
    ),
))

# OpenMP

```cpp
void qsort(int a[], int l, int r) {
	if (l >= r) return;
	P ret = partition(a, l, r);
	qsort(a, l, ret.first - 1);
	qsort(a, ret.second + 1, r);
}
```

# OpenMP

```cpp
/*
 * Task-parallel quicksort: partition a[l..r], then hand each of the two
 * recursive calls to an OpenMP task.
 * There is no taskwait in this function, so it may return before its child
 * tasks have run; the tasks are expected to be completed by the barrier at
 * the end of the enclosing parallel region.
 */
void qsort(int a[], int l, int r) {
	if (l >= r) return;
	P ret = partition(a, l, r);
	int left = ret.first, right = ret.second;
	{
		// Left part: elements smaller than the pivot. firstprivate gives the
		// task its own copies of the pointer and bounds, captured when the
		// task is created.
		#pragma omp task firstprivate(a, l, left)
		{
			qsort(a, l, left - 1);
		}
		// Right part: elements greater than the pivot.
		#pragma omp task firstprivate(a, right, r)
		{
			qsort(a, right + 1, r);
		}
	}
}

/////////////////

// Driver fragment: configure the OpenMP runtime and start the task-parallel sort.
// NOTE(review): omp_set_nested enables nested parallel regions; the sort
// below only creates tasks, so this call may be unnecessary -- confirm.
omp_set_nested(1);

// Size of the thread team used by the parallel region below.
omp_set_num_threads(num_of_threads);

#pragma omp parallel shared(data, n)
{
    // Exactly one thread makes the top-level call; nowait lets the other
    // threads skip the barrier at the end of the single construct.
    #pragma omp single nowait
    {
        qsort(data, 0, n - 1);
    }
}
```

### section | task

### firstprivate

### single nowait



In [8]:
def trace3():
    """Time the OpenMP sort over thread counts 1..512 for seven test sizes.

    The x-axis of every trace is the number of threads. Sizes are 10000,
    20000, ..., 640000 and the seed is fixed at 123456.

    Returns:
        A list with one plotly ``Scatter`` per test size.
    """
    fixed_seed = 123456
    sizes = [10000 * 2 ** k for k in range(7)]
    thread_counts = [2 ** k for k in range(10)]

    traces = []
    for size in sizes:
        timings = [run('openmp', t, size, fixed_seed) for t in thread_counts]
        traces.append(go.Scatter(x=thread_counts, y=timings, mode='lines+markers',
                                 name='{} items'.format(size)))
    return traces

def trace4():
    """Time the OpenMP sort over growing test sizes for 1, 2, 4, 8 and 16 threads.

    The seed is fixed at 123456 and the sizes are 1000, 2000, ...,
    1000 * 2**16; the x-axis of every trace is the test size.

    Returns:
        A list with one plotly ``Scatter`` per thread count.
    """
    fixed_seed = 123456
    sizes = [1000 * 2 ** k for k in range(17)]

    traces = []
    for thread_count in (1, 2, 4, 8, 16):
        timings = [run('openmp', thread_count, size, fixed_seed) for size in sizes]
        traces.append(go.Scatter(x=sizes, y=timings, mode='lines+markers',
                                 name='{} threads'.format(thread_count)))
    return traces

In [9]:
%%time
trace_openmp_testsize = trace3()
trace_openmp_seed = trace4()

run: method=openmp, workers=1, n=10000, seed=123456
run: method=openmp, workers=2, n=10000, seed=123456
run: method=openmp, workers=4, n=10000, seed=123456
run: method=openmp, workers=8, n=10000, seed=123456
run: method=openmp, workers=16, n=10000, seed=123456
run: method=openmp, workers=32, n=10000, seed=123456
run: method=openmp, workers=64, n=10000, seed=123456
run: method=openmp, workers=128, n=10000, seed=123456
run: method=openmp, workers=256, n=10000, seed=123456
run: method=openmp, workers=512, n=10000, seed=123456
run: method=openmp, workers=1, n=20000, seed=123456
run: method=openmp, workers=2, n=20000, seed=123456
run: method=openmp, workers=4, n=20000, seed=123456
run: method=openmp, workers=8, n=20000, seed=123456
run: method=openmp, workers=16, n=20000, seed=123456
run: method=openmp, workers=32, n=20000, seed=123456
run: method=openmp, workers=64, n=20000, seed=123456
run: method=openmp, workers=128, n=20000, seed=123456
run: method=openmp, workers=256, n=20000, seed=123

run: method=openmp, workers=16, n=65536000, seed=123456
CPU times: user 246 ms, sys: 498 ms, total: 744 ms
Wall time: 2min 26s


In [10]:
# Plot 3: OpenMP run time against the number of threads, one line per test size.
iplot(dict(
    data=trace_openmp_testsize,
    layout=go.Layout(
        title="plot 3 of quick sort<br>openMP<br>fixed test size",
        xaxis={'title': 'number of threads'},
        yaxis={'title': 'runtime(s)'},
        showlegend=True,
    ),
))

# Plot 4: OpenMP run time against the test size, one line per thread count.
iplot(dict(
    data=trace_openmp_seed,
    layout=go.Layout(
        title="plot 4 of quick sort<br>openMP<br>fixed seed: 123456",
        xaxis={'title': 'size of test'},
        yaxis={'title': 'runtime(s)'},
        showlegend=True,
    ),
))

# MPI

### 任务划分

### 数据划分

In [11]:
def gen_shapes(stx, sty, width, height, n, gapx=1):
    """Build one horizontal row of ``n`` equally sized rectangles.

    Args:
        stx: Left edge of the first rectangle.
        sty: Lower edge shared by all rectangles.
        width: Width of each rectangle.
        height: Height of each rectangle.
        n: Number of rectangles in the row.
        gapx: Horizontal gap between neighbouring rectangles.

    Returns:
        A tuple ``(shapes, point_x, point_y)``: the plotly shape dicts and
        the x / y coordinates of each rectangle's centre.
    """
    top = sty + height
    pitch = width + gapx
    lefts = [stx + k * pitch for k in range(n)]

    rects = [
        {
            'type': 'rect',
            'x0': left,
            'y0': sty,
            'x1': left + width,
            'y1': top,
            'line': {
                'color': 'rgba(128, 0, 128, 1)',
                'width': 2,
            },
            'fillcolor': 'gray',
            'opacity': 0.3,
        }
        for left in lefts
    ]
    centre_x = [left / 2 + (left + width) / 2 for left in lefts]
    centre_y = [sty / 2 + top / 2 for _ in lefts]
    return rects, centre_x, centre_y

def gen_all_shapes(stx, sty, base_width, base_height, n, gapx, gapy):
    """Build ``n`` stacked rows of rectangles forming a binary-tree layout.

    Row ``i`` (counted from ``sty`` upwards) holds ``2 ** (n - i - 1)``
    rectangles, each as wide as ``2 ** i`` base rectangles plus the gaps
    between them.

    Args:
        stx: Left edge shared by all rows.
        sty: Lower edge of the first row.
        base_width: Width of a rectangle in the first row.
        base_height: Height of every rectangle.
        n: Number of rows.
        gapx: Horizontal gap between rectangles.
        gapy: Vertical gap between rows.

    Returns:
        A tuple ``(shapes, point_x, point_y)`` concatenating the results of
        ``gen_shapes`` for every row, first row first.
    """
    all_shapes, centre_x, centre_y = [], [], []
    row_pitch = base_height + gapy
    for level in range(n):
        boxes_in_row = 2 ** (n - level - 1)
        base_boxes_spanned = 2 ** level
        row_width = base_boxes_spanned * (base_width + gapx) - gapx
        row_shapes, row_x, row_y = gen_shapes(
            stx, sty + level * row_pitch, row_width, base_height, boxes_in_row, gapx)
        all_shapes.extend(row_shapes)
        centre_x.extend(row_x)
        centre_y.extend(row_y)
    return all_shapes, centre_x, centre_y

# Draw the four-row box diagram; each box is labelled with a heap-style
# index: 8..15 on the first (lowest) row, then 4..7, 2..3 and 1.
shapes, x, y = gen_all_shapes(1, 1, 3, 1, 4, 1, 1)
text = [j for i in range(4) for j in range(2 ** (3 - i), 2 ** (4 - i))]

trace0 = go.Scatter(x=x, y=y, text=text, mode='text')
data = [trace0]

# Both axes are hidden: no grid, zero line, axis line, ticks or tick labels.
layout = dict(
    xaxis=dict(
        range=[0, 35],
        showgrid=False,
        zeroline=False,
        showline=False,
        ticks='',
        showticklabels=False,
    ),
    yaxis=dict(
        range=[0, 10],
        showgrid=False,
        zeroline=False,
        showline=False,
        ticks='',
        showticklabels=False,
    ),
    shapes=shapes,
)
fig = dict(data=data, layout=layout)
iplot(fig)

In [12]:
# Same box layout, labelled with the MPI rank that owns each box (first /
# lowest row first). A left child keeps its parent's rank and a right child
# gets rank + 2**depth, so the eight-box row reads 1, 5, 3, 7, 2, 6, 4, 8.
# The variable is deliberately not called `trace1`: that name is the
# benchmark function defined earlier and must stay callable.
rank_label_trace = go.Scatter(
    x=x,
    y=y,
    text=[1, 5, 3, 7, 2, 6, 4, 8, 1, 3, 2, 4, 1, 2, 1],
    mode='text',
)
iplot({
    'data': [rank_label_trace],
    'layout': layout,
})

In [13]:
# Same box layout again, with every right child annotated by how its rank
# is derived from its parent's rank (parent + 2**depth).
# The variable is deliberately not called `trace2`: that name is the
# benchmark function defined earlier and must stay callable.
rank_formula_trace = go.Scatter(
    x=x,
    y=y,
    text=[1,'5=1+2^2',3,'7=3+2^2',2,'6=2+2^2',4,'8=4+2^2',1,'3=1+2^1',2,'4=2+2^1',1,'2=1+2^0',1],
    mode='text',
)
iplot({
    'data': [rank_formula_trace],
    'layout': layout,
})

$$ RANK\_LSON = RANK $$
$$ RANK\_RSON = RANK + 2^{depth} $$

# MPI

```cpp
/* Skipped: Base sort and help func */
void mpi_sort(int a[], int l, int r, int id, int max_id, int deep) {
	MPI_Status status;
	int rson_id = id + pow(2, deep);
	if (rson_id > max_id) {
		qsort(a, l, r);
		return;
	}
	rson_id--;
	
	if (l > r) {
		MPI_Send(a + l, 0, MPI::INT, rson_id, 0, MPI_COMM_WORLD);
		MPI_Recv(a + l, 0, MPI::INT, rson_id, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		return;
	}
	
	P ret = partition(a, l, r);
	int lsize = ret.first - l;
	int rsize = r - ret.second;
	if (lsize < rsize) {
		MPI_Send(a + l, lsize, MPI::INT, rson_id, l, MPI_COMM_WORLD);
		mpi_sort(a, ret.second + 1, r, id, max_id, deep + 1);
		MPI_Recv(a + l, lsize, MPI::INT, rson_id, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
	} else {
		MPI_Send(a + ret.second + 1, rsize, MPI::INT, rson_id, ret.second + 1, MPI_COMM_WORLD);
		mpi_sort(a, l, ret.first - 1, id, max_id, deep + 1);
		MPI_Recv(a + ret.second + 1, rsize, MPI::INT, rson_id, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
	}
}
```

# MPI

```cpp
int main(int argc, char *argv[]) {
    /* Skipped: input and initialisation*/
	myid++;
    // id start from 1
	int retcode = 0;
	if (myid == 1) {
		int n, seed;
		sscanf(argv[1], "%d", &n);
		sscanf(argv[2], "%d", &seed);
		int *data = (int*)malloc(sizeof(int)*n);
		srand(seed);
		for (int i = 0; i < n; i++)
			if (0 == seed) data[i] = 0;
			else if (-1 == seed) data[i] = n - i;
			else data[i] = rand() % n + 1;

		double tmp = MPI_Wtime();
		mpi_sort(data, 0, n - 1, myid, numprocs, 0);
		tmp = MPI_Wtime() - tmp;
		printf("%lf\n", tmp);
		retcode = validate(data, n);
		free(data);
	} else {
		int *subarray = NULL;
		MPI_Status status;
		int sub_size = 0;
		int index = 0;
		int parent_id = 0;
		while (pow(2, index) < myid) index++;
		MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		MPI_Get_count(&status, MPI::INT, &sub_size);
		parent_id = status.MPI_SOURCE;
		subarray = (int*)malloc(sub_size * sizeof(int));
		
		MPI_Recv(subarray, sub_size, MPI::INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		mpi_sort(subarray, 0, sub_size - 1, myid, numprocs, index);
		MPI_Send(subarray, sub_size, MPI::INT, parent_id, parent_id,MPI_COMM_WORLD);
		
		free(subarray);
	}
	MPI_Finalize();
	return retcode;
}
```

In [20]:
def trace5():
    """Time the MPI sort over 1..32 processes for seven test sizes.

    The x-axis of every trace is the number of processes. Sizes are 1000,
    2000, ..., 64000 and the seed is fixed at 123456.

    Returns:
        A list with one plotly ``Scatter`` per test size.
    """
    fixed_seed = 123456
    sizes = [1000 * 2 ** k for k in range(7)]
    proc_counts = list(range(1, 33))

    traces = []
    for size in sizes:
        timings = [run('mpi', procs, size, fixed_seed) for procs in proc_counts]
        traces.append(go.Scatter(x=proc_counts, y=timings, mode='lines+markers',
                                 name='{} items'.format(size)))
    return traces

def trace6():
    """Time the MPI sort over growing test sizes for 1, 2, 4, 8 and 16 processes.

    The seed is fixed at 123456 and the sizes are 1000, 2000, ...,
    1000 * 2**16; the x-axis of every trace is the test size.

    Returns:
        A list with one plotly ``Scatter`` per process count.
    """
    fixed_seed = 123456
    sizes = [1000 * 2 ** k for k in range(17)]

    traces = []
    for proc_count in (1, 2, 4, 8, 16):
        timings = [run('mpi', proc_count, size, fixed_seed) for size in sizes]
        traces.append(go.Scatter(x=sizes, y=timings, mode='lines+markers',
                                 name='{} procs'.format(proc_count)))
    return traces

In [21]:
%%time
trace_mpi_testsize = trace5()

run: method=mpi, workers=1, n=1000, seed=123456
run: method=mpi, workers=2, n=1000, seed=123456
run: method=mpi, workers=3, n=1000, seed=123456
run: method=mpi, workers=4, n=1000, seed=123456
run: method=mpi, workers=5, n=1000, seed=123456
run: method=mpi, workers=6, n=1000, seed=123456
run: method=mpi, workers=7, n=1000, seed=123456
run: method=mpi, workers=8, n=1000, seed=123456
run: method=mpi, workers=9, n=1000, seed=123456
run: method=mpi, workers=10, n=1000, seed=123456
run: method=mpi, workers=11, n=1000, seed=123456
run: method=mpi, workers=12, n=1000, seed=123456
run: method=mpi, workers=13, n=1000, seed=123456
run: method=mpi, workers=14, n=1000, seed=123456
run: method=mpi, workers=15, n=1000, seed=123456
run: method=mpi, workers=16, n=1000, seed=123456
run: method=mpi, workers=17, n=1000, seed=123456
run: method=mpi, workers=18, n=1000, seed=123456
run: method=mpi, workers=19, n=1000, seed=123456
run: method=mpi, workers=20, n=1000, seed=123456
run: method=mpi, workers=21, 

run: method=mpi, workers=10, n=32000, seed=123456
run: method=mpi, workers=11, n=32000, seed=123456
run: method=mpi, workers=12, n=32000, seed=123456
run: method=mpi, workers=13, n=32000, seed=123456
run: method=mpi, workers=14, n=32000, seed=123456
run: method=mpi, workers=15, n=32000, seed=123456
run: method=mpi, workers=16, n=32000, seed=123456
run: method=mpi, workers=17, n=32000, seed=123456
run: method=mpi, workers=18, n=32000, seed=123456
run: method=mpi, workers=19, n=32000, seed=123456
run: method=mpi, workers=20, n=32000, seed=123456
run: method=mpi, workers=21, n=32000, seed=123456
run: method=mpi, workers=22, n=32000, seed=123456
run: method=mpi, workers=23, n=32000, seed=123456
run: method=mpi, workers=24, n=32000, seed=123456
run: method=mpi, workers=25, n=32000, seed=123456
run: method=mpi, workers=26, n=32000, seed=123456
run: method=mpi, workers=27, n=32000, seed=123456
run: method=mpi, workers=28, n=32000, seed=123456
run: method=mpi, workers=29, n=32000, seed=123456


In [22]:
%%time
trace_mpi_seed = trace6()

run: method=mpi, workers=1, n=1000, seed=123456
run: method=mpi, workers=1, n=2000, seed=123456
run: method=mpi, workers=1, n=4000, seed=123456
run: method=mpi, workers=1, n=8000, seed=123456
run: method=mpi, workers=1, n=16000, seed=123456
run: method=mpi, workers=1, n=32000, seed=123456
run: method=mpi, workers=1, n=64000, seed=123456
run: method=mpi, workers=1, n=128000, seed=123456
run: method=mpi, workers=1, n=256000, seed=123456
run: method=mpi, workers=1, n=512000, seed=123456
run: method=mpi, workers=1, n=1024000, seed=123456
run: method=mpi, workers=1, n=2048000, seed=123456
run: method=mpi, workers=1, n=4096000, seed=123456
run: method=mpi, workers=1, n=8192000, seed=123456
run: method=mpi, workers=1, n=16384000, seed=123456
run: method=mpi, workers=1, n=32768000, seed=123456
run: method=mpi, workers=1, n=65536000, seed=123456
run: method=mpi, workers=2, n=1000, seed=123456
run: method=mpi, workers=2, n=2000, seed=123456
run: method=mpi, workers=2, n=4000, seed=123456
run: me

In [23]:
# Plot 5: MPI run time against the number of processes, one line per test size.
iplot(dict(
    data=trace_mpi_testsize,
    layout=go.Layout(
        title="plot 5 of quick sort<br>mpi<br>fixed test size",
        xaxis={'title': 'number of procs'},
        yaxis={'title': 'runtime(s)'},
        showlegend=True,
    ),
))

# Plot 6: MPI run time against the test size, one line per process count.
iplot(dict(
    data=trace_mpi_seed,
    layout=go.Layout(
        title="plot 6 of quick sort<br>mpi<br>fixed seed: 123456",
        xaxis={'title': 'size of test'},
        yaxis={'title': 'runtime(s)'},
        showlegend=True,
    ),
))