# 1: Procedurally Building Arrays

Remember `concatenate`:

In [None]:
import numpy as np

a = np.array([[1,2,3], [4,5,6]])
b = np.array([1,2,3])

# a is 2-D (two rows of three) while b is 1-D, so this call is expected to
# raise an error: concatenate needs arrays with the same number of dimensions.
np.concatenate([a,b])

In [None]:
b = b[np.newaxis, :]
print(b)
np.concatenate([a,b])

Remember `stack`:

In [None]:
np.stack([np.array([1,2,3]), np.array([4,5,6]), np.array([7,8,9])], axis=0)

Fun fact: Array dimensions can be size 0.

In [None]:
empty = np.zeros((0, 3))
print(empty)

In [None]:
np.concatenate((empty, [[1,2,3]]))

**Task:** Build a 2D array with each row coming from the generator below:

In [None]:
import numpy as np
# useful: np.concatenate([arr1, arr2, arr...], axis=0)
# or: np.stack()

def row_yielder():
    """Yield a random number of random 1-D rows, each of length 500.

    The number of rows is drawn once per call from the integers
    2495..2504 (the upper bound 2505 is exclusive), so a consumer cannot
    know the final row count before the generator is exhausted.
    """
    n_rows = np.random.randint(2495, 2505)
    for _ in range(n_rows):
        yield np.random.random((500,))


In [None]:
# Timing: grow the result one row at a time with np.concatenate.
from time import time
start = time()

# Start from a (0, 500) array so the very first concatenate along axis 0 works.
full = np.empty((0, 500))
for row in row_yielder():
    # row[None,:] turns the 1-D row into a (1, 500) array; `full` is rebuilt
    # from the previous `full` plus this single row on every iteration.
    full = np.concatenate((full, row[None,:]), axis=0)
    
stop = time()
print(stop-start)  # elapsed seconds for building the array
print(full.shape)  # sanity check: (number of rows yielded, 500)

In [None]:
# Timing: collect every row in a Python list first, then join them with a
# single np.stack call.
start = time()

# axis=0 stacks the 1-D rows along a new leading axis: one row per yielded array.
full = np.stack([row for row in row_yielder()], axis=0)

stop = time()
print(stop-start)  # elapsed seconds
print(full.shape)  # sanity check: (number of rows yielded, 500)

In [None]:
# Timing: collect every row in a Python list and hand the list to np.array.
start = time()

# All rows have the same length, so the list of 1-D arrays becomes one 2-D array.
full = np.array([row for row in row_yielder()])

stop = time()
print(stop-start)  # elapsed seconds
print(full.shape)  # sanity check: (number of rows yielded, 500)

# 2: Cleaning Data

Remember boolean indexing:

In [None]:
a = np.arange(20)
b = np.random.choice([True, False], size=20)
print(a)
print(b)

In [None]:
print(a[b])

In [None]:
a[b] = -1
print(a)

In [None]:
a = np.array([[1,2,3], [4,5,6]])
a[np.array([[True, False, False], [False, True, True]])] = 100
print(a)

There is a special `nan` value in numpy:

In [None]:
np.nan

In [None]:
np.nan + 5

In [None]:
np.isnan([3, 4, np.nan, 5])

**Task:** Write a function that takes an array as input and replaces any entry that is either `nan` or below 0 with 0.

In [None]:
# valerii's solution
def clean(arr):
    """Return a copy of `arr` with every nan or negative entry replaced by 0.

    The input is not modified; a new array is created and returned.
    All other entries, including positive infinity, are kept unchanged.
    """
    # With default arguments nan_to_num would also turn +inf into the largest
    # finite float; passing posinf=np.inf keeps it, so only nan is rewritten
    # here. -inf becomes a very negative number, which the clip below sets to 0.
    new = np.nan_to_num(arr, nan=0.0, posinf=np.inf)
    # Clip from below only: everything under 0 becomes 0, no upper bound.
    new = np.clip(new, 0, None)
    return new

a = np.array([[1,2,-5],[np.nan, -2, 5]])
print(clean(a))

In [None]:
# using boolean indices
def clean(arr):
    """Set every nan or negative entry of `arr` to 0, in place.

    Modifies the input array through boolean-mask assignment and returns
    nothing.
    """
    is_nan = np.isnan(arr)
    is_negative = arr < 0
    arr[is_nan | is_negative] = 0

In [None]:
a = np.array([[1,-5,3], [np.nan,np.nan,23]])
clean(a)
print(a)

# 3: Splitting a Dataset

Remember index arrays:

In [None]:
a = np.arange(20, 40)
b = a.reshape((2,10))
print(a)
print(b)

In [None]:
a[[0,15,1,3]]

In [None]:
b[[1,0,1,0,0]]

In [None]:
b[[0, 0, 1], [5,9,3]]

Using `np.delete` and index arrays, you can remove elements from an array:

In [None]:
print(a)
print(np.delete(a, [15,3,11,1]))

In [None]:
print(b, "\n")
print(np.delete(b, [1]), "\n")
print(np.delete(b, [1], axis=0))

**Task:** Write a function that takes two arrays `data` and `labels` and splits each into two new arrays: a random 80% of the rows of `data` go into `data_a`, and the remaining 20% into `data_b`. `labels` should be split *in the same way* into `labels_a` and `labels_b`. E.g. if row \#15 of `data` ends up in position 31 of `data_a`, then entry \#15 of `labels` should end up in position 31 of `labels_a`. Return the resulting four arrays.
- The data doesn't need to stay in the same order.
- *Do not set* `np.random.seed`*!*

In [None]:
# example data:
data = np.random.random((100, 20))
labels = np.random.randint(0, 2, size=100)
# useful: np.random.choice(array, size, replace=False)
# also useful: np.delete(array, indices, axis=0)

def split(data, labels):
    """Exercise stub: split `data` and `labels` into a random 80% / 20% pair.

    Intentionally left unimplemented for the reader; as written it does
    nothing and returns None. The task asks for four arrays in which
    `labels` is split with the same row assignment as `data`.
    """
    pass

In [None]:
def split(data, labels):
    """Randomly split `data` and `labels` into an 80% part and a 20% part.

    The same random row assignment is used for both inputs, so row i of
    `data` and entry i of `labels` always land at the same position of the
    same part.

    Parameters:
        data: array-like whose first axis indexes the samples.
        labels: array-like with one entry per row of `data`.

    Returns:
        A tuple `(data_a, data_b, labels_a, labels_b)`. The `_a` parts hold
        `int(0.8 * len(data))` randomly chosen rows in random order; the
        `_b` parts hold the remaining rows in their original order.

    Raises:
        ValueError: if `data` and `labels` differ in length. Without this
            check a longer `labels` would silently produce parts that no
            longer line up with the data parts.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    n_rows = len(data)
    if len(labels) != n_rows:
        raise ValueError(
            f"data and labels must have the same length, "
            f"got {n_rows} and {len(labels)}"
        )

    # Draw the row indices of part a without replacement so no row is used twice.
    inds_a = np.random.choice(n_rows, size=int(0.8 * n_rows), replace=False)
    data_a = data[inds_a]
    labels_a = labels[inds_a]
    # Part b is whatever is left after removing the part-a rows.
    data_b = np.delete(data, inds_a, axis=0)
    labels_b = np.delete(labels, inds_a, axis=0)
    return data_a, data_b, labels_a, labels_b

In [None]:
data = np.arange(50).reshape((10,5))
labels = np.arange(10)

data_a, data_b, labels_a, labels_b = split(data, labels)

In [None]:
print(data_a)
print(labels_a)

In [None]:
print(data_b)
print(labels_b)