# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_05 Sorting NDarrays

#### numpy.sort(a, axis=-1, kind=None, order=None, *, stable=None)
- Return a sorted copy of an array.
- https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order
- numpy.flip(m, axis=None): Reverse the order of elements in an array along the given axis. The shape of the array is preserved, but the elements are reordered.

In [13]:
import numpy as np
np.__version__

'1.26.4'

In [14]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's basic attributes.

    Parameters
    ----------
    arrnm : str
        Source text of an expression that evaluates to an array, typically
        the name of a variable defined at module (notebook) level.

    Returns
    -------
    str
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``

    Notes
    -----
    The expression is evaluated with ``eval``, which runs arbitrary code:
    only pass trusted, hand-written expressions, never external input.
    """
    # Evaluate the expression once and read the attributes from the result.
    # Appending '.attr' to the text and evaluating that would re-run the
    # expression for every attribute and, for compound expressions such as
    # 'a + b', bind the attribute to the last operand only.
    arr = eval(arrnm, globals())

    fields = ''.join(f'| {attr}: {getattr(arr, attr)} '
                     for attr in ('shape', 'ndim', 'size', 'dtype'))

    return f' {arrnm}: {fields}'

In [15]:
# Load the comma-separated lending-company numeric dataset into a 2-D array.
# NOTE(review): the original note said this dataset contains NaNs, but none
# are visible in the truncated output below -- confirm against the CSV.

lend_co_data_num = np.loadtxt('Lending-Company-Numeric-Data.csv',
                              delimiter=',')

display(show_attr('lend_co_data_num'))      # shape / ndim / size / dtype summary
lend_co_data_num

' lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [16]:
# np.sort defaults to axis=-1 (the last axis), so the values within each row
# are sorted independently of the other rows.
# A 2-D array has only two axes (0 and 1): axis 0 is also axis -2, and
# axis 1 is also axis -1 (the last axis, running across the columns).
display(np.sort(lend_co_data_num))          # sorts the values within each row
display(np.sort(lend_co_data_num, axis=1))  # same as axis=-1, default
display(np.sort(lend_co_data_num, axis=-1)) # the default, spelled out
show_attr('np.sort(lend_co_data_num)')      # the sorted copy keeps the original shape

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

' np.sort(lend_co_data_num): | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

In [17]:
# Sort the values within each column (axis=0), and turn off scientific
# notation so the output is easier to read.
np.set_printoptions(suppress=True)          # global setting: applies to all later output
display(cols_sorted := np.sort(lend_co_data_num, axis=0))
show_attr('cols_sorted')
# Each column is sorted in increasing order independently of the others,
# so a row of the result no longer corresponds to one original record.

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

' cols_sorted: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

In [18]:
# Sorting in reverse (descending) order: np.sort has no parameter for it,
# so negate the data, sort ascending, then negate the result again.
display(np.sort(-lend_co_data_num)) # same as (-1) * data: all values negated, then sorted
-np.sort(-lend_co_data_num)         # second negation restores the original values, now descending

array([[-13621.,  -4241.,  -3121.,  -2000.,   -365.,    -40.],
       [-15041.,  -4171.,  -3061.,  -2000.,   -365.,    -40.],
       [-15340.,  -3280.,  -2160.,  -1000.,   -365.,    -40.],
       ...,
       [-16600.,  -5001.,  -4201.,  -2000.,   -365.,    -40.],
       [-15600.,  -3320.,  -2080.,  -1000.,   -365.,    -40.],
       [-16600.,  -4601.,  -4601.,  -2000.,   -365.,    -40.]])

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [19]:
# Sorting individual columns or rows -> use indexing.
# Sort only the 4th column (index 3).
# Basic slicing returns a view that shares memory with the array, so take an
# explicit copy: a view would change together with the array and could not
# be used to restore the column later.
display(orig_c3 := lend_co_data_num[:,3].copy())   # independent backup of column 3
display(lend_co_data_num)                   # the original data, for reference
np.sort(lend_co_data_num[:,3])      # returns a sorted copy of column 3 only; the array is unchanged

array([3121., 3061., 2160., ..., 4201., 2080., 4601.])

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [20]:
# To change column 3 inside the 2-D array itself there are two ways:
# assign a sorted copy back into the column (done here), or call the
# in-place .sort() method on the column slice.
orig_c3 = np.copy(lend_co_data_num[:,3])      # independent copy, to restore later
display(orig_c3)
lend_co_data_num[:,3] = np.sort(lend_co_data_num[:,3])  # overwrite column 3 with its sorted values
lend_co_data_num

array([3121., 3061., 2160., ..., 4201., 2080., 4601.])

array([[ 2000.,    40.,   365., -2870.,  4241., 13621.],
       [ 2000.,    40.,   365., -2550.,  4171., 15041.],
       [ 1000.,    40.,   365., -2450.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., 16751.,  5001., 16600.],
       [ 1000.,    40.,   365., 17650.,  3320., 15600.],
       [ 2000.,    40.,   365., 19001.,  4601., 16600.]])

In [21]:
# Restore column 3 to its original values from the saved copy.
lend_co_data_num[:,3] = orig_c3
display(lend_co_data_num)


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [22]:
# Use the .sort() method to change the column inside the array.
# The column slice is a view of the array, so sorting it in place
# overwrites column 3 of the original data.
lend_co_data_num[:,3].sort()    # sorts in place; nothing is returned
lend_co_data_num

array([[ 2000.,    40.,   365., -2870.,  4241., 13621.],
       [ 2000.,    40.,   365., -2550.,  4171., 15041.],
       [ 1000.,    40.,   365., -2450.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., 16751.,  5001., 16600.],
       [ 1000.,    40.,   365., 17650.,  3320., 15600.],
       [ 2000.,    40.,   365., 19001.,  4601., 16600.]])

In [23]:
# Sort every column of the entire dataset in place.
# Each column is ordered independently, so the original rows are not kept together.
lend_co_data_num.sort(axis=0)   # ascending order within each column, in place
lend_co_data_num

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

### Notes and Examples from the Manual - numpy.sort
- Sorting algorithms (the `kind` parameter): they differ in average speed, worst-case performance, work-space size, and whether they are stable - to be covered later (FUTURE JM!!).

In [26]:
# Small 2x2 example: the same array sorted along different axes.
a = np.array([[1, 4], [3, 1]])
display(a)                      # the unsorted input
display(np.sort(a))             # default axis=-1: sort within each row
display(np.sort(a, axis=None))  # flatten first, then sort
np.sort(a, axis=0)              # sort within each column

array([[1, 4],
       [3, 1]])

array([[1, 4],
       [1, 3]])

array([1, 1, 3, 4])

array([[1, 1],
       [3, 4]])

In [27]:
# Use the order keyword to specify a field to use when sorting a structured array.
# Each record has three named fields: name (byte string, up to 10 chars),
# height (float) and age (int).
dtype = [('name', 'S10'), ('height', float), ('age', int)]
values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38),
          ('Galahad', 1.7, 38)]
a = np.array(values, dtype=dtype)       # create a structured array
np.sort(a, order='height')              # records ordered by ascending height

array([(b'Galahad', 1.7, 38), (b'Arthur', 1.8, 41),
       (b'Lancelot', 1.9, 38)],
      dtype=[('name', 'S10'), ('height', '<f8'), ('age', '<i4')])

In [None]:
# Sort by age, then by height if ages are equal.
# With the records above: Galahad (38, 1.7), Lancelot (38, 1.9), Arthur (41, 1.8).
np.sort(a, order=['age', 'height'])