# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_06 Argument Sort in NumPy

#### numpy.argsort(a, axis=-1, kind=None, order=None, *, stable=None)
- Returns the indices that would sort an array.
- Perform an indirect sort along the given axis using the algorithm specified by the kind keyword. It returns an array of indices of the same shape as a that index data along the given axis in sorted order.

In [1]:
import numpy as np
np.__version__

'1.26.4'

In [2]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's basic attributes.

    Args:
        arrnm: Name of an array (or an expression yielding one), resolved in
            the global namespace of the module where this function is defined.

    Returns:
        A string of the form
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``.

    Note:
        ``arrnm`` is handed to ``eval``; never call this with untrusted text.
    """
    # Resolve the name once, against globals only, so this helper's own locals
    # (e.g. ``attr``) can never shadow a user variable of the same name.
    arr = eval(arrnm, globals())

    fields = (f'| {attr}: {getattr(arr, attr)} '
              for attr in ('shape', 'ndim', 'size', 'dtype'))
    return f' {arrnm}: ' + ''.join(fields)

In [3]:
# Load the purely numeric version of the dataset (the one without NaNs)

lend_co_data_num = np.loadtxt(fname='Lending-Company-Numeric-Data.csv', delimiter=',')

# Show the array's basic attributes first, then the array itself
display(show_attr('lend_co_data_num'))
lend_co_data_num

' lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
# The default axis=-1 is the last axis (axis 1 here): the column values within each row are ranked
np.argsort(lend_co_data_num, axis=-1)

## The result holds, for each row, the indices that would sort that row

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]], dtype=int64)

In [5]:
# Sort every column independently (axis=0), show the row indices that produce that
# order, and use those indices to confirm three of the sorted values:
# -350.0 | -2000.0 | 17650.0
np.set_printoptions(suppress=True)

col_order = np.argsort(lend_co_data_num, axis=0)    # computed once, reused below

display(np.sort(lend_co_data_num, axis=0))
display(col_order)
# col_order[r, c] is the original row holding the r-th smallest value of column c,
# so the rows are looked up instead of being hard-coded (482, 85, 718 for this file).
print(lend_co_data_num[col_order[0, 5], 5], '|',
      lend_co_data_num[col_order[2, 4], 4], '|',
      lend_co_data_num[col_order[-2, 3], 3])

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]], dtype=int64)

-350.0 | -2000.0 | 17650.0


In [6]:
## argsort can also rank a single row or column
# Ex. rank the values of the 1st column and get the row indices in that order
first_col_order = np.argsort(lend_co_data_num[:, 0])
display(first_col_order)       # USEFUL!

## Apply that order to rearrange whole rows by their value in the 1st column.
lend_co_data_num = lend_co_data_num[first_col_order]
lend_co_data_num
# Only the first column ends up in ascending order; the other columns are not sorted
# on their own, because every row was moved as a unit.

array([537, 639, 849, ...,  27, 277, 420], dtype=int64)

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [None]:
## USEFUL, e.g., when each row of a dataset contains information about a specific client
# (or date), since we want to keep each row's information intact.
# Ex. when analyzing a portfolio of stocks, each row in the dataset can represent the
# stock of a different company.