# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_4 Removing Values from Ndarrays

#### numpy.delete(arr, obj, axis=None)
Return a new array with sub-arrays along an axis deleted. For a one dimensional array, this returns those entries not returned by arr[obj].

In [1]:
import numpy as np
np.__version__

'1.26.4'

In [2]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's basic attributes.

    Parameters
    ----------
    arrnm : str
        Name of a module-level array (or an expression yielding one), given
        as a string so that it can be echoed in the summary.

    Returns
    -------
    str
        Text of the form
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``.

    Raises
    ------
    NameError
        If ``arrnm`` refers to a name that is not defined at module level.
    AttributeError
        If the evaluated object lacks one of the reported attributes.

    Notes
    -----
    ``arrnm`` is passed to ``eval``; only use trusted, hard-coded names and
    never user-supplied text.
    """
    # Evaluate once, against the module globals only: this keeps the
    # function's own locals from shadowing a same-named array and avoids
    # mis-parsing compound expressions by gluing '.attr' onto the string.
    target = eval(arrnm, globals())

    strout = f' {arrnm}: '
    for attr in ('shape', 'ndim', 'size', 'dtype'):     #, 'itemsize'):
        strout += f'| {attr}: {getattr(target, attr)} '

    return strout

In [3]:
# Load the lending-company numeric dataset from CSV into a 2-D float array.
# NOTE(review): the original note says this dataset contains NaNs, but
#  np.loadtxt fails on empty fields - confirm whether this file really has
#  missing values (np.genfromtxt is the loader that tolerates them).

lend_co_data_num = np.loadtxt('Lending-Company-Numeric-Data.csv',
                              delimiter=',')

# Show the summary line first, then the array itself as the cell result.
display(show_attr('lend_co_data_num'))
lend_co_data_num

' lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
# Delete the first element of the array. Without axis= (default None),
#  np.delete works on the flattened input, so the result is a new 1-D
#  array: the flattened original minus its first element.
display(del_1st_el := np.delete(lend_co_data_num, 0))
show_attr('del_1st_el')

array([   40.,   365.,  3121., ...,  4601.,  4601., 16600.])

' del_1st_el: | shape: (6257,) | ndim: 1 | size: 6257 | dtype: float64 '

In [5]:
# To get rid of an entire row, pass axis=0: index 0 then refers to the
#  first row, and the result keeps its 2-D shape with one row fewer.
display(del_1st_row := np.delete(lend_co_data_num, 0, axis=0))
show_attr('del_1st_row')

array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

' del_1st_row: | shape: (1042, 6) | ndim: 2 | size: 6252 | dtype: float64 '

In [6]:
# Deleting the 1st column: with axis=1 the index 0 refers to a column,
#  so the result has the same number of rows and one column fewer.
display(del_1st_col := np.delete(lend_co_data_num, 0, axis=1))
show_attr('del_1st_col')

array([[   40.,   365.,  3121.,  4241., 13621.],
       [   40.,   365.,  3061.,  4171., 15041.],
       [   40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  4201.,  5001., 16600.],
       [   40.,   365.,  2080.,  3320., 15600.],
       [   40.,   365.,  4601.,  4601., 16600.]])

' del_1st_col: | shape: (1043, 5) | ndim: 2 | size: 5215 | dtype: float64 '

In [7]:
# Deleting multiple elements, rows or columns at once: pass a tuple or a
#  list of indices as the second argument (here columns 0, 2 and 4).
display(del_cols_0_2_4 := np.delete(lend_co_data_num, (0,2,4), axis=1))
show_attr('del_cols_0_2_4')

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

' del_cols_0_2_4: | shape: (1043, 3) | ndim: 2 | size: 3129 | dtype: float64 '

In [8]:
# Deleting rows and columns in a single statement. np.delete works along
#  one axis per call, so two calls are nested: one axis first, then the other.
# First display the original dataset (array) for comparison.
print(show_attr('lend_co_data_num'))
display(lend_co_data_num)


# Inner call drops columns 1, 2 and 4; outer call drops the first and the
#  last row (-1 counts from the end).
del_rs_0_last_cs_1_2_4 = np.delete(np.delete(lend_co_data_num,
                                   (1,2,4),
                                   axis=1), (0,-1), axis=0)

print(show_attr('del_rs_0_last_cs_1_2_4'))
del_rs_0_last_cs_1_2_4

 lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

 del_rs_0_last_cs_1_2_4: | shape: (1041, 3) | ndim: 2 | size: 3123 | dtype: float64 


array([[ 2000.,  3061., 15041.],
       [ 1000.,  2160., 15340.],
       [ 2000.,  3041., 15321.],
       ...,
       [ 2000.,  4240., 16600.],
       [ 2000.,  4201., 16600.],
       [ 1000.,  2080., 15600.]])

## numpy.delete() - Examples from the Manual
- np.s_[::] -> a nicer way to build up index tuples for arrays
- Boolean mask: It is often preferable to use a boolean mask, because it allows further use of the mask.

In [9]:
# np.s_[...] -> a nicer way to build index objects for arrays: slice
#  notation inside the brackets is turned into the matching slice object.
np.s_[1:9:2]

slice(1, 9, 2)

In [10]:
# Create a mask for array J, intended to delete cols 1 and 3
display(J := np.arange(1,45,3).reshape(3,5))
display(mask := np.ones_like(J, dtype=bool))
# NOTE: the line that would clear cols 1 and 3 is commented out, so the
#  mask stays all True and J[mask] below returns every element of J.
# mask[:,[1,3]] = False
display(mask)
# Indexing with a 2-D boolean mask returns a flattened 1-D array, so
#  np.delete() is the better tool when the 2-D shape must be kept.
display(J[mask])        # reshaped to 1D !!! - better here .delete()

array([[ 1,  4,  7, 10, 13],
       [16, 19, 22, 25, 28],
       [31, 34, 37, 40, 43]])

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

array([ 1,  4,  7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43])

In [11]:
# Deleting the 2nd row (index 1), passing axis=0 as the third positional
#  argument instead of as a keyword.
display(arr := np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]]))
np.delete(arr, 1, 0)

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

array([[ 1,  2,  3,  4],
       [ 9, 10, 11, 12]])

In [12]:
# Deleting cols using np.s_, a nicer way to build index objects for arrays.
# Despite the variable name, np.s_[::2] yields a slice object, not a tuple
#  (np.index_exp is the variant that wraps the slice in a tuple).
display(tup_np_s_1 := (np.s_[::2]))     # shows slice(None, None, 2)
np.delete(arr, np.s_[::2], axis=1)      # del every 2nd col from 0 (cols 0, 2)


slice(None, None, 2)

array([[ 2,  4],
       [ 6,  8],
       [10, 12]])

In [13]:
# Deleting cols with odd index: np.s_[1::2] selects cols 1, 3 and 5, so
#  only cols 0, 2 and 4 remain.
display(np.delete(lend_co_data_num, np.s_[1::2], axis=1))

array([[2000.,  365., 4241.],
       [2000.,  365., 4171.],
       [1000.,  365., 3280.],
       ...,
       [2000.,  365., 5001.],
       [1000.,  365., 3320.],
       [2000.,  365., 4601.]])

In [14]:
# Deleting several elements. With axis=None the indices refer to the
#  flattened array and the result is 1-D.
display(arr)                            # shown again as a reminder
display(np.delete(arr, [1,3,5], None))  # del flattened els 1, 3 and 5
np.delete(arr, np.s_[1::2])             # del all els with odd flattened index

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

array([ 1,  3,  5,  7,  8,  9, 10, 11, 12])

array([ 1,  3,  5,  7,  9, 11])

In [15]:
# .delete() is a way to filter. A boolean mask is often preferable.
display(arr := np.arange(12) + 1)
display(mask := np.ones(len(arr), dtype=bool))
mask[[0,2,4]] = False                   # mark els 0, 2 and 4 for removal
display(mask)
# Both indexing forms give the same result for this 1-D array.
display(result := arr[mask,...])
display(r1 := arr[mask])
# The np.delete call below is equivalent, but the mask approach above
#  allows further use of mask.
np.delete(arr, (0,2,4), axis=0) 

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

array([False,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True])

array([ 2,  4,  6,  7,  8,  9, 10, 11, 12])

array([ 2,  4,  6,  7,  8,  9, 10, 11, 12])

array([ 2,  4,  6,  7,  8,  9, 10, 11, 12])