# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_07 Argument Where in NumPy

#### numpy.argwhere(a)
- Find the indices of array elements that are non-zero, grouped by element.
- JM: More generally, when given a condition as its argument, the function returns the indices of the elements for which the condition is True (evaluated element-wise).
- Unlike np.argsort(), np.argwhere() does not have an equivalent ndarray method.

#### 'Argument' functions - Name origins -> Astronomy
- Astronomers recorded the location of planets and other celestial bodies in the black sky in tables used to determine where each one sits in space.
- These tables would be filled based on the angle at which the celestial objects reflect light.
- And those very same angles were called __arguments__.
- The name literally translates to "that which elucidates something else". (El nombre se traduce literalmente como "aquello que aclara algo más".)
- So, you can think of arguments as coordinates in space, which perfectly translates to how indices work in N-D arrays.
- Therefore, functions that return coordinates (indices) within an array are called __argument functions__.

In [1]:
import numpy as np
np.__version__

'1.26.4'

In [2]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's shape, ndim, size and dtype.

    Args:
        arrnm: The *name* of the array (or any expression string that
            evaluates to one), not the array object itself.

    Returns:
        A string of the form
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``.
    """
    pieces = [f' {arrnm}: ']

    for field in ('shape', 'ndim', 'size', 'dtype'):
        # NOTE: eval() resolves "<arrnm>.<field>" by name, so only pass
        # trusted strings to this helper.
        value = eval(f'{arrnm}.{field}')
        pieces.append(f'| {field}: {value} ')

    return ''.join(pieces)

In [3]:
# Let's work with a dataset without NaNs (comma-delimited numeric CSV)

lend_co_data_num = np.loadtxt('Lending-Company-Numeric-Data.csv',
                              delimiter=',')

# show_attr() takes the variable *name* as a string, not the array itself
display(show_attr('lend_co_data_num'))
lend_co_data_num

' lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
# As the manual indicates, np.argwhere() returns the indices of the NON-ZERO elements
display(np.argwhere(lend_co_data_num))

## Each row of the result holds the (row, col) indices of one non-zero value
print(lend_co_data_num[0,1], ' - ', lend_co_data_num[1042,4])

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

40.0  -  4601.0


In [5]:
# To find the elements equal to zero, pass a condition instead of the raw array.
# Comparing against False or against 0 selects the same elements.
display(np.argwhere(lend_co_data_num == False))
display(np.argwhere(lend_co_data_num == 0))
display(lend_co_data_num[116])                  # row 116: its 5th element (index 4) is the zero
print(lend_co_data_num[116,4], ' - ', lend_co_data_num[430,3])

array([[116,   4],
       [430,   3]], dtype=int64)

array([[116,   4],
       [430,   3]], dtype=int64)

array([ 1000.,    50.,   365., -1450.,     0., 13850.])

0.0  -  0.0


In [6]:
# np.argwhere(condition) returns the indices of the elements for which the condition is True.
# E.g. elements < 41, odd elements, negative elements, or any other element-wise condition.
display(np.argwhere(lend_co_data_num < 41))
display(np.argwhere((lend_co_data_num < 40) & (lend_co_data_num > 0)))
print(lend_co_data_num[327,1], ' - ', lend_co_data_num[443,1])

# Indices of the values that are not divisible by 2
display(np.argwhere(lend_co_data_num % 2 != 0))
print(lend_co_data_num[0])
print(lend_co_data_num[1042])

array([[   0,    1],
       [   1,    1],
       [   2,    1],
       ...,
       [1040,    1],
       [1041,    1],
       [1042,    1]], dtype=int64)

array([[327,   1],
       [432,   1],
       [443,   1],
       [816,   1]], dtype=int64)

35.0  -  35.0


array([[   0,    2],
       [   0,    3],
       [   0,    4],
       ...,
       [1042,    2],
       [1042,    3],
       [1042,    4]], dtype=int64)

[ 2000.    40.   365.  3121.  4241. 13621.]
[ 2000.    40.   365.  4601.  4601. 16600.]


#### np.argwhere vs conditional slicing filtering (boolean indexing)
- Conditional slicing (boolean indexing) gives us the actual values. > Use it if you need a new array containing the values, which you can use in computations directly.
- np.argwhere() returns their coordinates within the array. > Use it if you need the indices of these values to obtain other information about your dataset before you proceed with the next step of your analytic work.

In [7]:
## TODO: add some examples comparing np.argwhere() vs. conditional slicing (boolean indexing)

In [8]:
## np.argwhere combined with np.isnan() - dataset with missing values (';'-delimited)
lending_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter=';')
display(lending_NAN)
np.isnan(lending_NAN).sum()       # total number of NaNs in the dataset

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

260

In [9]:
# With np.isnan() we get an entire boolean array (True in the places where there is a NaN)
display(np.isnan(lending_NAN))
#display(lending_NAN[np.isnan(lending_NAN)])     # Show all the NaNs

# Then, to get the (row, col) coordinates of all missing values (NaNs)
display(np.argwhere(np.isnan(lending_NAN)))
print(lending_NAN[175])                         # To see a couple of them

# USEFUL -> discover the missing values in our dataset efficiently

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

[ 2000.    nan    nan  1851.  3051. 13561.]


In [10]:
## np.argwhere(np.isnan()) is useful to replace all NaNs: it gives us their coordinates
for arr_ix in np.argwhere(np.isnan(lending_NAN)):   # arr_ix is a 1-D array with two values: the (row, col) of one NaN
    lending_NAN[arr_ix[0], arr_ix[1]] = 0

display(lending_NAN)
# Verify the array we just filled (not the NaN-free lend_co_data_num); must be 0
display(np.isnan(lending_NAN).sum())
print(lending_NAN[175])                             # Same row as before, now without NaNs


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [    0.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

0

[ 2000.     0.     0.  1851.  3051. 13561.]


In [11]:
## JM tries: fill the NaNs of a small array with pi rounded to 4 decimals
display(A := np.array([[1,np.nan,3], [np.nan,8,7]]))
for arr_ix in np.argwhere(np.isnan(A)):
    print(arr_ix, type(arr_ix))
    # Built-in round() instead of calling the __round__ dunder directly
    A[arr_ix[0], arr_ix[1]] = round(np.pi, 4)
A

array([[ 1., nan,  3.],
       [nan,  8.,  7.]])

[0 1] <class 'numpy.ndarray'>
[1 0] <class 'numpy.ndarray'>


array([[1.    , 3.1416, 3.    ],
       [3.1416, 8.    , 7.    ]])

### Notes and Examples from Manual - np.argwhere
- np.argwhere(a) is almost the same as np.transpose(np.nonzero(a)), but produces a result of the correct shape for a 0D array.
- The output of argwhere is not suitable for indexing arrays. For this purpose use nonzero(a) instead.

In [12]:
display(x := np.arange(6).reshape(2,3))
display(np.argwhere(x > 1))                # Indices of the elements (of x) > 1

for arr_ix in np.argwhere(x > 1):          # Each arr_ix is one (row, col) pair; print the element it points to
    print(x[arr_ix[0], arr_ix[1]], end='  ')

array([[0, 1, 2],
       [3, 4, 5]])

array([[0, 2],
       [1, 0],
       [1, 1],
       [1, 2]], dtype=int64)

2  3  4  5  

In [13]:
display(gt_1 :=np.nonzero(x > 1), type(gt_1))   # tuple of 2 ndarrays (row indices, col indices)
# np.transpose(gt_1)                            # Same result as np.argwhere(x > 1)
display(np.transpose(np.nonzero(x > 1)))

display(x[np.nonzero(x > 1)])                    # The tuple indexes x directly: the elements > 1

# Assign a new value to those same elements
x[np.nonzero(x > 1)] = 99
x


(array([0, 1, 1, 1], dtype=int64), array([2, 0, 1, 2], dtype=int64))

tuple

array([[0, 2],
       [1, 0],
       [1, 1],
       [1, 2]], dtype=int64)

array([2, 3, 4, 5])

array([[ 0,  1, 99],
       [99, 99, 99]])

In [14]:
# Iteration over the np.nonzero() result (a tuple of index arrays)
for tp in np.nonzero(x):
    print(tp, end='  -  ')
    # Unpack the indices instead of looping over range(len(tp)):
    # same two-space separator, and the line always ends with a newline
    print(*tp, sep='  ')

[0 0 1 1 1]  -  0  0  1  1  1
[1 2 0 1 2]  -  1  2  0  1  2


In [18]:
# Let's replicate the replacement of NaNs, this time indexing with np.nonzero()
lending_NANb = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter=';')
display(lending_NANb)
print(np.isnan(lending_NANb).sum(), ' | ',np.nanmin(lending_NANb))

# display(lending_NANb[np.nonzero(np.isnan(lending_NANb))])
# Fill every NaN with (minimum of the non-NaN values - 1) in a single assignment, no loop needed
lending_NANb[np.nonzero(np.isnan(lending_NANb))] = np.nanmin(lending_NANb) - 1
display(lending_NANb)
print(np.isnan(lending_NANb).sum(), ' | ',np.nanmin(lending_NANb))

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

260  |  -2870.0


array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [-2871.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

0  |  -2871.0


In [16]:
# Because the result of np.nonzero() is a tuple of arrays, we could also use a for loop
# (np.transpose() turns that tuple into one (row, col) pair per NaN)
lending_NANc = np.array([[1,np.nan,3], [np.nan,8,7]])
display(lending_NANc)
print(np.isnan(lending_NANc).sum(), ' | ',np.nanmax(lending_NANc))

# Store the maximum once, before the loop starts writing larger values into the array
print(' -> Original_max:', original_max := np.nanmax(lending_NANc))
for arr_tp in np.transpose(np.nonzero(np.isnan(lending_NANc))):
    # print(arr_tp, arr_tp[0], arr_tp[1])
    lending_NANc[arr_tp[0], arr_tp[1]] = original_max + 1
display(lending_NANc)
print(np.isnan(lending_NANc).sum(), ' | ',np.nanmax(lending_NANc))

array([[ 1., nan,  3.],
       [nan,  8.,  7.]])

2  |  8.0
 -> Original_max: 8.0


array([[1., 9., 3.],
       [9., 8., 7.]])

0  |  9.0


In [17]:
# Same for-loop replacement as with the small array, now applied to the full dataset
np.set_printoptions(suppress=True)
data = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter=';')
data[1,1] = data[3,4] = np.nan      # add two extra NaNs by hand
display(data)
print(np.isnan(data).sum(), ' | ',np.nanmax(data))
# display(np.nonzero(np.isnan(data)))
# display(np.transpose(np.nonzero(np.isnan(data))))
# display(np.argwhere(np.isnan(data)))

# Store the maximum once, before the loop starts writing larger values into the array
print(' -> Original_max:', original_max := np.nanmax(data))
for arr_tp in np.transpose(np.nonzero(np.isnan(data))):
    # print(arr_tp, arr_tp[0], arr_tp[1])
    data[arr_tp[0], arr_tp[1]] = original_max +1
display(data)
print(np.isnan(data).sum(), ' | ',np.nanmax(data))

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    nan,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

262  |  64001.0
 -> Original_max: 64001.0


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000., 64002.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [64002.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

0  |  64002.0
