# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them up).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_07 Argument Where in NumPy

#### numpy.argwhere(a)
- Find the indices of array elements that are non-zero, grouped by element.

In [1]:
import numpy as np
np.__version__

'1.26.4'

In [2]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's shape, ndim, size and dtype.

    ``arrnm`` is the *name* of the array (or an expression yielding one),
    not the array object itself: every attribute is looked up by evaluating
    the text ``"<arrnm>.<attribute>"`` with ``eval``.

    NOTE: because ``eval`` is used, pass only trusted, hard-coded names.
    """
    parts = [f' {arrnm}: ']

    for field in ('shape', 'ndim', 'size', 'dtype'):     #, 'itemsize'):
        value = eval(f'{arrnm}.{field}')
        parts.append(f'| {field}: {value} ')

    return ''.join(parts)

In [3]:
# Work with a dataset that contains no NaNs: read the comma-separated file
# straight into a float array.
lend_co_data_num = np.loadtxt('Lending-Company-Numeric-Data.csv', delimiter=',')

# Show the attribute summary first, then the array itself as the cell result.
display(show_attr('lend_co_data_num'))
lend_co_data_num

' lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
np.argsort(lend_co_data_num)    # default axis=-1: the last axis (1), i.e. the column values within each row

## The indices that would sort the array

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]], dtype=int64)

In [5]:
# Sort every column independently (axis=0) and show the row indices that give
# that ordering. Then spot-check three of those indices against the sorted
# output; the values printed should be -350.0 | -2000.0 | 17650.0
np.set_printoptions(precision=2, suppress=True)

display(np.sort(lend_co_data_num, axis=0))
display(np.argsort(lend_co_data_num, axis=0))
print(lend_co_data_num[482, 5],
      lend_co_data_num[85, 4],
      lend_co_data_num[718, 3],
      sep=' | ')

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]], dtype=int64)

-350.0 | -2000.0 | 17650.0


In [6]:
## Using argsort to arrange individual rows or cols
# Ex. sort the values of the 1st col and get the row indices of that ordering
display(np.argsort(lend_co_data_num[:,0]))       # USEFUL!

## We can use this order to rearrange all the rows based on their values in the 1st col.
display(lend_co_data_num)
lend_co_data_num = lend_co_data_num[np.argsort(lend_co_data_num[:,0])]
lend_co_data_num
# The values in the first col are now in ascending order while the other cols
# are not individually sorted: whole rows were moved, ordered by their first value.
# Every row remains intact; the rows are just arranged in ascending order
# of the value stored in the first position of each row.
## JM: like sorting by a certain field (col) in Excel.
## This works because data[indices] applies the same row order to every col.

array([537, 639, 849, ...,  27, 277, 420], dtype=int64)

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [7]:
## USEFUL e.g. when each row of a dataset holds the information of one specific
# client (or date), since we want to keep each row intact while reordering.
# E.g. when analyzing a portfolio of stocks, each row in the dataset can
# represent the stock of a different company.
# Load the sample data from sjm.txt
# NOTE(review): the original note referred to stockjm.csv - confirm which file is intended.
display(sjm := np.loadtxt('sjm.txt', delimiter=','))
# Company ids are 101, 102, ... in column 0.
# NOTE(review): the original note also mentions a first row of dates - confirm against the file.

# Order the rows ascending by the values in column 1 (called "2021" in the original note)
display(ix_col1 := np.argsort(sjm[:,1]))
sjm[ix_col1]    # reuse the indices computed above instead of running argsort a second time

array([[101.  ,  29.79,  90.96,  29.48, 130.  ,  32.38],
       [102.  ,  54.4 ,  55.09,  94.6 ,  54.53,  51.89],
       [103.  , 272.11, 278.97, 276.02, 275.24, 166.26],
       [104.  , 191.83, 102.89,  99.71,  39.29,  96.81],
       [105.  , 216.07, 304.49, 391.25, 282.47, 278.7 ],
       [106.  , 182.54, 184.58, 174.39, 176.43, 172.37],
       [107.  , 172.9 , 171.83, 171.02, 168.76, 166.15],
       [108.  ,  42.88,  43.3 ,  42.52,  43.42, 142.09],
       [109.  ,  20.97,  21.2 ,  21.72,  22.25, 122.38],
       [110.  , 377.56, 178.59, 360.46, 348.18, 339.64]])

array([8, 0, 7, 1, 6, 5, 3, 4, 2, 9], dtype=int64)

array([[109.  ,  20.97,  21.2 ,  21.72,  22.25, 122.38],
       [101.  ,  29.79,  90.96,  29.48, 130.  ,  32.38],
       [108.  ,  42.88,  43.3 ,  42.52,  43.42, 142.09],
       [102.  ,  54.4 ,  55.09,  94.6 ,  54.53,  51.89],
       [107.  , 172.9 , 171.83, 171.02, 168.76, 166.15],
       [106.  , 182.54, 184.58, 174.39, 176.43, 172.37],
       [104.  , 191.83, 102.89,  99.71,  39.29,  96.81],
       [105.  , 216.07, 304.49, 391.25, 282.47, 278.7 ],
       [103.  , 272.11, 278.97, 276.02, 275.24, 166.26],
       [110.  , 377.56, 178.59, 360.46, 348.18, 339.64]])

In [8]:
## ndarray.argsort() doesn't overwrite the original array (unlike ndarray.sort())
# np.argsort(a) is equivalent to a.argsort(): both return a new array of indices
# np.sort(a) != a.sort(): np.sort() returns a sorted copy, a.sort() sorts in place

### Notes and Examples from Manual - np.argsort()
- As of NumPy 1.4.0 argsort works with real/complex arrays containing nan values. The enhanced sort order is documented in sort.

In [9]:
# 1-D array: argsort returns the positions that would put the values in order
x1 = np.array([3, 1, 2])
display(x1)
display(np.argsort(x1))

array([3, 1, 2])

array([1, 2, 0], dtype=int64)

In [10]:
# 2-D array
x2 = np.array([[0, 3], [2, 2]])
display(x2)

# Indices that sort along the first axis (down each column)
ind = np.argsort(x2, axis=0)
display(ind)

# Applying those indices along the same axis gives the same result as np.sort(x2, axis=0)
np.take_along_axis(x2, ind, axis=0)

array([[0, 3],
       [2, 2]])

array([[0, 1],
       [1, 0]], dtype=int64)

array([[0, 2],
       [2, 3]])

## Preparing real data -> stockjm.csv

In [11]:
### JM: prepare the stock dataset from stock.txt
np.set_printoptions(linewidth=150)

# Read every field as text
display(stock := np.genfromtxt('stock.txt',
                               delimiter=',',
                               dtype=str))

display(show_attr('stock'))
# Drop the last four columns; np.delete returns a new array
display(s1 := np.delete(stock, (-1,-2,-3,-4), axis=1))
show_attr('s1')
# Alternative (not run): pick the columns to keep by index instead
# display(s2 := stock[:,(0,1,2,3,4,5)])
# show_attr('s2')

array([['Date', '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-08', '2020-09-09', '2020-09-10', '2020-09-11', '2020-09-14'],
       ['GM', '29.790001', '30.959999', '29.480000', '30.000000', '32.380001', '31.950001', '30.170000', '30.459999', '31.180000'],
       ['ETFC', '54.400002', '55.090000', '54.599998', '54.529999', '51.889999', '52.939999', '52.139999', '52.240002', '53.000000'],
       ['ANTM', '272.109131', '278.974640', '276.015198', '275.237976', '266.260010', '267.420013', '260.839996', '258.029999', '265.600006'],
       ['AME', '101.832825', '102.890884', '99.706734', '99.287506', '96.812050', '99.167725', '97.930000', '99.910004', '101.389999'],
       ['MCO', '296.070007', '304.489990', '291.250000', '282.470001', '278.700012', '288.410004', '283.779999', '284.940002', '288.679993'],
       ['URI', '182.539993', '184.580002', '174.389999', '176.429993', '172.369995', '175.889999', '171.960007', '174.630005', '176.580002'],
       ['CME', '172.895493',

' stock: | shape: (11, 10) | ndim: 2 | size: 110 | dtype: <U10 '

array([['Date', '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-08'],
       ['GM', '29.790001', '30.959999', '29.480000', '30.000000', '32.380001'],
       ['ETFC', '54.400002', '55.090000', '54.599998', '54.529999', '51.889999'],
       ['ANTM', '272.109131', '278.974640', '276.015198', '275.237976', '266.260010'],
       ['AME', '101.832825', '102.890884', '99.706734', '99.287506', '96.812050'],
       ['MCO', '296.070007', '304.489990', '291.250000', '282.470001', '278.700012'],
       ['URI', '182.539993', '184.580002', '174.389999', '176.429993', '172.369995'],
       ['CME', '172.895493', '171.830948', '171.015106', '168.756653', '166.149994'],
       ['PFG', '42.880001', '43.299999', '42.520000', '43.419998', '42.090000'],
       ['KSS', '20.969999', '21.200001', '21.719999', '22.250000', '22.379999'],
       ['MSCI', '377.559998', '378.589996', '360.459991', '348.179993', '339.640015']], dtype='<U10')

' s1: | shape: (11, 6) | ndim: 2 | size: 66 | dtype: <U10 '

In [12]:
# Replace the label in column 0 of every row with the text of 100 + row index,
# written as a single vectorized column assignment.
s1[:, 0] = np.arange(100, 100 + s1.shape[0]).astype(s1.dtype)
s1

array([['100', '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-08'],
       ['101', '29.790001', '30.959999', '29.480000', '30.000000', '32.380001'],
       ['102', '54.400002', '55.090000', '54.599998', '54.529999', '51.889999'],
       ['103', '272.109131', '278.974640', '276.015198', '275.237976', '266.260010'],
       ['104', '101.832825', '102.890884', '99.706734', '99.287506', '96.812050'],
       ['105', '296.070007', '304.489990', '291.250000', '282.470001', '278.700012'],
       ['106', '182.539993', '184.580002', '174.389999', '176.429993', '172.369995'],
       ['107', '172.895493', '171.830948', '171.015106', '168.756653', '166.149994'],
       ['108', '42.880001', '43.299999', '42.520000', '43.419998', '42.090000'],
       ['109', '20.969999', '21.200001', '21.719999', '22.250000', '22.379999'],
       ['110', '377.559998', '378.589996', '360.459991', '348.179993', '339.640015']], dtype='<U10')

In [13]:
# Replace the dates in row 0 (columns 1 onward) with the text of
# 20200 + column index, i.e. '20201', '20202', ...
s1[0, 1:] = np.arange(20201, 20200 + s1.shape[1]).astype(s1.dtype)
s1

array([['100', '20201', '20202', '20203', '20204', '20205'],
       ['101', '29.790001', '30.959999', '29.480000', '30.000000', '32.380001'],
       ['102', '54.400002', '55.090000', '54.599998', '54.529999', '51.889999'],
       ['103', '272.109131', '278.974640', '276.015198', '275.237976', '266.260010'],
       ['104', '101.832825', '102.890884', '99.706734', '99.287506', '96.812050'],
       ['105', '296.070007', '304.489990', '291.250000', '282.470001', '278.700012'],
       ['106', '182.539993', '184.580002', '174.389999', '176.429993', '172.369995'],
       ['107', '172.895493', '171.830948', '171.015106', '168.756653', '166.149994'],
       ['108', '42.880001', '43.299999', '42.520000', '43.419998', '42.090000'],
       ['109', '20.969999', '21.200001', '21.719999', '22.250000', '22.379999'],
       ['110', '377.559998', '378.589996', '360.459991', '348.179993', '339.640015']], dtype='<U10')

In [14]:
# savetxt - comment in order not to write every run!
# np.savetxt('stockjm.csv',
#            s1,
#            delimiter=',',
#            fmt='%s')

In [15]:
# Read the prepared file back with loadtxt
np.set_printoptions(linewidth=75, precision=2)

stockjm = np.loadtxt('stockjm.csv', delimiter=',')
display(stockjm)
show_attr('stockjm')

array([[  100.  , 20201.  , 20202.  , 20203.  , 20204.  , 20205.  ],
       [  101.  ,    29.79,    30.96,    29.48,    30.  ,    32.38],
       [  102.  ,    54.4 ,    55.09,    54.6 ,    54.53,    51.89],
       [  103.  ,   272.11,   278.97,   276.02,   275.24,   266.26],
       [  104.  ,   101.83,   102.89,    99.71,    99.29,    96.81],
       [  105.  ,   296.07,   304.49,   291.25,   282.47,   278.7 ],
       [  106.  ,   182.54,   184.58,   174.39,   176.43,   172.37],
       [  107.  ,   172.9 ,   171.83,   171.02,   168.76,   166.15],
       [  108.  ,    42.88,    43.3 ,    42.52,    43.42,    42.09],
       [  109.  ,    20.97,    21.2 ,    21.72,    22.25,    22.38],
       [  110.  ,   377.56,   378.59,   360.46,   348.18,   339.64]])

' stockjm: | shape: (11, 6) | ndim: 2 | size: 66 | dtype: float64 '