# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_08 Shuffling NDarrays
- Different ways we can randomly rearrange N-D arrays.
- Rearranging the parts of a dataset.
- We do so without a fixed pattern.
- The end goal is that a random sample would be representative of the entire dataset.

#### random.shuffle(x)
- Modify a sequence in-place by shuffling its contents.
- This function only shuffles the array along the first axis of a multi-dimensional array. The order of sub-arrays is changed but their contents remain the same.

#### random.Generator.shuffle(x, axis=0) <- method
- Modify an array or sequence in-place by shuffling its contents.
- The order of sub-arrays is changed but their contents remain the same.

#### Imagine a dataset as a deck (baraja-mazo) of cards.
- each row is a different card.
- When we shuffle (barajar - rearrange - mix) the deck, the order of the cards gets mixed up, but the cards remain whole and we don't lose any of them.
- Keeping rows intact is often crucial.

In [17]:
# Import NumPy and show the installed version for reference.
import numpy as np
np.__version__

'1.26.4'

In [18]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's basic attributes.

    Parameters
    ----------
    arrnm : str
        Name of (or expression for) the array, given as a string. It is used
        both as the label in the output and, through ``eval``, to look up
        each attribute.

    Returns
    -------
    str
        Text of the form
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``.

    Notes
    -----
    ``eval`` executes arbitrary code, so only pass trusted, hard-coded names.
    """
    fields = []
    for attr_name in ('shape', 'ndim', 'size', 'dtype'):
        # Evaluate "<arrnm>.<attr_name>" to fetch the attribute by name.
        attr_value = eval(f'{arrnm}.{attr_name}')
        fields.append(f'| {attr_name}: {attr_value} ')

    return f' {arrnm}: ' + ''.join(fields)

In [19]:
# To see the effect more clearly, work with only the first 8 rows of the dataset.

lend_co_8rows = np.loadtxt('Lending-Company-Numeric-Data.csv',
                           delimiter=',')[:8]

# show_attr takes the variable's name as a string and summarizes its
# shape, ndim, size and dtype.
display(show_attr('lend_co_8rows'))
lend_co_8rows

' lend_co_8rows: | shape: (8, 6) | ndim: 2 | size: 48 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [20]:
# np.random.shuffle reorders the rows in place and returns None,
# which is why display() shows None here.
display(np.random.shuffle(lend_co_8rows))   # <- in-place; the return value is None.
lend_co_8rows

None

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.]])

In [21]:
# Load the entire dataset (all rows), display it, and summarize its attributes.
lend_co_num = np.loadtxt('Lending-Company-Numeric-Data.csv', delimiter=',')
display(lend_co_num)
show_attr('lend_co_num')

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

' lend_co_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

In [22]:
# NOTE: if you use a function or object many times, it is practical to
# import it directly to avoid long dotted names such as
# numpy.random.Generator and numpy.random.PCG64. We do that for shuffle here.
from numpy.random import shuffle

In [23]:
# Same numpy.random.shuffle function as before, called through the short name:
# the rows of lend_co_num are reordered in place.
shuffle(lend_co_num)
lend_co_num

array([[ 2000.,    40.,   365.,  3121.,  5161., 13441.],
       [ 4000.,    50.,   365.,  5500.,  6800., 14665.],
       [ 1000.,    40.,   365.,  2170.,  3570., 15600.],
       ...,
       [ 2000.,    50.,   365.,   850.,  1900., 10850.],
       [ 4000.,    50.,   365.,  5700.,  7400., 22250.],
       [ 4000.,    50.,   365.,  5350.,  6600., 13470.]])

In [24]:
# Generator shuffle method
# Short aliases for the Generator class and the PCG64 bit generator.
from numpy.random import Generator as gen 
from numpy.random import PCG64 as pcg 

# No seed is passed to PCG64, so the shuffle order is expected to differ on every run.
array_RG = gen(pcg())
# Shuffles lend_co_num in place (it was already shuffled once by shuffle() above).
array_RG.shuffle(lend_co_num)
lend_co_num

array([[ 1000.,    40.,   365.,  2280.,  3280., 14280.],
       [ 2000.,    40.,   365.,  2400.,  2400.,  2600.],
       [ 1000.,    40.,   365.,  2700.,  3500., 11610.],
       ...,
       [ 2000.,    50.,   365.,  4251.,  4951., 20250.],
       [ 2000.,    40.,   365.,  5001.,  5001., 16600.],
       [ 2000.,    40.,   365.,  3080.,  4300., 16600.]])

In [25]:
# np.random.shuffle is different from the array_RG.shuffle method:
# - np.random.shuffle is a function of the numpy.random module.
# - array_RG.shuffle is a method of a Generator object (created above as gen(pcg())).

In [26]:
## NOTE: a seed fixes the permutation that the generator draws, but it cannot
## undo earlier changes to the data: lend_co_num was already shuffled in place
## by the unseeded calls in the previous cells, so the row order obtained here
## still differs from run to run. To replicate a shuffle exactly, start from
## the same array order (e.g. reload the file) and use the same seed.
array_RG2 = gen(pcg(seed=365))
array_RG2.shuffle(lend_co_num)
lend_co_num
# We want the shuffled array to be as unpredictable as possible.

array([[ 4000.,    50.,   365.,  6350.,  7050., 22250.],
       [ 2000.,    50.,   365.,  3351.,  4951., 20250.],
       [ 2000.,    40.,   365.,  3200.,  4320., 16600.],
       ...,
       [ 4000.,    50.,   365.,  5450.,  6050., 22250.],
       [ 2000.,    40.,   365.,  3300.,  4280., 16600.],
       [ 1000.,    40.,   365.,  2360.,  3420., 15600.]])

#### BUT random.Generator.shuffle accepts an `axis=` argument, so it is not limited to the first axis
- random.Generator.shuffle(x, axis=0)
    - Modify an array or sequence in-place by shuffling its contents.
    - The order of sub-arrays is changed but their contents remain the same.

In [27]:
# Build a fresh 4x4 array and show it before shuffling.
C = np.arange(16).reshape(4, 4)
display(C)
# No axis given: the default axis=0 moves whole rows around, in place.
array_RG.shuffle(C)
C

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

array([[ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [ 0,  1,  2,  3]])

In [28]:
# Build a fresh 4x4 array and show it before shuffling.
C = np.arange(16).reshape(4, 4)
display(C)
# Explicit axis=0: whole rows are moved around, in place.
array_RG.shuffle(C, axis=0)
C

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

array([[ 0,  1,  2,  3],
       [12, 13, 14, 15],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [29]:
# Build a fresh 4x4 array and show it before shuffling.
C = np.arange(16).reshape(4, 4)
display(C)
# axis=1: whole columns are moved around, in place; each row keeps its own values.
array_RG.shuffle(C, axis=1)
C

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

array([[ 1,  2,  0,  3],
       [ 5,  6,  4,  7],
       [ 9, 10,  8, 11],
       [13, 14, 12, 15]])

In [30]:
# axis must be an integer; passing axis=None raises a TypeError:
# display(C := np.arange(16).reshape(4,4))
# array_RG.shuffle(C, axis=None)
# TypeError: 'NoneType' object cannot be interpreted as an integer

### Notes and Examples from the Manual - np.random.shuffle

In [31]:
# 1-D array: the elements themselves are permuted in place.
arr = np.arange(10)
display(arr)
np.random.shuffle(arr)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

array([0, 8, 4, 2, 1, 7, 3, 6, 5, 9])

In [32]:
# 2-D array: multi-dimensional arrays are only shuffled along the first axis,
# so whole rows change position while each row's contents stay intact.
arr = np.arange(9).reshape(3, 3)
display(arr)
np.random.shuffle(arr)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])