# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- Hoy to discover missing values (and fill them up).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_1 Checking for Missing Values in Ndarrays

#### numpy.isnan(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature]) = <ufunc 'isnan'>
- Test element-wise for NaN and return result as a boolean array.

#### numpy.genfromtxt(fname, dtype=<class 'float'>, comments='#', delimiter=None, skip_header=0, skip_footer=0, converters=None, missing_values=None, filling_values=None, usecols=None, names=None, excludelist=None, deletechars=" !#$%&'()*+, -./:;<=>?@[\\]^{|}~", replace_space='_', autostrip=False, case_sensitive=True, defaultfmt='f%i', unpack=None, usemask=False, loose=True, invalid_raise=True, max_rows=None, encoding=None, *, ndmin=0, like=None)
- **filling_values=** ; variable, optional; The set of values to be used as default when the data are missing.
- Load data from a text file, with missing values handled as specified.
- Each line past the first skip_header lines is split at the delimiter character, and characters following the comments character are discarded.

In [15]:
import numpy as np
np.__version__

'1.26.4'

In [16]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    strout = f' {arrnm}: '

    for attr in ('shape', 'ndim', 'size', 'dtype'):     #, 'itemsize'):
            arrnm_attr = arrnm + '.' + attr
            strout += f'| {attr}: {eval(arrnm_attr)} '

    return strout

In [17]:
lend_co_data_num = np.loadtxt('Lending-Company-Numeric-Data.csv',
                              delimiter=',')

display(show_attr('lend_co_data_num'))
# lend_co_data_num[[0,1,-2,-1]]
display(lend_co_data_num)


' lend_co_data_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [18]:
# Let's check is there are NANs values - np.isnan() -> elementwise
# False equiv 0. True equiv 1. Any num != 0 equiv True

dtset = lend_co_data_num

display(np.isnan(dtset))
display(np.isnan(dtset)[np.isnan(dtset) == True]) # jm
display(np.isnan(dtset)[np.isnan(dtset) == False]) # jm
display(np.isnan(dtset).sum())
display(np.isnan(dtset).sum(axis=0))
display(np.sum(np.isnan(dtset)))
display(np.sum(np.isnan(dtset), axis=0))
display(np.sum(np.isnan(dtset), axis=1))

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

array([], dtype=bool)

array([False, False, False, ..., False, False, False])

0

array([0, 0, 0, 0, 0, 0])

0

array([0, 0, 0, 0, 0, 0])

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
# Let's work with a dataset that contains NANs

# lend_co_data_num_NAN = np.loadtxt('Lending-Company-Numeric-Data-NAN.csv',
#                                     delimiter=';')
# ValueError: could not convert string '' to float64 at row 11, column 4.

lend_co_data_num_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                                     delimiter=';')

display(show_attr('lend_co_data_num_NAN'))
# lend_co_data_num[[0,1,-2,-1]]
display(lend_co_data_num_NAN)

' lend_co_data_num_NAN: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [20]:
# Let's check is there are NANs values - np.isnan() -> elementwise
# False equiv 0. True equiv 1. Any num != 0 equiv True
dtset = lend_co_data_num_NAN

display(np.isnan(dtset))
display(np.isnan(dtset)[np.isnan(dtset) == True]) # jm
display(np.isnan(dtset)[np.isnan(dtset) == False]) # jm
display(np.isnan(dtset).sum())
display(np.isnan(dtset).sum(axis=0))
display(np.sum(np.isnan(dtset)))
display(np.sum(np.isnan(dtset), axis=0))
display(np.sum(np.isnan(dtset), axis=1))

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

array([False, False, False, ..., False, False, False])

260

array([34, 48, 52, 62, 38, 26])

260

array([34, 48, 52, 62, 38, 26])

array([0, 0, 0, ..., 1, 0, 0])

In [21]:
# We can user filling_values parameter of genfromtxt()
# Not 0 or gusees value better grater than maximun of the dtset

display(max_dtset:= np.nanmax(lend_co_data_num))
display(temp_fill:= np.nanmax(lend_co_data_num_NAN).round(2) + 1)

# Reload the dtset w/NANs changing NANs by temp_value
lend_co_data_num_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                                     delimiter=';',
                                     filling_values=temp_fill)

display(show_attr('lend_co_data_num_NAN'))
# lend_co_data_num[[0,1,-2,-1]]
print(lend_co_data_num_NAN)

64001.0

64002.0

' lend_co_data_num_NAN: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

[[2.0000e+03 4.0000e+01 3.6500e+02 3.1210e+03 4.2410e+03 1.3621e+04]
 [2.0000e+03 4.0000e+01 3.6500e+02 3.0610e+03 4.1710e+03 1.5041e+04]
 [1.0000e+03 4.0000e+01 3.6500e+02 2.1600e+03 3.2800e+03 1.5340e+04]
 ...
 [6.4002e+04 4.0000e+01 3.6500e+02 4.2010e+03 5.0010e+03 1.6600e+04]
 [1.0000e+03 4.0000e+01 3.6500e+02 2.0800e+03 3.3200e+03 1.5600e+04]
 [2.0000e+03 4.0000e+01 3.6500e+02 4.6010e+03 4.6010e+03 1.6600e+04]]


In [22]:
# Check the NANs in recent loaded dtset
dtset = lend_co_data_num_NAN

display(np.isnan(dtset))
display(np.isnan(dtset)[np.isnan(dtset) == True]) # jm
display(np.isnan(dtset)[np.isnan(dtset) == False]) # jm
display(np.isnan(dtset).sum())
display(np.isnan(dtset).sum(axis=0))
display(np.sum(np.isnan(dtset)))
display(np.sum(np.isnan(dtset), axis=0))
display(np.sum(np.isnan(dtset), axis=1))

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

array([], dtype=bool)

array([False, False, False, ..., False, False, False])

0

array([0, 0, 0, 0, 0, 0])

0

array([0, 0, 0, 0, 0, 0])

array([0, 0, 0, ..., 0, 0, 0])

### Notes and Examples from the Manual - numpy.isnan()
- NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic (IEEE 754). This means that Not a Number is not equivalent to infinity.
- np.inf -> IEEE 754 floating point representation of (positive) infinity.
- np.log() -> Natural logarithm (base e), element-wise. log(exp(x)) = x.

In [23]:
display(np.isnan(np.nan))
display(np.isnan(np.inf))

True

False

In [60]:
# e ** 2 = 7.3891 => log(7.3891) = 2 | e ** x = 7.3891 => log(e ** x) = log(7.3891) => x = 2
print(np.log(7.3891).round(2))
print(np.round(np.e ** 2, 4))

display(np.array([np.log(-1.),1.,np.log(0)]))
# np.log(-1) => NAN; np.log only to positive reals
np.isnan([np.log(-1.),1.,np.log(0)])

2.0
7.3891


  display(np.array([np.log(-1.),1.,np.log(0)]))
  display(np.array([np.log(-1.),1.,np.log(0)]))


array([ nan,   1., -inf])

  np.isnan([np.log(-1.),1.,np.log(0)])
  np.isnan([np.log(-1.),1.,np.log(0)])


array([ True, False, False])

## Some Python cases of mod operator %

In [25]:
for i in range(24):
    print('   ' * (i % 6), i % 6)
    # print(f"{'   ' * (i % 5)}{i % 5}")

 0
    1
       2
          3
             4
                5
 0
    1
       2
          3
             4
                5
 0
    1
       2
          3
             4
                5
 0
    1
       2
          3
             4
                5


In [26]:
from time import sleep

In [27]:
for i in range(19, -1, -1):
    endln = ' - '
    mod_5 = i % 5
    if mod_5 == 0:
        endln ='\n'
    print(mod_5, end=endln)


4 - 3 - 2 - 1 - 0
4 - 3 - 2 - 1 - 0
4 - 3 - 2 - 1 - 0
4 - 3 - 2 - 1 - 0


In [28]:
from time import sleep

for i in range(20):
    endln = ''
    mod_5 = i % 5
    if mod_5 == 4:
        endln ='\n'
    print(f"{'  ' * (mod_5)}*", end= endln)
    sleep(1)


*

  *    *      *        *
*  *    *      *        *
*  *    *      *        *
*  *    *      *        *


In [29]:
letters = [chr(i) for i in range(65,85)]
for i in range(19, -1, -1):
    endln = ' '
    mod_5 = i % 5
    if mod_5 == 0:
        endln ='\n'
    print(letters[i], end=endln)

T S R Q P
O N M L K
J I H G F
E D C B A
