# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- Hoy to discover missing values (and fill them up).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_2 Substituing Missing Values in NDarrays

#### numpy.where(condition, [x, y, ]/)
- Return elements chosen from x or y depending on condition.

In [1]:
import numpy as np
np.__version__

'1.26.4'

In [2]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    strout = f' {arrnm}: '

    for attr in ('shape', 'ndim', 'size', 'dtype'):     #, 'itemsize'):
            arrnm_attr = arrnm + '.' + attr
            strout += f'| {attr}: {eval(arrnm_attr)} '

    return strout

In [3]:
# Let's work with a dataset that contains NANs.

lend_co_data_num_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                                     delimiter=';')

display(show_attr('lend_co_data_num_NAN'))
lend_co_data_num_NAN

' lend_co_data_num_NAN: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
# Let's check is there are NANs values - np.isnan() -> elementwise
np.isnan(lend_co_data_num_NAN).sum()    # 260 NANs

260

In [5]:
# Change the NANs by a value grater than the max of the array

display(max_dtset:= np.nanmax(lend_co_data_num_NAN))
display(temp_fill:= np.nanmax(lend_co_data_num_NAN).round(2) + 1)

# Reload the dtset w/NANs changing NANs by temp_value
lend_co_data_num_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                                     delimiter=';',
                                     filling_values=temp_fill)

display(show_attr('lend_co_data_num_NAN'))
print(lend_co_data_num_NAN)

## Now all the originals NANs ara equal to the maximum of this array

64001.0

64002.0

' lend_co_data_num_NAN: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

[[2.0000e+03 4.0000e+01 3.6500e+02 3.1210e+03 4.2410e+03 1.3621e+04]
 [2.0000e+03 4.0000e+01 3.6500e+02 3.0610e+03 4.1710e+03 1.5041e+04]
 [1.0000e+03 4.0000e+01 3.6500e+02 2.1600e+03 3.2800e+03 1.5340e+04]
 ...
 [6.4002e+04 4.0000e+01 3.6500e+02 4.2010e+03 5.0010e+03 1.6600e+04]
 [1.0000e+03 4.0000e+01 3.6500e+02 2.0800e+03 3.3200e+03 1.5600e+04]
 [2.0000e+03 4.0000e+01 3.6500e+02 4.6010e+03 4.6010e+03 1.6600e+04]]


### Relacing missing with a more adecuate value
- As a result of what we did previously, if a value is equal to the maximum of the array, it is actually a 'missing' value
- We need to replace these values with more meaningful ones
- A common approach is to fill all missing vals with the mean of a given column. The reason is tha quite possibly this won't change the overall interpretation of the data set.
- All missing vals would be considered average, thus unimportant for consequent analysis.
- This is NOT always valid, but often is the preferred approach.

In [6]:
# Reload the original data and calc the temp_mean of each column
lend_co_data_num_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                                     delimiter=';')

temp_means = np.nanmean(lend_co_data_num_NAN, axis=0).round(2)
print(show_attr('temp_means'))
display(temp_means)      # Stores the means of each column

## We want to keep track of the different means because they can change
## after filling out the missing vals

 temp_means: | shape: (6,) | ndim: 1 | size: 6 | dtype: float64 


array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

In [7]:
# Reload the original data filling NANs with temp_max
display(temp_max:= np.nanmax(lend_co_data_num_NAN).round(2) + 1)

lend_co_data_num_NAN = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                                     delimiter=';',
                                     filling_values=temp_max)
print(show_attr('temp_means'))
display(temp_means) 
np.isnan(lend_co_data_num_NAN).sum(axis=0)      # = 0, 0..., NO NANs

64002.0

 temp_means: | shape: (6,) | ndim: 1 | size: 6 | dtype: float64 


array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

array([0, 0, 0, 0, 0, 0])

In [8]:
# New mean of the 1st column
display(np.mean(lend_co_data_num_NAN[:,0]).round(2))
## This average is distorted due to all the missing vals we've substituted
## with our filler (temp_max)
(np.mean(lend_co_data_num_NAN[:,0]).round(2) / temp_means[0]).round(2)
## BIG impact on the result: close to TWICE the original mean of col[0]

## This staggering diff could result in extracting misleading insights
## from the data -> then -> np.where() will help us.

4263.25

1.89

In [9]:
# np.where() to substitute the filling_values with the associated mean
# for each column. Let's see 1st column (col_0)
lend_co_data_num_NAN[:,0] = np.where(lend_co_data_num_NAN[:,0] == temp_max,
                                     temp_means[0],
                                     lend_co_data_num_NAN[:,0])
## Every non-filler val will remain unchanged (lend_co_data_num_NAN[:,0])
## E/fill will now contain the mean of the original column (temp_means[0])

# New mean of the first column
display(np.mean(lend_co_data_num_NAN[:,0]).round(2))
(np.mean(lend_co_data_num_NAN[:,0]).round(2) / temp_means[0]).round(2)
# Whenever we add the mean of a set to itself, the mean of the new set
# stays the same.

2250.25

1.0

In [10]:
# Generalize this transformation for all the columns
for i in range(lend_co_data_num_NAN.shape[1]):
    lend_co_data_num_NAN[:,i] = np.where(lend_co_data_num_NAN[:,i] == temp_max,
                                     temp_means[i],
                                     lend_co_data_num_NAN[:,i])

## Now all the means are the same as original and have NO NANs
new_means = np.mean(lend_co_data_num_NAN, axis=0).round(2)
display(new_means, temp_means)
display(np.array_equal(new_means, temp_means))
np.isnan(lend_co_data_num_NAN).sum()

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

True

0

In [11]:
# Of course we can use np.where to substitute others values
# Ex. if we don't want negatives values in our data set (ex. change for 0)
for i in range(lend_co_data_num_NAN.shape[1]):
    lend_co_data_num_NAN[:,i] = np.where(lend_co_data_num_NAN[:,i] < 0,
                                     0,
                                     lend_co_data_num_NAN[:,i])

display(lend_co_data_num_NAN[lend_co_data_num_NAN < 0])
lend_co_data_num_NAN[lend_co_data_num_NAN < 0].sum()

array([], dtype=float64)

0.0

## JM: try to change NANs withou filling val with temp_max
### It seems than CANNOT replace the NANs directly ¡??¡¡?

In [12]:
# Reload the original dataset with NANs
l_NANs = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                       delimiter=';')

display(np.isnan(l_NANs).sum())     # 260 NANs in the dataset (ndarray)

# Calculate and store the actual - original means for e/column
display(original_means := np.nanmean(l_NANs, axis=0).round(2))

# Replace the NANs of each col w/its original mean (using for loop)
for i in range(l_NANs.shape[1]):
    l_NANs[:,i] = np.where(l_NANs[:,i] == np.nan,
                           original_means[i],
                           l_NANs[:,i])

display(new_means := np.mean(l_NANs, axis=0).round(2))
display(original_means)
display(np.array_equal(original_means, new_means))

# Number of NANs in the actual dataset 'l_NANs' 
display(np.isnan(l_NANs).sum())
l_NANs


260

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

array([nan, nan, nan, nan, nan, nan])

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

False

260

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [13]:
display(a99 := np.array([1,2,99,4,5]))
display(a0 := np.where(a99 == 99, 0, a99))
display(a99 := np.where(a99 == 99, 0, a99))
print('-------------------------------------------------------')
display(aNAN := np.array([1,2,np.nan,4,5]))
display(a0N := np.where(aNAN == np.nan, 0, aNAN))
display(aNAN := np.where(aNAN == np.nan, 0, aNAN))


array([ 1,  2, 99,  4,  5])

array([1, 2, 0, 4, 5])

array([1, 2, 0, 4, 5])

-------------------------------------------------------


array([ 1.,  2., nan,  4.,  5.])

array([ 1.,  2., nan,  4.,  5.])

array([ 1.,  2., nan,  4.,  5.])