# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_13 Finding Unique Values in NDarrays

#### numpy.unique(ar, return_index=False, return_inverse=False, return_counts=False, axis=None, *, equal_nan=True)
- Find the unique elements of an array.
- Returns the sorted unique elements of an array. There are three optional outputs in addition to the unique elements:
    * the indices of the input array that give the unique values
    * the indices of the unique array that reconstruct the input array
    * the number of times each unique value comes up in the input array

#### Code

In [1]:
import numpy as np
np.__version__  # NOTE(review): not the cell's last expression, so presumably nothing is echoed - confirm
np.set_printoptions(suppress=True)  # To avoid scientific notation

In [2]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of the main attributes of a named array.

    Parameters
    ----------
    arrnm : str
        Name of (or expression yielding) an array that lives in this
        module's global namespace, e.g. ``'lend_num'``.

    Returns
    -------
    str
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``

    Raises
    ------
    NameError
        If ``arrnm`` does not resolve in the global namespace.
    AttributeError
        If the resolved object lacks one of the reported attributes.
    """
    # NOTE: eval() executes arbitrary code - only pass trusted names typed in
    # the notebook, never text that comes from external input.
    # The expression is resolved once, and only against the globals, so that
    # (a) this helper's own local names cannot shadow the user's array and
    # (b) a compound expression such as 'a + b' is summarised as a whole
    #     instead of being parsed as 'a + b.shape'.
    arr = eval(arrnm, globals())

    fields = ''.join(
        f'| {attr}: {getattr(arr, attr)} '
        for attr in ('shape', 'ndim', 'size', 'dtype')     # , 'itemsize'
    )
    return f' {arrnm}: {fields}'

In [3]:
# 1st dataset - lend_num: the original data, without NANs
lend_num = np.loadtxt('Lending-Company-Numeric-Data.csv', delimiter=',')  # comma-separated file
display(lend_num)
show_attr('lend_num')   # last expression of the cell, so the returned string is echoed

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

' lend_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

In [4]:
# 2nd dataset - lend_pre: loaded WITH its missing values (NANs)
lend_pre = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter=';')
display(lend_pre)
# Only a cell's last expression is echoed, so the summary is printed explicitly
print(show_attr('lend_pre'))
print('Number of NANs:', np.isnan(lend_pre).sum())

# Replace every NAN with the (rounded) mean of its column
orig_means = np.nanmean(lend_pre, axis=0).round(2)  # Column means, ignoring NANs
nan_ixs = np.argwhere(np.isnan(lend_pre))           # (row, col) index pairs of the NANs
nan_rows, nan_cols = nan_ixs[:, 0], nan_ixs[:, 1]
# Vectorized fill: each NAN position receives the mean of its own column
lend_pre[nan_rows, nan_cols] = orig_means[nan_cols]

# DONE, check the results
display(lend_pre)
print(show_attr('lend_pre'))
print('Number of NANs:', np.isnan(lend_pre).sum())
means = np.mean(lend_pre, axis=0).round(2)          # Column means after the fill
print('Original Means == Actual Means:', np.array_equal(orig_means, means))

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

Number of NANs: 260


array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

Number of NANs: 0
Original Means == Actual Means: True


In [5]:
# Concatenating two 1-D arrays returns one longer 1-D array: the elements of
# the first argument followed by those of the second.
display(lend_num[0], lend_num[2])
np.concatenate((lend_num[2], lend_num[0]))  # row 2 first, then row 0

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.])

array([ 1000.,    40.,   365.,  2160.,  3280., 15340.])

array([ 1000.,    40.,   365.,  2160.,  3280., 15340.,  2000.,    40.,
         365.,  3121.,  4241., 13621.])

In [6]:
# Same result passing the axis explicitly: a 1-D array has only one axis,
# so axis=0 and axis=-1 refer to the same one.
display(np.concatenate((lend_num[2], lend_num[0]), axis=-1))

array([ 1000.,    40.,   365.,  2160.,  3280., 15340.,  2000.,    40.,
         365.,  3121.,  4241., 13621.])

In [7]:
# 2-D arrays along the default axis (0): rows are appended - same as np.vstack()
display(ct_1 := np.concatenate((lend_num, lend_pre)))
display(show_attr('ct_1'))   # 2086 (1043 x 2) rows, 6 cols
np.concatenate((lend_num, lend_pre), axis=0).shape  # explicit axis=0 gives the same shape

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

' ct_1: | shape: (2086, 6) | ndim: 2 | size: 12516 | dtype: float64 '

(2086, 6)

In [8]:
# 2-D arrays along the last axis (axis=1, i.e. axis=-1): columns are appended - same as np.hstack()
display(ct_2 := np.concatenate((lend_num, lend_pre), axis=1))
display(show_attr('ct_2'))   # 1043 rows, 6 x 2 = 12 cols
np.hstack((lend_num, lend_pre)).shape  # hstack yields the same shape

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

' ct_2: | shape: (1043, 12) | ndim: 2 | size: 12516 | dtype: float64 '

(1043, 12)

In [9]:
# axis=-1 selects the last axis, which for 2-D arrays is axis=1
display(ct_2 := np.concatenate((lend_num, lend_pre), axis=-1))
display(show_attr('ct_2'))   # 1043 rows, 6 x 2 = 12 cols

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

' ct_2: | shape: (1043, 12) | ndim: 2 | size: 12516 | dtype: float64 '

In [10]:
# JM COMMENT - possible FUTURE experiments: adding a row to a 2-D array
# 1. using vstack() (same number of cols) - or stack with axis=0
# 2. using concatenate(), first converting the 1-D array to 2-D by adding a dimension
#   a. add a dim by creating a new array = ([1-D array])
#   b. reshaping the array and adding a new dimension of size 1

In [11]:
# Two 3-D arrays (two rank-3 tensors)
T1 = np.array([[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]],

               [[21, 22, 23, 24],
                [25, 26, 27, 28],
                [29, 10, 31, 32]]])   # NOTE(review): 10 breaks the 29, 30, 31, 32 pattern - possibly a typo for 30; confirm

T2 = T1 * 2     # element-wise doubling of T1

display(T2)
show_attr('T2')     # last expression of the cell, so the returned string is echoed

array([[[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 20, 62, 64]]])

' T2: | shape: (2, 3, 4) | ndim: 3 | size: 24 | dtype: int32 '

In [12]:
# Concatenate the two 3-D arrays along the first axis
ct_3 = np.concatenate((T1, T2), axis=0)    # same as np.concatenate((T1, T2))
display(ct_3)
print(show_attr('ct_3'))
# Same as vstack
np.vstack((T1, T2))
## axis=1 is the same as hstack
## axis=2 is the same as dstack
# np.concatenate() can replicate the outputs of these 3 stacking functions.


array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 10, 31, 32]],

       [[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 20, 62, 64]]])

 ct_3: | shape: (4, 3, 4) | ndim: 3 | size: 48 | dtype: int32 


array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[21, 22, 23, 24],
        [25, 26, 27, 28],
        [29, 10, 31, 32]],

       [[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[42, 44, 46, 48],
        [50, 52, 54, 56],
        [58, 20, 62, 64]]])

In [13]:
# Concatenating 1-D arrays -> they don't need to have the same length.
# With more dimensions it is different: arrays with the same number of
# dimensions but different shapes can still be concatenated, but only if
# their shapes match on every axis except the one we concatenate along
# (JM: try the same with stack).

### FUTURE - Notes and Examples from Manual