# 8 Data Manipulation with NumPy
- Examine how to clean and preprocess data using NumPy.
- How to discover missing values (and fill them in).
- Ways to remove irrelevant data.
- sort(), shuffle(), reshape(), stack(), strip()
## 8_13 Finding Unique Values in NDarrays

#### numpy.unique(ar, return_index=False, return_inverse=False, return_counts=False, axis=None, *, equal_nan=True)
- Find the unique elements of an array.
- Returns the sorted unique elements of an array. There are three optional outputs in addition to the unique elements:
    * the indices of the input array that give the unique values
    * the indices of the unique array that reconstruct the input array
    * the number of times each unique value comes up in the input array

#### np.unique(arr, return_index=, return_counts=)
- Not in-place: it creates a new array that contains all the distinct values
- Each value appears only once in the output

In [45]:
import numpy as np
np.__version__
np.set_printoptions(suppress=True)  # To avoid scientific notation

In [46]:
# Function show_attr

def show_attr(arrnm: str) -> str:
    """Return a one-line summary of an array's basic attributes.

    Parameters
    ----------
    arrnm : str
        Source text of an expression (normally just a variable name) that
        evaluates to an object exposing ``shape``, ``ndim``, ``size`` and
        ``dtype``, e.g. a NumPy array.

    Returns
    -------
    str
        ``' <arrnm>: | shape: ... | ndim: ... | size: ... | dtype: ... '``
    """
    # NOTE: eval() runs arbitrary code; only pass names/expressions you wrote
    # yourself, never text coming from an untrusted source.
    # Evaluate once and read the attributes from the resulting object, so that
    # compound expressions such as 'a + b' are summarised as a whole (and not
    # parsed as 'a + b.shape') and are not recomputed for every attribute.
    arr = eval(arrnm)
    fields = ''.join(
        f'| {attr}: {getattr(arr, attr)} '
        for attr in ('shape', 'ndim', 'size', 'dtype')
    )
    return f' {arrnm}: {fields}'

In [47]:
# 1st dataset - lend_num, original without NANs
lend_num = np.loadtxt('Lending-Company-Numeric-Data.csv', delimiter=',')
display(lend_num)
show_attr('lend_num')

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

' lend_num: | shape: (1043, 6) | ndim: 2 | size: 6258 | dtype: float64 '

In [48]:
# Unique vals of the whole dataset
display(uniq_1 := np.unique(lend_num))
show_attr('uniq_1')
# 1-D array of all unique vals of lend_num, arranged in increasing
# order, from -2870 (min) to 64001 (max)

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

' uniq_1: | shape: (1140,) | ndim: 1 | size: 1140 | dtype: float64 '

In [49]:
# Unique vals of the 2nd col
np.unique(lend_num[:,1])

array([ 35.,  40.,  50., 125., 165.])

In [50]:
# Strings are also sorted in increasing order (uppercase before lowercase)
ings = np.array(['b1', 'a1', 'C1', 'A2', 'A1', 'a2', 'B1', 'B2', 'c1'])
np.unique(ings)


array(['A1', 'A2', 'B1', 'B2', 'C1', 'a1', 'a2', 'b1', 'c1'], dtype='<U2')

In [51]:
# How many times each value appears -> Useful in statistics
display(np.unique(lend_num[:,1], return_counts=True))
# We can see that 40 appears 567 times and 50 appears 451 times
print('Mean:', np.mean(lend_num[:,1]).round(2), end=' - ')
print('Median:', np.median(lend_num[:,1]).round(2), end=' - ')

from scipy.stats import mode as scmode
print('Mode:', scmode(lend_num[:,1]))

## The mean (average) of a data set is found by adding all numbers in the
# data set and then dividing by the number of values in the set.
## The median is the middle value (in the middle position) when a dataset
# is ordered from least to greatest.
## The mode is the number that occurs most often in a data set. A data set
# can have more than one mode, and sometimes there is no mode at all

## Arithmetic Mean = sum() / num
# x₁ + x₂ + x₃ + .... = a + a + a + ... + a
## Geometric Mean = mult() ** (1 / num)
# x₁ * x₂ * x₃ * ... = b * b * b * ... * b
## Harmonic Mean = num / sum(1/el)
# 1/x₁ + 1/x₂ + 1/x₃ + ... = 1/h + 1/h + ... + 1/h.

# The arithmetic mean is one example of a statistic that describes the
# central tendency of a dataset. But any other formula or process that
# takes a dataset and generates a single number that represents a
# "typical" value is also a measure of central tendency. That includes
# the median and mode as well as more exotic things like the midrange or
# the arithmetic mean when you ignore the largest and smallest value.
# All of these numbers attempt to capture the spirit of a dataset by
# giving you a sense of a single "usual" value, and that is what makes
# them measures of central tendency.

(array([ 35.,  40.,  50., 125., 165.]),
 array([  4, 567, 451,  19,   2], dtype=int64))

Mean: 46.09 - Median: 40.0 - Mode: ModeResult(mode=40.0, count=567)


In [52]:
# Where each unique val appears first (return_index)
np.unique(lend_num[:,1], return_counts=True, return_index=True)
# The indices array is the middle one: unlike the counts array, it can
# contain a 0 - and it always does, since the first element's value first
# appears at index 0

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27], dtype=int64),
 array([  4, 567, 451,  19,   2], dtype=int64))

In [53]:
# JM np.unique with NANs ¡?¡?
lend_NANs = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv',
                          delimiter=';')
display(lend_NANs)
show_attr('lend_NANs')
print('Number of NANs:', np.isnan(lend_NANs).sum())

np.unique(lend_NANs, return_index=True, return_counts=True)
# Note again that there are 260 NaNs, sorted last as if they were the greatest value

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

Number of NANs: 260


(array([-2870., -2550., -2450., ..., 54625., 64001.,    nan]),
 array([ 195,  999,  513, ..., 3413,  167,   69], dtype=int64),
 array([  2,   1,   1, ...,   7,   1, 260], dtype=int64))

### np.unique() - Notes and Examples from Manual
- When an axis is specified the subarrays indexed by the axis are sorted. This is done by making the specified axis the first dimension of the array (move the axis to the first dimension to keep the order of the other axes) and then flattening the subarrays in C order.
- The flattened subarrays are then viewed as a structured type with each element given a label, with the effect that we end up with a 1-D array of structured types that can be treated in the same way as any other 1-D array.
- The result is that the flattened subarrays are sorted in lexicographic order starting with the first element.

In [54]:
# 2-D array unique in different axis (unique rows and unique cols)
display(a := np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]]))
display(np.unique(a, axis=0))   # Unique rows
display(np.unique(a, axis=1))   # Unique cols
np.unique(a, axis=0, return_index=True, return_counts=True)

array([[1, 0, 0],
       [1, 0, 0],
       [2, 3, 4]])

array([[1, 0, 0],
       [2, 3, 4]])

array([[0, 0, 1],
       [0, 0, 1],
       [3, 4, 2]])

(array([[1, 0, 0],
        [2, 3, 4]]),
 array([0, 2], dtype=int64),
 array([2, 1], dtype=int64))

In [55]:
# Return the indices of the original array that give the unique values:
display(a := np.array(['a', 'b', 'b', 'c', 'a']))
u, indices = np.unique(a, return_index=True)
display(u, indices)
a[indices]

array(['a', 'b', 'b', 'c', 'a'], dtype='<U1')

array(['a', 'b', 'c'], dtype='<U1')

array([0, 1, 3], dtype=int64)

array(['a', 'b', 'c'], dtype='<U1')

In [57]:
# Reconstruct the input values from the unique values and counts:
display(a := np.array([1, 2, 6, 4, 2, 3, 2]))
values, counts = np.unique(a, return_counts=True)
display(values, counts)
np.repeat(values, counts)   # # original order not preserved

array([1, 2, 6, 4, 2, 3, 2])

array([1, 2, 3, 4, 6])

array([1, 3, 1, 1, 1], dtype=int64)

array([1, 2, 2, 2, 3, 4, 6])

In [58]:
# Reconstruct the input array from the unique values and inverse:
display(a := np.array([1, 2, 6, 4, 2, 3, 2]))
u, indices = np.unique(a, return_inverse=True)
display(u, indices)
u[indices]

array([1, 2, 6, 4, 2, 3, 2])

array([1, 2, 3, 4, 6])

array([0, 1, 4, 3, 1, 2, 1], dtype=int64)

array([1, 2, 6, 4, 2, 3, 2])

In [56]:
# JM selecting els, rows and cols using specific indices
print(lst := [chr(i) for i in range(65,74)])
display(A := np.array(lst))
display(A[[0,5,7,-1]])
display(B := np.vstack((A, A, A, A)))
display(B[[0,2],(4,7)])
display(B[:,[4,7]])

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']


array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'], dtype='<U1')

array(['A', 'F', 'H', 'I'], dtype='<U1')

array([['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
       ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
       ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
       ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']], dtype='<U1')

array(['E', 'H'], dtype='<U1')

array([['E', 'H'],
       ['E', 'H'],
       ['E', 'H'],
       ['E', 'H']], dtype='<U1')