In [1]:
import numpy as np

In [2]:
# create a 1d array:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
# create a boolean array
np.full((3,3), 1, dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [8]:
np.full((3,3), True, dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [9]:
# or:
np.ones((3,3), dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [10]:
np.zeros((3,3), dtype=bool)

array([[False, False, False],
       [False, False, False],
       [False, False, False]])

In [13]:
# extract odd number from array:
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr[arr%2==1]

array([1, 3, 5, 7, 9])

In [15]:
# replace items that satisfy a condition without affecting the original array
np.where(arr%2==1, -1, arr)

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [16]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
# reshape an array
arr.reshape(2,-1)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [22]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1, 10).reshape(2,-1)
b

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [20]:
# stack 2 arrays vertically
np.vstack([a,b])

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [23]:
# or:
np.concatenate([a,b], axis=0)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [25]:
# or:
np.r_[a,b]

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [26]:
# stack horizontally
np.concatenate([a,b], axis=1)

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [27]:
# or:
np.hstack([a,b])

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [29]:
# or:
np.c_[a,b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [30]:
# repeat in two ways
np.repeat(a,3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7,
       7, 7, 8, 8, 8, 9, 9, 9])

In [34]:
np.tile(a.reshape(1,-1),3).reshape(3,-1)

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [35]:
# get the common items between two python numpy arrays
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])
np.intersect1d(a,b)

array([2, 4])

In [38]:
# get the positions where elements of two arrays match
np.where(a==b)

(array([1, 3, 5, 7]),)

In [36]:
# remove from one array those items that exist in another
np.setdiff1d(a,b)

array([1, 3, 5, 6])

In [43]:
# extract all numbers between a given range from a numpy array
a[np.where((a >= 4) & (a <= 6))]

array([4, 4, 5, 6])

In [45]:
# or:
a[np.where(np.logical_and(a>=4, a<=6))]

array([4, 4, 5, 6])

In [47]:
a[(a>=4)&(a<=6)]

array([4, 4, 5, 6])

In [48]:
def max_(x,y):
    if x>y:
        return x
    else:
        return y
pair_max = np.vectorize(max_, otypes=[float])
pair_max(a,b)

array([ 7.,  2., 10.,  2.,  7.,  4.,  9.,  4.,  9.,  8.])

In [51]:
# swap two columns in a 2d numpy array
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [52]:
arr[:,[1,0,2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [54]:
# or switch rows
arr[[1,0,2],:]

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

In [56]:
# reverse rows:
arr[::-1,:]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [57]:
# switch columns
arr[:,::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [60]:
# create a 2D array containing random floats between 5 and 10
np.random.randint(5,10,(3,3))+np.random.random((3,3))

array([[9.4901274 , 6.59587723, 6.82015094],
       [5.49414071, 5.96308149, 7.22437531],
       [7.67008677, 7.20537258, 9.48760687]])

In [63]:
np.random.uniform(5, 10, (3,3))

array([[8.57777869, 7.15217007, 5.74688836],
       [9.1638037 , 7.71137547, 6.43465092],
       [5.14880703, 6.28438366, 9.47300353]])

In [64]:
# print only 3 decimal places in python numpy array
np.set_printoptions(precision=3)
np.random.uniform(5, 10, (3,3))

array([[6.34 , 8.587, 5.528],
       [8.598, 6.244, 7.747],
       [9.802, 9.666, 6.06 ]])

In [75]:
# suppressing the scientific notation (like 1e10)
rand_arr = np.random.random([3,3])/1e10
rand_arr

array([[6.1321065634e-11, 5.4098351549e-11, 8.5484141940e-11],
       [6.8817461917e-11, 2.8801524941e-11, 8.2952471200e-11],
       [4.5862977448e-11, 7.8413671467e-12, 2.7861609992e-11]])

In [77]:
np.set_printoptions(suppress=True, precision=10)
rand_arr

array([[0.0000000001, 0.0000000001, 0.0000000001],
       [0.0000000001, 0.          , 0.0000000001],
       [0.          , 0.          , 0.          ]])

In [84]:
# import a dataset with numbers and texts keeping the text intact in python numpy
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
iris[:5]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa']], dtype=object)

In [95]:
np.array([i[4] for i in iris])[:5]

array([b'Iris-setosa', b'Iris-setosa', b'Iris-setosa', b'Iris-setosa',
       b'Iris-setosa'], dtype='|S15')

In [96]:
np.array([i[:4] for i in iris])[:5]

array([[b'5.1', b'3.5', b'1.4', b'0.2'],
       [b'4.9', b'3.0', b'1.4', b'0.2'],
       [b'4.7', b'3.2', b'1.3', b'0.2'],
       [b'4.6', b'3.1', b'1.5', b'0.2'],
       [b'5.0', b'3.6', b'1.4', b'0.2']], dtype=object)

In [102]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0]) # set used col[0]

# Solution
Smax, Smin = sepallength.max(), sepallength.min()
S = (sepallength - Smin)/(Smax - Smin)
S

array([0.2222222222, 0.1666666667, 0.1111111111, 0.0833333333,
       0.1944444444, 0.3055555556, 0.0833333333, 0.1944444444,
       0.0277777778, 0.1666666667, 0.3055555556, 0.1388888889,
       0.1388888889, 0.          , 0.4166666667, 0.3888888889,
       0.3055555556, 0.2222222222, 0.3888888889, 0.2222222222,
       0.3055555556, 0.2222222222, 0.0833333333, 0.2222222222,
       0.1388888889, 0.1944444444, 0.1944444444, 0.25        ,
       0.25        , 0.1111111111, 0.1388888889, 0.3055555556,
       0.25        , 0.3333333333, 0.1666666667, 0.1944444444,
       0.3333333333, 0.1666666667, 0.0277777778, 0.2222222222,
       0.1944444444, 0.0555555556, 0.0277777778, 0.1944444444,
       0.2222222222, 0.1388888889, 0.2222222222, 0.0833333333,
       0.2777777778, 0.1944444444, 0.75        , 0.5833333333,
       0.7222222222, 0.3333333333, 0.6111111111, 0.3888888889,
       0.5555555556, 0.1666666667, 0.6388888889, 0.25        ,
       0.1944444444, 0.4444444444, 0.4722222222, 0.5   

In [103]:
# or 
S = (sepallength - Smin)/sepallength.ptp() 
S

array([0.2222222222, 0.1666666667, 0.1111111111, 0.0833333333,
       0.1944444444, 0.3055555556, 0.0833333333, 0.1944444444,
       0.0277777778, 0.1666666667, 0.3055555556, 0.1388888889,
       0.1388888889, 0.          , 0.4166666667, 0.3888888889,
       0.3055555556, 0.2222222222, 0.3888888889, 0.2222222222,
       0.3055555556, 0.2222222222, 0.0833333333, 0.2222222222,
       0.1388888889, 0.1944444444, 0.1944444444, 0.25        ,
       0.25        , 0.1111111111, 0.1388888889, 0.3055555556,
       0.25        , 0.3333333333, 0.1666666667, 0.1944444444,
       0.3333333333, 0.1666666667, 0.0277777778, 0.2222222222,
       0.1944444444, 0.0555555556, 0.0277777778, 0.1944444444,
       0.2222222222, 0.1388888889, 0.2222222222, 0.0833333333,
       0.2777777778, 0.1944444444, 0.75        , 0.5833333333,
       0.7222222222, 0.3333333333, 0.6111111111, 0.3888888889,
       0.5555555556, 0.1666666667, 0.6388888889, 0.25        ,
       0.1944444444, 0.4444444444, 0.4722222222, 0.5   

In [107]:
# compute the softmax score
def softmax(x):
    exp_x = np.exp(x-np.max(x))
    return exp_x/exp_x.sum(axis=0)

In [108]:
softmax(sepallength)

array([0.0022195853, 0.0018172427, 0.0014878325, 0.0013462465,
       0.0020083638, 0.0029961268, 0.0013462465, 0.0020083638,
       0.0011022134, 0.0018172427, 0.0029961268, 0.0016443092,
       0.0016443092, 0.000997324 , 0.0044696959, 0.0040443481,
       0.0029961268, 0.0022195853, 0.0040443481, 0.0022195853,
       0.0029961268, 0.0022195853, 0.0013462465, 0.0022195853,
       0.0016443092, 0.0020083638, 0.0020083638, 0.0024530211,
       0.0024530211, 0.0014878325, 0.0016443092, 0.0029961268,
       0.0024530211, 0.0033112322, 0.0018172427, 0.0020083638,
       0.0033112322, 0.0018172427, 0.0011022134, 0.0022195853,
       0.0020083638, 0.0012181342, 0.0011022134, 0.0020083638,
       0.0022195853, 0.0016443092, 0.0022195853, 0.0013462465,
       0.0027110076, 0.0020083638, 0.014839913 , 0.0081443169,
       0.0134277086, 0.0033112322, 0.0090008622, 0.0040443481,
       0.0073692827, 0.0018172427, 0.0099474912, 0.0024530211,
       0.0020083638, 0.0049397779, 0.0054592989, 0.0060

In [106]:
# find the percentile scores of a numpy array
np.percentile(sepallength, q=[5, 95])

array([4.6  , 7.255])

In [112]:
# insert values at random positions in an arra
iris[np.random.randint(iris.shape[0], size=20), \
     np.random.randint(iris.shape[1], size=20)] = np.nan
iris

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [nan, b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', nan, b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.7', b'0.4', b'Iris-setosa'],
       [b'4.6', b'3.4', nan, b'0.3', b'Iris-setosa'],
       [b'5.0', b'3.4', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.4', b'2.9', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.1', b'1.5', b'0.1', b'Iris-setosa'],
       [nan, b'3.7', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.4', b'1.6', b'0.2', b'Iris-setosa'],
       [b'4.8', b'3.0', b'1.4', nan, b'Iris-setosa'],
       [b'4.3', b'3.0', nan, nan, b'Iris-setosa'],
       [b'5.8', b'4.0', b'1.2', b'0.2', b'Iris-setosa'],
       [b'5.7', b'4.4', b'1.5', b'0.4', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.3', b'0.4', nan],
       [b'5.1', b'3.5', b'1.4', b'0.3', b'Iris-setosa'],
      

In [138]:
# find the position of missing values in numpy array
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
np.where(np.isnan(iris_2d))

(array([  1,   5,  19,  20,  20,  25,  36,  40,  53,  57,  58,  67,  76,
         76,  83, 115, 119, 119, 128, 142]),
 array([1, 1, 2, 2, 3, 2, 2, 2, 1, 3, 1, 0, 0, 1, 1, 1, 1, 2, 0, 0]))

In [139]:
np.isnan(iris_2d).any()

True

In [141]:
# drop rows that contain a missing value from a numpy array
bool = np.array([~np.any(np.isnan(i)) for i in iris_2d])
iris_2d = iris_2d[bool]

In [142]:
np.isnan(iris_2d).any()

False

In [136]:
# find the correlation between two columns of a numpy array
np.corrcoef(iris_2d[:,0], iris_2d[:,1])[0,1]

-0.12845599163121704

In [152]:
np.unique([i[4] for i in iris], return_counts=True)

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica', b'nan'],
       dtype='|S15'), array([46, 45, 46, 13]))

In [159]:
# create a new column from existing columns of a numpy array
sepallength = iris_2d[:, 0].astype('float')
petallength = iris_2d[:, 2].astype('float')
volume = (np.pi * petallength * (sepallength**2))/3
volume.shape

(133,)

In [160]:
# Introduce new dimension to match iris_2d's
volume = volume[:, np.newaxis]
volume.shape

(133, 1)

In [162]:
np.hstack([iris_2d, volume])[:4]

array([[ 5.1         ,  3.5         ,  1.4         ,  0.2         ,
        38.1326516293],
       [ 4.7         ,  3.2         ,  1.3         ,  0.2         ,
        30.0723720777],
       [ 4.6         ,  3.1         ,  1.5         ,  0.2         ,
        33.238050275 ],
       [ 5.          ,  3.6         ,  1.4         ,  0.2         ,
        36.6519142919]])

In [197]:
# probabilistic sampling in numpy
species = np.genfromtxt(url, delimiter=',', dtype='object', usecols=[4]) # mark: dtype='object'
probs = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, .750, num=50),\
              np.linspace(.751, 1.0, num=50)]
index = np.searchsorted(probs, np.random.random(150))
vals, counts = np.unique(species[index], return_counts=True)
# get the most frequent one:
vals[np.argmax(counts)]

b'Iris-setosa'

In [198]:
# replace all values greater than a given value to a given cutoff
a = np.random.uniform(1,50, 20)

# Using np.clip
np.clip(a, a_min=10, a_max=30)

array([26.5573132777, 30.          , 21.4491316666, 11.9235924844,
       30.          , 20.3644928386, 10.          , 30.          ,
       10.          , 30.          , 25.8554653008, 30.          ,
       15.1422905408, 30.          , 30.          , 30.          ,
       10.1047737681, 18.9639148699, 22.159373518 , 18.7696023249])

In [199]:
# Using np.where
np.where(a < 10, 10, np.where(a > 30, 30, a))

array([26.5573132777, 30.          , 21.4491316666, 11.9235924844,
       30.          , 20.3644928386, 10.          , 30.          ,
       10.          , 30.          , 25.8554653008, 30.          ,
       15.1422905408, 30.          , 30.          , 30.          ,
       10.1047737681, 18.9639148699, 22.159373518 , 18.7696023249])

In [200]:
# convert an array of arrays into a flat 1d array
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

array_of_arrays = np.array([arr1, arr2, arr3])

# Solution 1
arr_2d = np.array([a for arr in array_of_arrays for a in arr])

# Solution 2:
arr_2d = np.concatenate(array_of_arrays)
arr_2d

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [201]:
# generate one-hot encodings for an array in numpy
arr = np.random.randint(1,4, size=6)
arr

def one_hot_encodings(arr):
    uniqs = np.unique(arr)
    out = np.zeros((arr.shape[0], uniqs.shape[0])) # mark here: column number should use unique
    for i, k in enumerate(arr):
        out[i, k-1] = 1
    return out

In [203]:
print(arr)
one_hot_encodings(arr)

[3 1 2 1 1 2]


array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [204]:
# create groud ids based on a given categorical variable
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
species_small = np.sort(np.random.choice(species, size=20))
species_small

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica'], dtype='<U15')

In [209]:
 [np.argwhere(np.unique(species_small) == s)[0][0] \
  for val in np.unique(species_small) \
  for s in species_small[species_small==val]]

[0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [218]:
# rank items in an array using numpy
a = np.random.randint(20, size=10)
a

array([ 1, 12,  5,  0, 14,  0,  7, 14,  7,  3])

In [219]:
a.argsort()

array([3, 5, 0, 9, 2, 6, 8, 1, 4, 7])

In [220]:
a.argsort().argsort()

array([2, 7, 4, 0, 8, 1, 5, 9, 6, 3])

In [221]:
# rank items in a multidimensional array using numpy
a = np.random.randint(20, size=[2,5])
a.ravel()

array([16,  3,  9, 15,  5, 18, 19, 10, 17, 14])

In [223]:
a.ravel().argsort().argsort().reshape(a.shape)

array([[6, 0, 2, 5, 1],
       [8, 9, 3, 7, 4]])

In [224]:
# find the maximum value in each row of a numpy array 2d
a = np.random.randint(1,10, [5,3])
np.amax(a, axis=1)

array([9, 9, 6, 8, 9])

In [225]:
np.apply_along_axis(np.max, arr=a, axis=1)

array([9, 9, 6, 8, 9])

In [226]:
np.apply_along_axis(lambda x: np.min(x)/np.max(x), arr=a, axis=1)

array([0.1111111111, 0.8888888889, 0.3333333333, 0.125       ,
       0.4444444444])

In [227]:
# compute the euclidean distance between two arrays
a = np.array([1,2,3,4,5])
b = np.array([4,5,6,7,8])

dist = np.linalg.norm(a-b)
dist

6.708203932499369

In [236]:
# find all the local maxima (or peaks) in a 1d array
# Peaks are points surrounded by smaller values on both sides.
a = np.array([1, 3, 7, 1, 2, 6, 0, 1])
double_diff = np.diff(np.sign(np.diff(a)))
peak = np.where(double_diff == -2)[0] + 1
peak

array([2, 5])

In [242]:
# create strides from a given 1D array
def gen_strides(arr, stride_len=5, window_len=5):
    n_strides = ((a.size-window_len)//stride_len) + 1
    return np.array([a[s:(s+window_len)] for s in np.arange(0, n_strides*stride_len, stride_len)])

gen_strides(np.arange(15), stride_len=2, window_len=4)

array([[1, 3, 7, 1],
       [7, 1, 2, 6],
       [2, 6, 0, 1]])