# 101 NumPy Questions
[link](https://www.machinelearningplus.com/python/101-numpy-exercises-python/)

In [2]:
#1. Import numpy
import numpy as np

In [3]:
#2. Create 1D array 0-9
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
#3. Create 3x3 boolean array
np.full((3,3), True, dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [5]:
#4. Extract all odd number from array
arr = np.arange(10)
arr[ arr%2!=0]

array([1, 3, 5, 7, 9])

In [6]:
#5. Replace odd numbers with -1
arr = np.arange(10)
arr[ arr%2!=0] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [7]:
#6. Replease all odd numbers in arr with -1 without changing arr
arr = np.arange(10)
np.where(arr%2!=0, -1, arr) # if even, return original value


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [8]:
#7. Convert 1D array to 2D array with 2 rows
np.arange(10).reshape((2,-1))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [9]:
#8. Stack arrays a and b vertically
a = np.arange(10).reshape(2,-1)
b = np.repeat(1, 10).reshape(2,-1)

np.vstack((a,b))
np.row_stack((a,b))
np.concatenate([a,b],axis=0)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [10]:
#9. Stack arrays a and b horizontally
np.hstack((a,b))
np.column_stack((a,b))
np.concatenate([a,b], axis=1)

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [11]:
#10. Create following pattern without hardcoding
a = np.array([1,2,3])
a.repeat(3)

array([1, 1, 1, 2, 2, 2, 3, 3, 3])

In [12]:
# 11. Get common items between a and b
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.intersect1d(a,b)

array([2, 4])

In [13]:
#12. From array a, remove all items present in array b

a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

a[ a!= np.intersect1d(a,b) ]
np.setdiff1d(a,b)

array([1, 2, 3, 4])

In [14]:
# 13. Get position where elements of a and b match

a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.where( a==b )


(array([1, 3, 5, 7]),)

In [15]:
# 14. Get all items between 5 and 10 from a

a = np.array([2, 6, 1, 9, 10, 3, 27])

index = np.where((a>=5)&(a<=10))
a[ index ]


array([ 6,  9, 10])

In [16]:
# 15. Convert the function maxx that works with two scalars to work on two arrays

def maxx(x,y):
    if x>=y: return x
    else: return y
    
a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])


pair_max = np.vectorize(maxx, otypes=[float])
pair_max(a,b)


array([6., 7., 9., 8., 9., 7., 5.])

In [17]:
# 16. Swap col1 and col2 in arr

arr = np.arange(9).reshape(3,3)
arr[:,[1,0,2]]


array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [18]:
# 17. Swap rows 1 snd 2 in the arr

arr = np.arange(9).reshape(3,3)
arr[[1,0,2],:]


array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

In [19]:
# 18. Reverse the rows of a 2D array

arr = np.arange(9).reshape(3,3)
arr[::-1, :] #rev rows; keep cols


array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [20]:
# 19. Reverse the columns of a 2D array

arr = np.arange(9).reshape(3,3)
arr[:, ::-1] # keep rows; rev cols


array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [21]:
# 20. Create 2D arr of random decimal numbers between 5-10

np.random.uniform(5,10, size=[3,3])


array([[9.49205915, 5.99919344, 6.26480778],
       [9.3282117 , 5.89162215, 6.94931688],
       [8.31544245, 5.14741607, 7.09200622]])

In [22]:
# 21. Print only 3 dcimal places of arr

rand_arr = np.random.random((3,3))
np.set_printoptions(precision=3) # change dec. num. of output

rand_arr

array([[0.887, 0.561, 0.625],
       [0.134, 0.95 , 0.715],
       [0.553, 0.047, 0.686]])

In [23]:
# 22. Prety print arr by suppressing scientific notation

rand_arr = np.random.random([3,3])/1e3

np.set_printoptions(precision=6, suppress=True) # suppress formatting
rand_arr

array([[0.000183, 0.000909, 0.000179],
       [0.000923, 0.00033 , 0.000827],
       [0.000235, 0.000942, 0.000925]])

In [24]:
# 23. Limit num of items printed in array to 6 el

np.set_printoptions(threshold=6) # THIS DIDN'T WORK
a = np.arange(15) 
a

array([ 0,  1,  2, ..., 12, 13, 14])

In [25]:
# 24. Print full numpy arr wthout truncating

a=np.arange(15)
np.set_printoptions(threshold = len(a))
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [26]:
# 25. Import iris dataset keeping the text intact

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

iris[:3]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

In [27]:
# 26. Extract the text column 'species' from the 1D iris imported data

iris.T[-1]

array([b'Iris-setosa', b'Iris-setosa', b'Iris-setosa', ...,
       b'Iris-virginica', b'Iris-virginica', b'Iris-virginica'],
      dtype=object)

In [28]:
# 27. Convert 1D iris to 2D array by omitting species text field

iris2D = np.genfromtxt(url, 
                       delimiter=',', 
                       dtype='float', 
                       usecols=[0,1,2,3])
iris2D

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       ...,
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [29]:
# 28. Find mean, median, standard deviation of iris's sepallength

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

mean, median, stdev = np.mean(sepallength), np.mean(sepallength), np.std(sepallength)
print(mean, median, stdev)

5.843333333333334 5.843333333333334 0.8253012917851409


In [30]:
# 29. Normalize sepallength exactly between 0 and 1
# x = x - x.min / (x.max - x.min)

sep_max = sepallength.max()
sep_min = sepallength.min()

norm_func = np.vectorize(lambda x: (x - sep_min)/(sep_max - sep_min))
norm_func(sepallength)


array([0.222222, 0.166667, 0.111111, ..., 0.611111, 0.527778, 0.444444])

In [31]:
# 30. Compute the soft max score of sepallength

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

from scipy.special import softmax
np.set_printoptions(precision=3)
softmax(sepallength)

array([0.002, 0.002, 0.001, ..., 0.009, 0.007, 0.005])

In [32]:
# 31. Find 5th and 95th percentile of iris sepallength

np.percentile(sepallength, [5, 95])

array([4.6  , 7.255])

In [33]:
# 32. Insert np.nan values at 20 random positions in iris2D

i,j = iris2D.shape
i_rand = np.random.randint(150, size=20) # 20 random rows
j_rand = np.random.randint(4, size=20)   # 20 random cols

iris2D[i_rand, j_rand] = np.nan


In [34]:
# 33. Find number and position of missing values in iris2D sepallength

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float')
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

sepallength = iris_2d[:,0] #isolate sepal info

nan_count = np.count_nonzero( np.isnan(sepallength) )     # Quicker than sum( np.isnan(x) )
nan_idx = np.where(np.isnan(sepallength))                 # np.where(bool) returns indices of True


In [35]:
# 34. Filter rows of iris2D that has petallength>1,5 and sepallength<5

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float',  usecols=[0,1,2,3])

cond = (iris_2d[:,0]<5) & (iris_2d[:,2]>1.5)
iris_2d[cond]



array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

In [36]:
# 35. Select rows of iris_2d without nan values

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

bool_ = np.isnan(iris_2d).any(axis=1) #evaluate across columsn, per row
iris_2d[~bool_,:]


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       ...,
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [37]:
# 36. Find correlation between sepallength and petallength

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])


np.corrcoef(iris_2d[:,0],iris_2d[:,2])[0,1]

0.8717541573048718

In [38]:
# 37. Find out if iris_2d has any missing values?

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

np.isnan(iris_2d).any()


False

In [39]:
# 38. Replace all occurense of nan with 0 in numpy arr

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

np.where(np.isnan(iris_2d), 0, iris_2d)

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 0. , 0.2],
       [4.7, 3.2, 1.3, 0.2],
       ...,
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [40]:
# 39. Find the unique values and the count of unique values in iris species

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

species = iris[:,-1]
np.unique(species, return_counts=True)


(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype=object), array([50, 50, 50]))

In [41]:
iris[:,2]

array([b'1.4', b'1.4', b'1.3', ..., b'5.2', b'5.4', b'5.1'], dtype=object)

In [42]:
# 40. Bin petal length t oform a text array such that:
# <3  --> 'small'
# 3-5 --> 'medium'
# >=5 --> 'large'

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
petal_length = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[2])

np.where(petal_length<3, 'small',
        np.where(petal_length>=5, 'large', 'medium')) # faster than np.vectorize(func)


array(['small', 'small', 'small', ..., 'large', 'large', 'large'],
      dtype='<U6')

In [43]:
# 41. Create new column for volumn
# vol = (pi * petallength * sepal_length^2) / 3

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

sepallength = iris_2d[:, 0].astype('float')         # id sepallength
petallength = iris_2d[:, 1].astype('float')         # id petallength
volume = (np.pi * petallength * (sepallength**2))/3 # calc. volume
volume = volume[:, np.newaxis]                      # increase dimension (1x150 --> 150x1)
out = np.hstack([iris_2d, volume])                  # concat column at end


In [44]:
# 42. [Probabilistic sampling] Randomly sample species such that setose is 2x the number of else

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='str')

species = iris[:,4]
a = np.unique(species)

species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])



In [45]:
# 43. How to get the second largest value of an array when grouped by another array? 
## What is value of second longest petallength of setosa? 

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Solution
setosa = iris[ iris[:,4]==b'Iris-setosa'] # isolate setosa
pet_len_set = setosa[:,2].astype('float') # isolate setosa petal length
np.sort(np.unique(pet_len_set))[-2] # sort unique values and isolate 2nd largest


1.7

In [47]:
# 44. Sort a 2D array by a column
## Sort iris dataset based on sepallength column

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Solution
idx = iris[:,0].argsort(axis=0)
iris[idx][:10]


array([[b'4.3', b'3.0', b'1.1', b'0.1', b'Iris-setosa'],
       [b'4.4', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.4', b'3.0', b'1.3', b'0.2', b'Iris-setosa'],
       ...,
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'4.6', b'3.2', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

In [48]:
# 45. Find most frequent value in array
## Find most frequent value of petal length

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Solution
values, counts = np.unique(iris[:,2], return_counts=True)
values[ np.argmax(counts) ] # np.argmax return idx of largest value


b'1.5'

In [49]:
# 46. Find position of 1st occurance of value greater than given value
## Find position of 1st petalwidth greater than 1

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
petalwidth = iris[:,3].astype('float')

# My Solution
idx = np.where(petalwidth>1)[0] # array of idxs
idx.tolist()[0] # 1st value

# Alt Solution
idx = np.argwhere(petalwidth > 1) # nested array
idx = idx.tolist() # list of arrays
idx[0][0] # 1st value


50

#### 47.  Replace all values greater than a given value to a given cutoff

`np.clip(arr, a_min=X, a_max=Y)`: Limits min and max of array
- Replaces vals<X with X
- Replaces vals>Y with Y


In [52]:
## Replace all values greater than 30 to 30 and less than 10 to 10

# Input
np.random.seed(100)
arr = np.random.uniform(1,50,20)

# My Solution
np.where(arr<10, 10, 
        np.where(arr>30, 30, a))

# Alt Solution
np.clip(arr, a_min=10, a_max=30)

array([27.627, 14.64 , 21.801, ..., 10.   , 30.   , 14.43 ])

#### 48.  Get idx of top values from an array

`np.partition(arr, kth)`: Takes kth el == kth el in sorted order
- Retuns array
    - All smaller els moved before kth el
    - All larger items moved after kth el
Example: 
- Code
    - a = (6,5,4,3,2,1,0)
    - sorted(a) = (0,1,2,3,4,5,6)
    - np.partition(a,4) = (2, 0, 1, 3, 4, 5, 6)
- Explained
    - kth = 4
    - sorted(a)[kth] = 4 (4th item in sorted list)
    - partition[kth] = sorted(a)[kth]
    - partition[:kth] = all values < kth
    - partition[kth:] = values >= kth
    
`np.argpartition(arr, kth)`: Returns indices
Example:
- Code
    - a = (6,5,4,3,2,1,0)
    - sorted(a) = (0,1,2,3,4,5,6)
    - np.argpartition(a,4) = idx (4,6,5,3,2,1,0)
    - a[idx] = vals (2, 0, 1, 3, 4, 5, 6)

In [121]:
# Input
np.random.seed(100)
a = np.random.uniform(1,50, 20)

# My Solution
idx = a.argsort() #idx of values sorted (ascending)
top5_idx = idx[-5:] # idx of 5 largest vals
print('My Solution: ', top5_idx)

# Alt Solution
top5_idx = np.argpartition(-a,5)[:5]
print('Alt Solution: ', top5_idx)

My Solution:  [18  7  3 10 15]
Alt Solution:  [15 10  3  7 18]


#### 49.  Compute row-wise counts of all possible values in array

`np.bincount(arr)`: Counts occurances of each value
- x = np.array([1,2,2,3]
- np.bincount(x) = array( [0,1,2,1] )
    - i=2 corresponds to occurance of 2 in x
    

In [27]:
# Input
np.random.seed(100) # set seed
arr = np.random.randint(1,11,size=(6,10))

# My Solution
[np.bincount(arr[i])[1:].tolist() for i in range(arr.shape[0])]


[[1, 0, 2, 1, 1, 1, 0, 2, 2],
 [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
 [0, 3, 0, 2, 3, 1, 0, 1],
 [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
 [2, 2, 2, 0, 0, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]]

#### 50.  Convert array of arrays into flat 1D array

`np.concatenate(nest_arr)`: Joins sequence of arrays along existing axis

In [45]:
# Input
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)
arrs = np.array([arr1, arr2, arr3])

np.concatenate(arrs)



array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])