### Package Imports

In [1]:
import numpy as np

# NumPy Arrays & Properties

- ndim – the number of dimensions (axes) in the array
- shape – the size of the array for each dimension
- size – the total number of elements in the array
- dtype – the data type of the elements in the array

In [2]:
sales = [0, 5, 155]

sales_array = np.array(sales)

type(sales_array)

numpy.ndarray

In [3]:
print('ndim: ' + str(sales_array.ndim))

ndim: 1


In [4]:
print(f"ndim: {sales_array.ndim}")

ndim: 1


In [5]:
array = np.array(range(5))

array

array([0, 1, 2, 3, 4])

In [6]:
array_2d = np.array([range(5), range(5)])

array_2d

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [7]:
print(array_2d.ndim)

print(array_2d.shape)

print(array_2d.size)

print(array_2d.dtype)

2
(2, 5)
10
int64


In [8]:
# Transpose
array_2d.T

array([[0, 0],
       [1, 1],
       [2, 2],
       [3, 3],
       [4, 4]])

In [9]:
print(array.ndim)

print(array.shape)

print(array.size)

print(array.dtype)

1
(5,)
5
int64


# Assignment 1: Array Basics

Hi there,

Can you import Numpy and convert the following list comprehension (I just learned about comprehensions in an awesome course by Maven) into an array?

Once you've done that report the following about the array:
* The number of dimensions  (ndim)
* The shape (shape, size of array for each dimension)
* The number of elements in the array (size)
* The type of data contained inside (dtype)

In [10]:
my_list = [x * 10 for x in range(1, 11)]

my_list

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [11]:
np_array = np.array(my_list).reshape(2,5)

np_array

array([[ 10,  20,  30,  40,  50],
       [ 60,  70,  80,  90, 100]])

In [12]:
print(f"Dimensions: {np_array.ndim}")

print(f"Shape: {np_array.shape}")

print(f"Size: {np_array.size}")

print(f"Data type: {np_array.dtype}")

Dimensions: 2
Shape: (2, 5)
Size: 10
Data type: int64


# Array Creation

#### Functions to create arrays

- ones: creates an array of ones of a given size, as float by default
    - np.ones((rows, cols), dtype)
- zeros: Creates an array of zeros of a given size, as float by default
    - np.zeros((rows, cols), dtype)
- arrange: Creates an array of integers with given start & stop values, and a step size (only stop is required, and is not inclusive)
    - np.arange(start, stop, step)
- linspace: Creates an array of floats with given start & stop values with n elements, separated by a consistent step size (stop is inclusive)
    - np.linspace(start, stop, n)
- reshape: Changes an array into the specified dimensions, if compatible
    - np.array.reshape(rows, cols)

In [13]:
# Create an array of ones
np.ones(5)

array([1., 1., 1., 1., 1.])

In [14]:
# Create an array of ones with explicit data type
np.ones(2, dtype='int')

array([1, 1])

In [15]:
# Create an array of zeros
np.zeros(100, dtype='int').reshape(5,20)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [16]:
# Create an array of zeros, reshape within function argument 1, or Transpose with T
np.zeros((5, 20), dtype='int')

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [17]:
# Matrix math
np.identity(10).reshape(5, 20).T

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [18]:
# Arrange: Start, Stop, Step
np.arange(10).reshape(5,2)

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [19]:
np.linspace(0, 100, 11, dtype='int')

array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

In [20]:
np.linspace(0, 64, 8, dtype='int')

array([ 0,  9, 18, 27, 36, 45, 54, 64])

# Random Number Arrays

Create random number arrays from a variety of distributions. Great for sampling and simulation.

- default_rng: creates a random number generator (seed for reproducibility)
    - np.default_rng(seed)
- random: returns n random numbers from a uniform distribution between 0 and 1
    - rng.random(n)
- normal: returns n random numbers from a normal distribution with a given mean and standard deviation
    - rng.normal(mean, stdev, n)
    
    
- 'rng' is the standard variable name for the default_rng number generator

In [21]:
# import 
from numpy.random import default_rng

# create a random number generator with a seed of 12345 and assigning it
rng = default_rng(12345)

# use the random method to return an array with 10 random numbers
random_array = rng.random(10)

random_array

array([0.22733602, 0.31675834, 0.79736546, 0.67625467, 0.39110955,
       0.33281393, 0.59830875, 0.18673419, 0.67275604, 0.94180287])

In [22]:
# import 
from numpy.random import default_rng

# create a random number generator with a seed of 12345 and assigning it
rng = default_rng(12345)

mean, stddev = 5, 1

random_normal = rng.normal(mean, stddev, size=10)

random_normal

array([3.57617496, 6.26372846, 4.12933826, 4.74082677, 4.92465669,
       4.25911535, 3.6322073 , 5.6488928 , 5.36105811, 3.04713694])

**IMPORTANT**
- make sure to set a seed when generating random numbers to ensure you and others can recreate the work you've done (the value for the seed is less import)

In [23]:
rng = np.random.default_rng(616)

# Generate random numbers
rng.random(10)

array([0.39682145, 0.86568572, 0.46040359, 0.30599848, 0.57381588,
       0.08888468, 0.88194347, 0.73228387, 0.73215182, 0.56233394])

In [24]:
# Generate integers

rng.integers(0, 10, 100)

array([8, 3, 6, 0, 3, 3, 1, 2, 2, 4, 9, 1, 2, 5, 1, 8, 9, 0, 3, 1, 5, 8,
       0, 6, 1, 7, 0, 7, 6, 0, 3, 7, 1, 9, 4, 1, 6, 1, 4, 4, 9, 5, 3, 4,
       7, 8, 3, 3, 5, 0, 4, 9, 9, 5, 6, 5, 8, 2, 3, 0, 0, 8, 3, 8, 4, 8,
       8, 9, 7, 3, 8, 4, 9, 2, 6, 3, 0, 3, 2, 0, 0, 4, 9, 5, 4, 1, 8, 2,
       7, 7, 1, 2, 2, 4, 1, 6, 7, 3, 7, 5])

In [25]:
# Generate numbers with normal distribution
rng.normal(50, 5, 10)

array([53.53115044, 46.35468178, 46.1580894 , 52.89177118, 45.90693639,
       44.51748741, 57.41467712, 51.82624671, 53.9032125 , 53.21454641])

# Assignment 2: Array Creation

Thanks for your help with the first piece - I'm starting to understand some of the key differences between base Python data types and NumPy arrays. 

Does NumPy have anything like the range() function from base Python?

If so: 
* create the same array from assignment 1 using a NumPy function. 
* Make it 5 rows and 2 columns. 
* It's ok if the datatype is float or int.

In [26]:
# arange creates array as int and stop is NOT inclusive
np_array = np.arange(10, 101, 10).reshape(5, 2)

np_array

array([[ 10,  20],
       [ 30,  40],
       [ 50,  60],
       [ 70,  80],
       [ 90, 100]])

In [27]:
# arange creates array as int and stop is NOT inclusive
np_array = (np.arange(1, 11) * 10).reshape(5, 2)

np_array

array([[ 10,  20],
       [ 30,  40],
       [ 50,  60],
       [ 70,  80],
       [ 90, 100]])

In [28]:
# linspace creates array as float and stop IS inclusive
np_array_2 = np.linspace(10, 100, 10).reshape(5, 2)

np_array_2

array([[ 10.,  20.],
       [ 30.,  40.],
       [ 50.,  60.],
       [ 70.,  80.],
       [ 90., 100.]])

Looking good so far! One of our data scientists asked about random number generation in NumPy.

Can you create a 3x3 array of random numbers between 0 and 1? Use a random state of 2022.

Store the random array in a variable called `random_array`.

In [29]:
rng = default_rng(2022)

random_array = rng.random((3, 3)) #.reshape(3, 3)

random_array

array([[0.24742606, 0.09299006, 0.61176337],
       [0.06066207, 0.66103343, 0.75515778],
       [0.1108689 , 0.04305584, 0.41441747]])

# Indexing & Slicing Arrays

Indexing & slicing **ONE-dimensional** arrays is the same as base Python
   - array[index] = indexing to access a single element (0-indexed)
   - array[start:stop:step size] = slicing to access a series of elements (stop is NOT inclusive)

In [30]:
product = ['fruits', 'vegetables', 'cereal', 'dairy', 'eggs', 'snacks',
                 'beverages', 'coffee', 'tea', 'spices']

In [31]:
product_array = np.array(product)

product_array

array(['fruits', 'vegetables', 'cereal', 'dairy', 'eggs', 'snacks',
       'beverages', 'coffee', 'tea', 'spices'], dtype='<U10')

In [32]:
# first element
print(product_array[1])
# last element
print(product_array[-1])

vegetables
spices


In [33]:
# first 5 elements
print(product_array[:5])

['fruits' 'vegetables' 'cereal' 'dairy' 'eggs']


In [34]:
# start at 6th element and grab every other element until end of array
print(product_array[5::2])

['snacks' 'coffee' 'spices']


Indexing & slicing **TWO-dimensional** arrays requires an extra index or slice
   - array[row index, column index] = indexing to access a single element (0-indexed)
   - array[start:stop:step size, start:stop:step size] = slicing to access a series of elements (stop is NOT inclusive)

In [35]:
product_2D = (['fruits', 'vegetables', 'cereal', 'dairy', 'eggs'], 
           ['snacks', 'beverages', 'coffee', 'tea', 'spices'])

product_array2D = np.array(product_2D)

product_array2D

array([['fruits', 'vegetables', 'cereal', 'dairy', 'eggs'],
       ['snacks', 'beverages', 'coffee', 'tea', 'spices']], dtype='<U10')

In [36]:
# second row and 3 column element
product_array2D[1, 2]

'coffee'

In [37]:
# go to all rows and grab all elements starting from the 3rd in each row
product_array2D[:, 2:]

array([['cereal', 'dairy', 'eggs'],
       ['coffee', 'tea', 'spices']], dtype='<U10')

In [38]:
# second row and grab all elements
product_array2D[1:,:]

array([['snacks', 'beverages', 'coffee', 'tea', 'spices']], dtype='<U10')

# Assignment 3: Accessing Array Data


Slice and index the `random_array` we created in the previous exercise. Perform the following:

* Grab the first two 'rows' of the array
* Grab the entire first column
* Finally, grab the second element of the third row.

Thanks!


In [39]:
random_array

array([[0.24742606, 0.09299006, 0.61176337],
       [0.06066207, 0.66103343, 0.75515778],
       [0.1108689 , 0.04305584, 0.41441747]])

In [40]:
random_array.ndim

2

In [41]:
# first 2 rows of the array
random_array[:2, :]

array([[0.24742606, 0.09299006, 0.61176337],
       [0.06066207, 0.66103343, 0.75515778]])

In [42]:
# entire first column
random_array[:, 0]

array([0.24742606, 0.06066207, 0.1108689 ])

In [43]:
# second element of the third row
random_array[2, 1]

0.04305584439252108

# Array Operations

Arithmetic operators can be used to perform array operations.

In [44]:
# create an array
sales = [[0, 5, 155, 0, 518], [0, 1827, 616, 317, 325]]

sales_array = np.array(sales)

sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [45]:
# add 2 to every element in the array
sales_array + 2

array([[   2,    7,  157,    2,  520],
       [   2, 1829,  618,  319,  327]])

In [46]:
# quantity is first row of array
quantity = sales_array[0, :]

# price is second row of array
price = sales_array[1, :]

# calculate profit (multiply operates on 2 arrays of the same length)
quantity * price

array([     0,   9135,  95480,      0, 168350])

**Array operations are applied via vectorization and broadcasting, which elimiates the need to loop through the array's elements**

In [47]:
# create random number generator

rng = np.random.default_rng(616)

# generate 10 numbers
inventory = rng.integers(0, 100, 10)

# print array
inventory

array([39, 39, 93, 86, 48, 46, 48, 30, 11, 57])

In [48]:
# subtract 24 from inventory
inventory - 24

array([ 15,  15,  69,  62,  24,  22,  24,   6, -13,  33])

In [49]:
# cut inventory in half
inventory / 2

array([19.5, 19.5, 46.5, 43. , 24. , 23. , 24. , 15. ,  5.5, 28.5])

In [50]:
# cut inventory in half
(inventory / 2).dtype

dtype('float64')

In [51]:
# random number of price
price = (rng.random(10) * 10).round(2)

price

array([0.89, 8.82, 7.32, 7.32, 5.62, 3.4 , 0.63, 3.57, 2.03, 4.31])

In [52]:
# calculate profit
price * inventory

array([ 34.71, 343.98, 680.76, 629.52, 269.76, 156.4 ,  30.24, 107.1 ,
        22.33, 245.67])

In [53]:
# use sum to get total inventory
(price * inventory).sum()

2520.4700000000003

In [54]:
# convert inventory to list
inventory_list = list(inventory)

# base python list
inventory_list

[39, 39, 93, 86, 48, 46, 48, 30, 11, 57]

In [55]:
# add 2 to each element in the list in base python

# method 1: use for loop and append +2
# create empty list
new_inventory = []

# create loop to append 2 to each element in list
for x in inventory_list:
    new_inventory.append(x + 2)
    
new_inventory

[41, 41, 95, 88, 50, 48, 50, 32, 13, 59]

In [56]:
# method 2: use list comprehension to add +2
[x + 2 for x in inventory_list]

[41, 41, 95, 88, 50, 48, 50, 32, 13, 59]

In [57]:
# multiply price x inventory in a list

[x * y for x, y in zip(inventory_list, price)]

[34.71,
 343.98,
 680.76,
 629.52,
 269.76,
 156.4,
 30.240000000000002,
 107.1,
 22.33,
 245.67]

In [58]:
# numpy operators are more concise and efficient in code
inventory + 2

array([41, 41, 95, 88, 50, 48, 50, 32, 13, 59])

# Assignment 4: Arithmetic Operations

The creativity of our marketing team knows no bounds!

They've asked us to come up with a simple algorithm to provide a random discount to our list of prices below. 

Before we do that, 

* Add a 5 dollar shipping fee to each price. Call this array `total`.

Once we have that, we want to use the random_array created in assignment 2 and apply them to the 6 prices.

* Grab the first 6 numbers from `random_array`, reshape it to one dimension. Call this `discount_pct`.
* Subtract `discount_pct` FROM 1, store this in `pct_owed`.
* Multiply `pct_owed` by `total` to get the final amount owed.

In [59]:
prices = np.array([5.99, 6.99, 22.49, 99.99, 4.99, 49.99])

In [60]:
# add 5 dollar shipping fee to each price
total = prices + 5

total

array([ 10.99,  11.99,  27.49, 104.99,   9.99,  54.99])

In [61]:
# grab first 6 numbers from random array and reshape to 1 dimension
# slice first 2 rows and all columns
discount_pct = random_array[:2,:].reshape(6)

discount_pct

array([0.24742606, 0.09299006, 0.61176337, 0.06066207, 0.66103343,
       0.75515778])

In [62]:
# calculate percentage owed
pct_owed = 1 - discount_pct

pct_owed

array([0.75257394, 0.90700994, 0.38823663, 0.93933793, 0.33896657,
       0.24484222])

In [63]:
# final amount owed
final_owed = pct_owed * total

final_owed.round(2)

array([ 8.27, 10.88, 10.67, 98.62,  3.39, 13.46])

In [64]:
# combine logic into a single operation

((1 - random_array[:2,:].reshape(6)) * total).round(2)

array([ 8.27, 10.88, 10.67, 98.62,  3.39, 13.46])

In [65]:
# final results
print(f"Discount %: {discount_pct}")
print(f"% Owed: {pct_owed}")
print(f"Total price: {total}")
print(f"Final Owed: {final_owed.round(2)}")

Discount %: [0.24742606 0.09299006 0.61176337 0.06066207 0.66103343 0.75515778]
% Owed: [0.75257394 0.90700994 0.38823663 0.93933793 0.33896657 0.24484222]
Total price: [ 10.99  11.99  27.49 104.99   9.99  54.99]
Final Owed: [ 8.27 10.88 10.67 98.62  3.39 13.46]


# Filtering Arrays

Filter arrays by indexing them with a logical test

In [66]:
sales_array

array([[   0,    5,  155,    0,  518],
       [   0, 1827,  616,  317,  325]])

In [67]:
# perform a logical test on the array to return a Boolean array witht eh results of the lofical test
sales_array != 0

array([[False,  True,  True, False,  True],
       [False,  True,  True,  True,  True]])

In [68]:
# index an array with a Boolean array returns an array with the elements where Boolean value is True
sales_array[sales_array != 0]

array([   5,  155,  518, 1827,  616,  317,  325])

Filter with multiple logical tests
- use | for OR conditions and & for AND conditions

In [69]:
# return an array with elements equal to 616 or less than 100
sales_array[(sales_array == 616) | (sales_array < 100)]

array([  0,   5,   0,   0, 616])

In [70]:
# return an array with elements greater than 100 AND less than 500, not inclusive
sales_array[(sales_array >100) & (sales_array <500)]

array([155, 317, 325])

In [71]:
# assign complex logical statement to a variable named Boolean 'mask'
mask = (sales_array > 100) & (sales_array < 500)

# returns when boolean test returns true
sales_array[mask]

array([155, 317, 325])

Filter arrays based on values in other arrays
- use boolean array returned from the other array to index the array you want to filter
- **remember**: both arrays dimension and size needs to be the same

In [72]:
sales_array2 = sales_array[0]

sales_array2

array([  0,   5, 155,   0, 518])

In [73]:
product_array2 = product_array[:5]

product_array2

array(['fruits', 'vegetables', 'cereal', 'dairy', 'eggs'], dtype='<U10')

In [74]:
# filter an array with products greater than 0 sales
product_array2[sales_array2 > 0]

array(['vegetables', 'cereal', 'eggs'], dtype='<U10')

# Modify array values by assigning new ones

In [75]:
sales_array2

array([  0,   5, 155,   0, 518])

In [76]:
sales_array2[1] = 25

sales_array2

array([  0,  25, 155,   0, 518])

In [79]:
my_array = np.arange(20)

my_array

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [81]:
# logical test to return True
# if remainder of dividing by 2 is equal to 0 = True
my_array % 2 == 0

array([ True, False,  True, False,  True, False,  True, False,  True,
       False,  True, False,  True, False,  True, False,  True, False,
        True, False])

In [82]:
# filter array based on logical test
my_array[my_array % 2 == 0]

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [86]:
# create an even odd array
even_odd = np.array(['even', 'odd'] * 10)

even_odd

array(['even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd'], dtype='<U4')

In [87]:
# define logical test
even_odd != 'odd'

array([ True, False,  True, False,  True, False,  True, False,  True,
       False,  True, False,  True, False,  True, False,  True, False,
        True, False])

In [89]:
# return even elements only
even_odd[even_odd != 'odd']

array(['even', 'even', 'even', 'even', 'even', 'even', 'even', 'even',
       'even', 'even'], dtype='<U4')

In [92]:
# set even numbers in array equal to zero; use logic to modify arrays
my_array[even_odd != 'odd'] = 0

my_array

array([ 0,  1,  0,  3,  0,  5,  0,  7,  0,  9,  0, 11,  0, 13,  0, 15,  0,
       17,  0, 19])

In [94]:
# set even numbers in array equal to zero; use logic to modify arrays
my_array[2] = 2

my_array

array([ 0,  1,  2,  3,  0,  5,  0,  7,  0,  9,  0, 11,  0, 13,  0, 15,  0,
       17,  0, 19])

In [98]:
# two logial test or condition
mask = (even_odd != 'odd') | (even_odd != 'even')
even_odd[mask]

array(['even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd'], dtype='<U4')

# Where Function

where() NumPy function performs a logical test and returns a given value if the test is True, or another if the test if False

np.where(logical test, value if True, value if False)

In [101]:
# create inventory array
inventory = [12, 102, 18, 0, 0]

inventory_array = np.array(inventory)

inventory_array

array([ 12, 102,  18,   0,   0])

In [103]:
product_array2

array(['fruits', 'vegetables', 'cereal', 'dairy', 'eggs'], dtype='<U10')

In [105]:
# logical test if inventory is <= 0, then assign "out of stock", if else, assign "in stock"
np.where(inventory_array <=0, "Out of Stock", "In Stock")

array(['In Stock', 'In Stock', 'In Stock', 'Out of Stock', 'Out of Stock'],
      dtype='<U12')

In [107]:
my_array

array([ 0,  1,  2,  3,  0,  5,  0,  7,  0,  9,  0, 11,  0, 13,  0, 15,  0,
       17,  0, 19])

In [106]:
np.where(my_array % 2 ==0, 'even', 'odd')

array(['even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd'], dtype='<U4')

In [109]:
# assign 9 if remainder is 9
np.where(my_array % 2 == 0, 'even', np.where(my_array == 9, my_array, 'odd'))

array(['even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even',
       '9', 'even', 'odd', 'even', 'odd', 'even', 'odd', 'even', 'odd',
       'even', 'odd'], dtype='<U21')

# Assignment 5: Filtering Arrays

Filter the product array to only include those with prices greater than 25.

Modify your logic to include cola, despite it not having a price greater than 25. 
Store the elements returned in an array called `fancy_feast_special`.

Next, create a shipping cost array where the cost is 0 if price is greater than 20, and 5 if not. 

In [110]:
products = np.array(
    ["salad", "bread", "mustard", "rare tomato", "cola", "gourmet ice cream"]
)

products

array(['salad', 'bread', 'mustard', 'rare tomato', 'cola',
       'gourmet ice cream'], dtype='<U17')

In [114]:
prices

array([ 5.99,  6.99, 22.49, 99.99,  4.99, 49.99])

In [115]:
products[prices > 25]

array(['rare tomato', 'gourmet ice cream'], dtype='<U17')

In [119]:
mask = (prices > 25) | (products == 'cola')

fancy_feast_special = products[mask]

fancy_feast_special

array(['rare tomato', 'cola', 'gourmet ice cream'], dtype='<U17')

In [122]:
shipping_cost = np.where(prices > 20, 0, 5)

shipping_cost

array([5, 5, 0, 0, 5, 0])

# Array Aggregation Methods

Array aggregation methods let you calculate metrics like sum, mean and max

- array.sum() = returns the sum of all values in an array
- array.max() = returns the max of all values in an array
- array.min() = returns the smallest value in an array
- array.mean() = returns the average of the values in an array

In [123]:
rng = np.random.default_rng(616)

price = (rng.random(10) * 10).round(2)

price

array([3.97, 8.66, 4.6 , 3.06, 5.74, 0.89, 8.82, 7.32, 7.32, 5.62])

In [126]:
inventory = rng.integers(0, 100, 10)

inventory

array([52, 89,  4, 66, 16, 72,  9, 76, 64,  2])

In [127]:
inventory.mean()

45.0

In [128]:
inventory.std()

31.849646779831012

In [131]:
# calculate revenue
(price * inventory)

array([206.44, 770.74,  18.4 , 201.96,  91.84,  64.08,  79.38, 556.32,
       468.48,  11.24])

In [132]:
# sum of total revenue
(price * inventory).sum()

2468.88

In [133]:
# index of least valuable product
(price * inventory).argmin()

9

In [134]:
# index of most valuable product
(price * inventory).argmax()

1

In [135]:
# aggregate two dimension arrays

price_2d = price.reshape(5, 2)

price_2d

array([[3.97, 8.66],
       [4.6 , 3.06],
       [5.74, 0.89],
       [8.82, 7.32],
       [7.32, 5.62]])

In [136]:
# sum of all rows in first and second column
price_2d.sum(axis=0)

array([30.45, 25.55])

In [137]:
# sum across all columns in each row
price_2d.sum(axis=1)

array([12.63,  7.66,  6.63, 16.14, 12.94])

# Array Functions

Array functions let you perform other aggregations like median and percentiles

In [138]:
# returns the median value
np.median(sales_array)

236.0

In [139]:
# returns a value in the nth percentile in an array
np.percentile(sales_array, 90)

737.0999999999996

In [140]:
# returns unique values
np.unique(sales_array)

array([   0,   25,  155,  317,  325,  518,  616, 1827])

In [141]:
# returns the square root of each value in an array
np.sqrt(sales_array)

array([[ 0.        ,  5.        , 12.4498996 ,  0.        , 22.75961335],
       [ 0.        , 42.74342055, 24.81934729, 17.80449381, 18.02775638]])

## Sorting Arrays

sort() method will sort arrays in place
- use the axis arguent to specify the dimension to sort by

In [144]:
# sort by axis = 1 by default, which sorts a two-dimensional array row by row
sales_array.sort()

sales_array


array([[   0,    0,   25,  155,  518],
       [   0,  317,  325,  616, 1827]])

In [151]:
product_value = np.array([205.4, 158.4, 138.4, 186.24, 6.24, 
                           202.92, 106.44, 3.68, 26.15, 403.2])

product_value

array([205.4 , 158.4 , 138.4 , 186.24,   6.24, 202.92, 106.44,   3.68,
        26.15, 403.2 ])

In [152]:
# the sort FUNCTION will return a copy of the sorted array
np.sort(product_value)

array([  3.68,   6.24,  26.15, 106.44, 138.4 , 158.4 , 186.24, 202.92,
       205.4 , 403.2 ])

In [155]:
# the sort METHOD will sort arrays in place
product_value.sort()

product_value

array([  3.68,   6.24,  26.15, 106.44, 138.4 , 158.4 , 186.24, 202.92,
       205.4 , 403.2 ])

In [156]:
# min
product_value[0]

3.68

In [157]:
# max
product_value[-1]

403.2

In [160]:
# descending order using negative step slice
product_value[::-1]

array([403.2 , 205.4 , 202.92, 186.24, 158.4 , 138.4 , 106.44,  26.15,
         6.24,   3.68])

# Assignment 6: Aggregating and Sorting Arrays

First, grab the top 3 highest priced items in our list. 

Then, calculated the mean, min, max, and median of the top three prices.

Finally, calculate the number of unique price tiers in our `price_tiers` array.

In [161]:
prices = np.array([5.99, 6.99, 22.49, 99.99, 4.99, 49.99])

prices

array([ 5.99,  6.99, 22.49, 99.99,  4.99, 49.99])

In [182]:
# grab the top 3 highest priced items in the list
prices.sort()

top3_prices = prices[:-4:-1]

top3_prices

array([99.99, 49.99, 22.49])

In [197]:
print(f"Mean: {top3_prices.min()}")
print(f"Max: {top3_prices.max()}")
print(f"Min: {top3_prices.min()}")
print(f"Median: {np.median(top3_prices)}")

Mean: 22.49
Max: 99.99
Min: 22.49
Median: 49.99


In [189]:
price_tiers = np.array(["budget", "budget", "mid-tier", "luxury", "mid-tier", "luxury"])

price_tiers

array(['budget', 'budget', 'mid-tier', 'luxury', 'mid-tier', 'luxury'],
      dtype='<U8')

In [191]:
np.unique(price_tiers)

array(['budget', 'luxury', 'mid-tier'], dtype='<U8')

# Vectorization

Vectorization is the process of pushing array operations into optimized C code, which is easier and more efficient than writing for loops.

Use vectorized operations whenever possible when manipulating data, and avoid writing loops. Use functions in NumPy and Pandas.

# Broadcasting



Broadcasting lets you perform vectorized operations with arrays of different sizes, where NumPy will expand the smaller array to 'fit' the larger one.

- single values (scalars) can be broadcast into arrays of any dimension
- dimensions with a length greater than one must be the same size

# Assignment 7: Bringing it All Together

Ok, final NumPy task - let's read in some data with the help of Pandas.

Our data scientist provided the code to read in a csv as a Pandas dataframe, and has converted the two columns of interest to arrays.

* Filter `sales_array` down to only sales where the product family was produce. 

* Then, randomly sample roughly half (random number < .5) of the produce sales and report the mean and median sales. Use a random seed of 2022.

* Finally, create a new array that has the values 'above_both', 'above_median', and 'below_both' based on whether the sales were above the median and mean of the sample, just above the median of the sample, or below both the median and mean of the sample. 

In [198]:
import pandas as pd
import numpy as np

retail_df = pd.read_csv(
    "../retail/retail_2016_2017.csv", skiprows=range(1, 11000), nrows=1000
)

family_array = np.array(retail_df["family"])
sales_array = np.array(retail_df["sales"])

In [206]:
np.unique(family_array)

array(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY',
       'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES',
       'SEAFOOD'], dtype=object)

In [201]:
sales_array.shape

(1000,)

In [219]:
# Filter sales where family product is produce
produce_sales = sales_array[family_array =='PRODUCE']

print(produce_sales.shape)

produce_sales

(30,)


array([1662.394,  447.064, 2423.944,  962.866, 1236.404,  298.441,
       1077.44 , 3404.531,  962.96 ,  279.505, 1852.786, 1089.319,
        726.516, 7860.031,  446.038, 1155.385,  120.202,  862.092,
        473.952,  254.263, 1272.755, 2775.771, 2030.762, 1657.432,
       2339.906,  722.333, 1567.843, 2458.456,  673.885, 8834.15 ])

In [222]:
rng = np.random.default_rng(2022)

random_array = rng.random(30)

produce_sample = produce_sales[random_array < 0.5]

produce_sample

array([1662.394,  447.064,  962.866, 1077.44 , 3404.531,  962.96 ,
       1089.319, 7860.031,  446.038, 1272.755, 2775.771, 2339.906,
        722.333, 1567.843, 2458.456,  673.885, 8834.15 ])

In [227]:
mean = produce_sample.mean()

mean

2268.102470588235

In [230]:
median = np.median(produce_sample)

median

1272.755

Finally, create a new array that has the values 'above_both', 'above_median', and 'below_both' based on whether the sales were above the median and mean of the sample, just above the median of the sample, or below both the median and mean of the sample.

In [233]:
compare_array = np.where(produce_sample < median, 'below_both',
         np.where(produce_sample > mean, 'above_both', 'above_median'),)

compare_array

array(['above_median', 'below_both', 'below_both', 'below_both',
       'above_both', 'below_both', 'below_both', 'above_both',
       'below_both', 'above_median', 'above_both', 'above_both',
       'below_both', 'above_median', 'above_both', 'below_both',
       'above_both'], dtype='<U12')

In [234]:
compare_array = np.where(((produce_sample < median) & (produce_sample < mean)), 'below_both',
         np.where(produce_sample > mean, 'above_both', 'above_median'),)

compare_array

array(['above_median', 'below_both', 'below_both', 'below_both',
       'above_both', 'below_both', 'below_both', 'above_both',
       'below_both', 'above_median', 'above_both', 'above_both',
       'below_both', 'above_median', 'above_both', 'below_both',
       'above_both'], dtype='<U12')

In [235]:
produce_sample

array([1662.394,  447.064,  962.866, 1077.44 , 3404.531,  962.96 ,
       1089.319, 7860.031,  446.038, 1272.755, 2775.771, 2339.906,
        722.333, 1567.843, 2458.456,  673.885, 8834.15 ])

In [236]:
print(mean)
print(median)

2268.102470588235
1272.755


# Key Takewaways

- NumPy forms the foundation for Pandas
- NumPy arrays are more efficent than base Python lists and tuples
- Array operations let you aggregate, filter, and sort data
    - broadcasting and vectorization make these operations convenient and efficient without the use of loops
    - the syntax for NumPy array operations is very similar to Pandas

Completed on Jan 5th, 2023