# Introduction

- Purpose: learn how to effectively load, store, and manipulate in-memory data in Python

- Best to think of all data fundamentally as **arrays of numbers**

- Images can be thought of as 2D arrays: pixel brightness across the area

- Sound clips as 1D arrays: intensity versus time

- Text can be converted to numbers in various ways, e.g. binary digits representing the frequency of certain words or pairs of words

- Regardless of the data, first step is to convert it to an array of numbers

- Efficient storage and manipulation of numerical arrays is fundamental to the process of doing data science
    - NumPy package and Pandas Package are specialized tools to handle such numerical arrays
- NumPy arrays are similar to Python's built-in `list` type
    - But NumPy arrays provide much more efficient storage and data operations as arrays grow larger


In [4]:
import numpy
# Echo the installed NumPy version; as the cell's last expression it is displayed automatically.
numpy.__version__






'1.26.4'

## Understanding Data Types in Python



In [3]:
import array

# Start from an ordinary Python list holding the integers 0-9 ...
L = [*range(10)]
# ... and pack it into a fixed-type array; type code 'i' is a signed C int.
A = array.array('i', L)
A

array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
import numpy as np

# Only the final expression of a cell is echoed, so the first three arrays
# below are constructed but not displayed.

# Integer array built from a list of ints
np.array([1, 4, 2, 5, 3])

# A single float in the input upcasts every element to floating point
np.array([3.14, 1, 2, 3, 4])

# The element type can also be requested explicitly
np.array([1, 3, 7, 8], dtype=np.float32)

# Nested sequences yield a multi-dimensional array:
# each inner range becomes one row of the 2D result
np.array([range(start, start + 3) for start in (2, 4, 6)])

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

In [20]:
# 3x5 integer array filled with zeros
print(np.zeros((3, 5), dtype=int))
print("\n")
# length-10 float array filled with ones
print(np.ones(10, dtype=float))
# 5x2 array with every entry set to 3.14 (the f-string adds a leading blank line)
print(f"\n{np.full((5, 2), 3.14)}")
# np.arange mirrors the built-in range(): start, stop (exclusive), step
print(np.arange(5, 65, 3))
print("\n")
# seven evenly spaced values from 0 to 1, both endpoints included
print(np.linspace(0, 1, 7))
print("\n")
# 2x3 array of uniform random floats; this cell sets no seed, so the values
# depend on whatever RNG state earlier cells left behind
print(np.random.random((2, 3)))

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

[[3.14 3.14]
 [3.14 3.14]
 [3.14 3.14]
 [3.14 3.14]
 [3.14 3.14]]
[ 5  8 11 14 17 20 23 26 29 32 35 38 41 44 47 50 53 56 59 62]


[0.         0.16666667 0.33333333 0.5        0.66666667 0.83333333
 1.        ]


[[0.66718794 0.39318742 0.42840278]
 [0.51740212 0.36759477 0.01848693]]


In [2]:
import numpy as np

# Fix the global RNG seed so the same "random" values are drawn on every run
np.random.seed(0)

# Random integers in [0, 10) arranged in one, two and three dimensions
x1 = np.random.randint(0, 10, size=6)
x2 = np.random.randint(0, 10, size=(3, 4))
x3 = np.random.randint(0, 10, size=(3, 4, 5))

print(x2)      # the full 3x4 array
print(x2[2])   # a single integer index selects one whole row


[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
[1 6 7 7]


In [42]:
# Lay the integers 1-9 out as a 3x3 grid, filled row by row
grid = np.arange(1, 10).reshape(3, 3)
grid

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [7]:
# Build a 4x4 grid holding 0-15, then cut it into an upper and a lower half.
grid = np.arange(16).reshape((4, 4))

# np.vsplit splits along the first (row) axis; the index list [2] places the
# single cut before row 2, giving rows 0-1 and rows 2-3.
upper, lower = np.vsplit(grid, [2])
print(upper)
print(lower)

[[0 1 2 3]
 [4 5 6 7]]
[[ 8  9 10 11]
 [12 13 14 15]]


# Computations on NumPy Arrays

- the key to making fast computations is to use *vectorized* operations
- implemented through *universal functions* (ufuncs)

In [5]:
import numpy as np

x = np.arange(0, 4)

# Arithmetic operators act element-wise on the whole array. Only the final
# expression of the cell is echoed, so the first two results are not shown.
x
x + 5
15 * x

array([ 0, 15, 30, 45])

# Aggregations: Min, Max, and Everything Between


In [31]:
import numpy as np
# Seed the global RNG so the 100 sampled values are the same on every run.
np.random.seed(0)
# NOTE(review): this rebinds the name L, which an earlier cell used for a plain Python list.
L = np.random.random(100)
# Dumps all 100 values; the two sums below are the built-in and the NumPy result for comparison.
print(L)
print(sum(L))
print(np.sum(L))

# %timeit is an IPython magic, so this cell only runs inside a notebook/IPython session.
%timeit sum(L) #times the built-in sum, which iterates over the array element by element
%timeit np.sum(L) #times NumPy's sum (a reduction, not strictly a ufunc); expected to outpace the built-in, most clearly on large arrays



[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548  0.64589411
 0.43758721 0.891773   0.96366276 0.38344152 0.79172504 0.52889492
 0.56804456 0.92559664 0.07103606 0.0871293  0.0202184  0.83261985
 0.77815675 0.87001215 0.97861834 0.79915856 0.46147936 0.78052918
 0.11827443 0.63992102 0.14335329 0.94466892 0.52184832 0.41466194
 0.26455561 0.77423369 0.45615033 0.56843395 0.0187898  0.6176355
 0.61209572 0.616934   0.94374808 0.6818203  0.3595079  0.43703195
 0.6976312  0.06022547 0.66676672 0.67063787 0.21038256 0.1289263
 0.31542835 0.36371077 0.57019677 0.43860151 0.98837384 0.10204481
 0.20887676 0.16130952 0.65310833 0.2532916  0.46631077 0.24442559
 0.15896958 0.11037514 0.65632959 0.13818295 0.19658236 0.36872517
 0.82099323 0.09710128 0.83794491 0.09609841 0.97645947 0.4686512
 0.97676109 0.60484552 0.73926358 0.03918779 0.28280696 0.12019656
 0.2961402  0.11872772 0.31798318 0.41426299 0.0641475  0.69247212
 0.56660145 0.26538949 0.52324805 0.09394051 0.5759465  0.9292962

KeyboardInterrupt: 

In [36]:
import numpy as np

np.random.seed(0)
big_array = np.random.random(size=100)
print(big_array)

# Sum, maximum and minimum are available directly as array methods ...
print(big_array.sum(), big_array.max(), big_array.min())
# ... but the median is only offered as a module-level function:
# ndarray has no .median() method, so np.median(big_array) is the spelling to use.
print(np.median(big_array))
print(big_array.sum())

[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548  0.64589411
 0.43758721 0.891773   0.96366276 0.38344152 0.79172504 0.52889492
 0.56804456 0.92559664 0.07103606 0.0871293  0.0202184  0.83261985
 0.77815675 0.87001215 0.97861834 0.79915856 0.46147936 0.78052918
 0.11827443 0.63992102 0.14335329 0.94466892 0.52184832 0.41466194
 0.26455561 0.77423369 0.45615033 0.56843395 0.0187898  0.6176355
 0.61209572 0.616934   0.94374808 0.6818203  0.3595079  0.43703195
 0.6976312  0.06022547 0.66676672 0.67063787 0.21038256 0.1289263
 0.31542835 0.36371077 0.57019677 0.43860151 0.98837384 0.10204481
 0.20887676 0.16130952 0.65310833 0.2532916  0.46631077 0.24442559
 0.15896958 0.11037514 0.65632959 0.13818295 0.19658236 0.36872517
 0.82099323 0.09710128 0.83794491 0.09609841 0.97645947 0.4686512
 0.97676109 0.60484552 0.73926358 0.03918779 0.28280696 0.12019656
 0.2961402  0.11872772 0.31798318 0.41426299 0.0641475  0.69247212
 0.56660145 0.26538949 0.52324805 0.09394051 0.5759465  0.9292962

- can also use NumPy's *broadcasting* functionality, another means of *vectorizing* operations, to remove slow Python loops

- Broadcasting is simply a set of rules for applying ufuncs on arrays of different sizes

In [38]:
import numpy as np

a = np.arange(5)                    # shape (5,): the values 0..4
b = np.arange(10)[:, np.newaxis]    # shape (10, 1): the values 0..9 as a column

print(f"{a}\n")
print(f"{b}\n")
# Broadcasting stretches both operands to a common (10, 5) shape before adding
print(a + b)

[0 1 2 3 4]

[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]

[[ 0  1  2  3  4]
 [ 1  2  3  4  5]
 [ 2  3  4  5  6]
 [ 3  4  5  6  7]
 [ 4  5  6  7  8]
 [ 5  6  7  8  9]
 [ 6  7  8  9 10]
 [ 7  8  9 10 11]
 [ 8  9 10 11 12]
 [ 9 10 11 12 13]]
