# Python for Data Analysis

### Numpy

NumPy is a linear algebra library for Python. Most other data science libraries in Python rely on it. 

In [3]:
#import 
import numpy as np

#### Numpy Arrays

NumPy arrays can be 1-D (vectors), 2-D (matrices), or higher-dimensional (n-D).

###### Casting a List to a NumPy array

In [5]:
# 1D array: build a NumPy vector from a plain Python list

# Start from an ordinary list of numbers
myList = [1, 2, 3]
# Hand the list to np.array to get the equivalent NumPy array
arr1 = np.array(myList)
arr1

array([1, 2, 3])

In [71]:
# 2D array: the printed result opens and closes with two square brackets,
# one pair per dimension

# A list of rows, each row being a list of numbers
myMatrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
# Hand the nested list to np.array to get a 3x3 NumPy array
arr2 = np.array(myMatrix)
arr2

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

###### Creating an array with arange

`arange` is NumPy's built-in range function. It takes a start, a stop, and an optional step, and returns an array of the values from start up to, but not including, stop.

In [8]:
# Syntax: np.arange(start, stop, step) -- step is optional
# stop itself is never included in the result
np.arange(0,11,2)

array([ 0,  2,  4,  6,  8, 10])

###### Creating an array of zeros

In [9]:
# To create a vector, pass a single number (its length) as the argument
# The output below shows the zeros are floats (0.)
np.zeros(3)

array([0., 0., 0.])

In [10]:
# To create a matrix, pass a tuple describing its shape as the argument
np.zeros((3,3)) # np.zeros((rows,cols))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

###### Creating an array of ones 

In [11]:
# To create a vector, pass a single number (its length) as the argument
np.ones(4)

array([1., 1., 1., 1.])

In [12]:
# To create a matrix, pass a tuple describing its shape as the argument
np.ones((4,4)) # np.ones((rows,cols))

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

###### Create an array with linearly spaced data points

In [14]:
# Syntax: np.linspace(start, end, number of points)
# The output below contains both endpoints (0 and 10), with the 50 points evenly spaced between them
np.linspace(0,10,50) # Note: Returns a 1D array

array([ 0.        ,  0.20408163,  0.40816327,  0.6122449 ,  0.81632653,
        1.02040816,  1.2244898 ,  1.42857143,  1.63265306,  1.83673469,
        2.04081633,  2.24489796,  2.44897959,  2.65306122,  2.85714286,
        3.06122449,  3.26530612,  3.46938776,  3.67346939,  3.87755102,
        4.08163265,  4.28571429,  4.48979592,  4.69387755,  4.89795918,
        5.10204082,  5.30612245,  5.51020408,  5.71428571,  5.91836735,
        6.12244898,  6.32653061,  6.53061224,  6.73469388,  6.93877551,
        7.14285714,  7.34693878,  7.55102041,  7.75510204,  7.95918367,
        8.16326531,  8.36734694,  8.57142857,  8.7755102 ,  8.97959184,
        9.18367347,  9.3877551 ,  9.59183673,  9.79591837, 10.        ])

###### Create an identity matrix 

An identity matrix has the same number of rows and columns. It has ones across the diagonal and zeros in all other positions.

In [16]:
# Syntax: np.eye(n), where n is used as both the number of rows and the number of columns
np.eye(5) # Returns a 2D identity matrix with 5 rows and 5 columns

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

###### Creating arrays with random values


In [23]:
# Random decimals in [0,1)

# Syntax: np.random.rand(dim0, dim1, ..., dimn)
# No seed is set in the cells shown, so the values differ on every run
np.random.rand(4,5) # 4x5 array with random values in [0,1)

array([[0.56917302, 0.05407427, 0.52416213, 0.03847885, 0.80021745],
       [0.90001541, 0.14142558, 0.75115883, 0.50821146, 0.98504355],
       [0.54352542, 0.71966944, 0.1630343 , 0.91373674, 0.0627205 ],
       [0.78557284, 0.8020569 , 0.5747332 , 0.54500796, 0.09948941]])

In [24]:
# Multiplying the result by any number n rescales the values from [0,1) to [0,n)

# Multiply by 100 to get uniformly distributed random percentages
np.random.rand(4,5) * 100

array([[53.86741984, 36.4077494 ,  8.45523383, 39.42445448, 51.70756889],
       [32.12762217, 93.19454572,  6.74069396, 48.73559963,  6.30002572],
       [61.06957299, 59.67455017, 80.7914452 ,  3.69814625, 23.18610572],
       [ 2.24346398, 83.27183254, 19.90266183, 23.72061949,  0.38115931]])

In [26]:
# Random decimals drawn from the standard normal (Gaussian) distribution, which is centered around 0

# Syntax: np.random.randn(dim0, dim1, ..., dimn)
np.random.randn(7,4) # 7x4 array

array([[-0.15723575, -0.88253217,  0.34465247,  0.46600816],
       [-0.04879528,  2.06707059,  2.90109884,  0.61548412],
       [-1.46275379,  1.47455577,  0.45522763,  0.24555068],
       [ 0.63723559, -0.00855502,  0.55067381,  0.49089246],
       [ 0.19293276,  1.81941795,  1.08784421, -0.61137709],
       [-1.67671716,  0.56413566,  1.11119478, -0.50797801],
       [ 0.67374776,  0.8281145 , -0.14161263,  1.14110179]])

In [30]:
# Random integers in a range

# Syntax: np.random.randint(low, high, size)
# low is included and high is excluded, so this call draws integers from 1 to 99
np.random.randint(1,100,(10,5)) # size can be a number for a 1D vector or a tuple of dimensions for a 2D array, here (rows, cols)

array([[65, 29, 57, 26, 96],
       [98, 55, 29, 87, 13],
       [ 6, 59, 29, 20, 47],
       [76, 51, 91, 36, 52],
       [88, 51, 66, 24, 99],
       [ 5, 48, 75, 31, 88],
       [63, 35, 99, 25, 41],
       [19, 25, 51, 56, 33],
       [78, 77, 24, 66, 51],
       [ 7,  8,  5, 70,  7]])

##### Reshaping an array

In [32]:
# Draw 20 random integers into a 1D array
arr = np.random.randint(1, 100, size=20)
# Syntax: array.reshape(rows, cols)
# Show the same 20 values laid out as 4 rows by 5 columns; arr itself is not modified
arr.reshape((4, 5))

array([[27, 12, 33, 52, 34],
       [11, 46, 12, 23, 95],
       [92, 58, 39, 78, 30],
       [39, 52, 22, 17, 98]])

In [33]:
# Note: The total size of the new array must be unchanged
# 7 x 3 = 21 != 20, so NumPy rejects this reshape with a ValueError.
# The error is the point of this cell, so it is caught and its message printed
# rather than left uncaught, which would stop "Restart & Run All" at this cell.
try:
    arr.reshape(7,3)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: cannot reshape array of size 20 into shape (7,3)

###### Getting the maximum and minimum values in the array 

In [35]:
# Get the largest value found anywhere in the array
arr.max()

98

In [36]:
# Get the smallest value found anywhere in the array
arr.min()

11

In [37]:
# You can get the index of the maximum and minimum values in an array
# argmax returns the position of the maximum rather than the value itself
arr.argmax() # In the run recorded below, the maximum value 98 has index 19; arr is random, so this differs between runs

19

In [38]:
# argmin returns the position of the minimum rather than the value itself
arr.argmin() # In the run recorded below, the minimum value 11 has index 5; arr is random, so this differs between runs

5

###### Find out the shape of an array

In [40]:
# shape is a tuple with one entry per dimension
arr.shape # 1D Vector with 20 elements

(20,)

In [41]:
# reshape does not change arr in place, so the result is assigned back to arr to keep it
# From here on, arr refers to the 2D version
arr = arr.reshape(4,5) # arr is now a 2D array with 4 rows and 5 cols
arr.shape 

(4, 5)

#### Array indexing

Forward indexing counts from the start of the array: indices `0, 1, ..., n-1`

Backward indexing counts from the end of the array: indices `-n, ..., -2, -1`

In [73]:
# NOTE: this rebinds arr2, which previously held the 3x3 matrix, to a 1D vector of the integers 10 through 19
arr2 = np.arange(10,20)
print(arr2)
# Syntax: array[index]
arr2[3] # Get the element at index 3 (the fourth element, because indexing starts at 0)

[10 11 12 13 14 15 16 17 18 19]


13

In [48]:
print(arr)
# Syntax: array[row index][col index]
# With only the row index given, the whole row is returned
arr[3] # Get the 4th row

[[27 12 33 52 34]
 [11 46 12 23 95]
 [92 58 39 78 30]
 [39 52 22 17 98]]


array([39, 52, 22, 17, 98])

In [51]:
# Select the row first, then index into that row
arr[1][3] # Get the fourth item in the second row

23

In [81]:
# You can also use single bracket notation to index 2D arrays
# Syntax: array[row,col]
arr[1,3] # Same element as arr[1][3] above

23

###### Array Slicing

In [53]:
print(arr2)
# Syntax: array[start:stop] -- start is included, stop is not
arr2[3:7] # Get the 4th, 5th, 6th and 7th items (indices 3 through 6)

[10 11 12 13 14 15 16 17 18 19]


array([13, 14, 15, 16])

In [62]:
print(arr)
# A single slice on a 2D array selects whole rows
arr[1:3] # Get the second and third rows (row indices 1 and 2)

[[27 12 33 52 34]
 [11 46 12 23 95]
 [92 58 39 78 30]
 [39 52 22 17 98]]


array([[11, 46, 12, 23, 95],
       [92, 58, 39, 78, 30]])

In [63]:
# A negative start counts back from the end; leaving stop empty runs to the last item
arr2[-5:] # Get the last 5 items

array([15, 16, 17, 18, 19])

In [85]:
# You can grab subsections of arrays

# Grab the 2x2 top-right corner of arr
print(arr)
# Syntax: array[rowstart:rowend, colstart:colend]
arr[:2,-2:] # Grab the items in the first two rows and last two columns

[[27 12 33 52 34]
 [11 46 12 23 95]
 [92 58 39 78 30]
 [39 52 22 17 98]]


array([[52, 34],
       [23, 95]])

#### Broadcasting

You can change values in a NumPy array with broadcasting: a single value assigned to a slice is applied to every element of that slice.

In [75]:
print(arr2)
# Broadcast 100 to the elements at indices 4, 5 and 6 (the stop index 7 is excluded)
arr2[4:7] = 100
print(arr2)

[10 11 12 13 14 15 16 17 18 19]
[ 10  11  12  13 100 100 100  17  18  19]


In [77]:
# Note: Broadcasting on array slices affects the original array (the slice is just a view of the original array)

# Grab a slice of the array (indices 4, 5 and 6)
arr2_slice = arr2[4:7]
# Change the values in the slice to 47
# [:] assigns into the slice's existing elements rather than rebinding the name arr2_slice
arr2_slice[:] = 47
# Show the changed values in slice
print(arr2_slice)
# Values are also changed in original array
print(arr2)

[47 47 47]
[10 11 12 13 47 47 47 17 18 19]


In [80]:
# To leave the original array's values unchanged, work on an independent copy

# Syntax: array.copy()
# Slice first, then copy: only the elements that are needed get duplicated, and
# the result owns its data instead of being a view into a full-size temporary copy
copy_arr2 = arr2[7:].copy()
print(copy_arr2)
# Change the values in the copy
copy_arr2[:] = 42
# Values change in the copy
print(copy_arr2)
# Values don't change in the original
print(arr2)

[17 18 19]
[42 42 42]
[10 11 12 13 47 47 47 17 18 19]
