Lists & Arrays: Notes from Python Data Science Handbook https://jakevdp.github.io/PythonDataScienceHandbook/02.01-understanding-data-types.html

In [1]:
import numpy
numpy.__version__

'2.3.3'

In [None]:
L=list(range(10))
L
type(L[0])

In [None]:
L2 = [str(c) for c in L] #iterates through each elementin L and converts them to individual strings
L2
type(L2[0])

In [None]:
L3 = [True,"2",3.0,4]
[type(item) for item in L3]

In [2]:
import numpy as np
import array
L=list(range(10)) #creating list 1-10
A=array.array('i',L) #creating array of list items in integer type
A

array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
#integer array
np.array([1,4,2,5,3])

In [None]:
np.array([1,3,4],dtype='float32')

In [None]:
#nested lists result in multi-dimensional arrays
np.array([range(i,i+3) for i in [2,4,6]])

In [None]:
#create a length-10 integer array filled with zeros
np.zeros(10,dtype=int)

In [None]:
#create a 3x5 floating-point array filled with ones
np.ones((3,5), dtype=float)

In [None]:
#create a 3x5 array filled with 3.14
np.full((3,5),3.14)

In [None]:
#create an array filled with a linear sequence
#starting a 0, ending at 20, stepping by 2
# this is similar to the built-in range() function
np.arange(0,20,2)
#so the way this is formatted seems to be start,end,step

In [None]:
#create a 3x3 array of uniforly distributed random values between 0 and 1
np.random.random((3,3))

In [3]:
#create a 3x3 array of normally distributed random values with mean 0 and start deviation 1
np.random.normal(0,1, (3,3))

array([[-0.61741846, -0.26455021, -1.42341763],
       [ 0.7299385 , -0.25886412, -0.09033596],
       [ 0.474242  ,  0.51195458,  0.34975951]])

In [4]:
#create a 3x3 array of random intergers in the interval [0,10]
np.random.randint(0,10, (3,3))

array([[3, 2, 6],
       [2, 6, 7],
       [8, 8, 2]])

In [5]:
#create a 3x3 identity matrix
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [6]:
#create an unitizalied array of three integers, the values will be whatever happens to already exist at that memory location
np.empty(3)

array([1., 1., 1.])

Types of different datatypes: *used for dtype=*

Data type	Description
bool_	Boolean (True or False) stored as a byte
int_	Default integer type (same as C long; normally either int64 or int32)
intc	Identical to C int (normally int32 or int64)
intp	Integer used for indexing (same as C ssize_t; normally either int32 or int64)
int8	Byte (-128 to 127)
int16	Integer (-32768 to 32767)
int32	Integer (-2147483648 to 2147483647)
int64	Integer (-9223372036854775808 to 9223372036854775807)
uint8	Unsigned integer (0 to 255)
uint16	Unsigned integer (0 to 65535)
uint32	Unsigned integer (0 to 4294967295)
uint64	Unsigned integer (0 to 18446744073709551615)
float_	Shorthand for float64.
float16	Half precision float: sign bit, 5 bits exponent, 10 bits mantissa
float32	Single precision float: sign bit, 8 bits exponent, 23 bits mantissa
float64	Double precision float: sign bit, 11 bits exponent, 52 bits mantissa
complex_	Shorthand for complex128.
complex64	Complex number, represented by two 32-bit floats
complex128	Complex number, represented by two 64-bit floats

In [11]:
#numpy array attributes:
import numpy as np
np.random.seed(0) #seed for reporoducibility

x1 = np.random.randint(10,size=6) #1D array
x2 = np.random.randint(10, size=(3,4)) #2D array
x3 = np.random.randint(10, size=(3,4,5)) #3D array

#each array has attributes ndim (# of dimension), shape (size of each dimension, and size(total size of the array):

print("x3 ndim:" ,x3.ndim)
print("x3 shape:", x3.shape)
print("x3 size:", x3.size)

#another useful attribute ie dtype (data type of the array)
print("dtype:",x3.dtype)

x3 ndim: 3
x3 shape: (3, 4, 5)
x3 size: 60
dtype: int64


In [16]:
#Array indexing: Acessing single elements:

#in 1D array, ith value (counting from 0) can be accessed by specifying desired index in square brackets
x1

x1[0]

x1[4]

#similarly, you can use negative indices:

x1[-1]

np.int64(9)

In [17]:
#in multidimensional array, items can be accessed using a comms-seperated tuple of indices
x2

array([[3, 5, 2, 4],
       [7, 6, 8, 8],
       [1, 6, 7, 7]])

In [None]:
x2[0,0]
x2[2,0]
x2[2,-1]

#you can also use this notation to modify values:
x2[0,0] = 12
x2

#keep in mind that numpy arrays have a fixed type - will always be an integer array (for example) so inserting a float will truncate to decimal part

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [None]:
#Array Slicing: Accessing subarrays using :

#notation: x[start:stop:step]. Without specifying, it will default to start=0, stop = size of dimension, step =1.

#one dimensional arrays:
x = np.arange(10)
x

x[:5] #first five elements

x[5:] #elements after index 5
x[4:7] #elements from index 4 to 6
x[::2] #every other element
x[1::2] #every other element, starting at index 1

#we can use negative step to reverse start and stop. If we wanted to reverse an array:
x[::-1] #all elements, reversed
x[5::-2] #reversed every other from index 5





array([1, 3, 5, 7, 9])

In [26]:
#slicing multi-dimensional arrays:
x2

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [27]:
x2[:2, :3] #two rows, three columns

array([[12,  5,  2],
       [ 7,  6,  8]])

In [28]:
x2[:3, ::2] #all rows, every other column

array([[12,  2],
       [ 7,  8],
       [ 1,  7]])

In [29]:
#accessing array rows and columns:
#to do this, combine both indexing and slicing, sing an empty slice marked by a single conon (:):
print(x2[:,0]) #first column of x2 (because row is empty)
print(x2[0,:]) #first row of x2 (because column is empty)

#but for row access, the empty slice can be omitted for a more compact syntax:
print(x2[0]) #basicaly is equivalent to x2[0,:]


[12  7  1]
[12  5  2  4]
[12  5  2  4]


In [31]:
#sub-arrays as no-copy view, array slices return views rather than copies of array data (numpy array slicing differs from python list slicing in this way). In lists, slices will be copies. 

#ex:

print(x2)

#exctract 2x2 array from this:

x2_sub=x2[:2, :2]
print(x2_sub)

[[12  5  2  4]
 [ 7  6  8  8]
 [ 1  6  7  7]]
[[12  5]
 [ 7  6]]


In [32]:
#modifying the slice modifies the original:
x2_sub[0,0]=99
print(x2_sub)

[[99  5]
 [ 7  6]]


In [33]:
print(x2)

[[99  5  2  4]
 [ 7  6  8  8]
 [ 1  6  7  7]]


In [34]:
#THIS IS WHY WE NEED TO CREATE COPIES OF ARRAYS - to instead use excplicit copy of the data so that modifying doesn't modify the original
x2_sub_copy=x2[:2,:2].copy()
print(x2_sub_copy)

[[99  5]
 [ 7  6]]


In [None]:
#Reshaping Arrays:
#ex: put nmbrs 1-9 into a 3x3 grid:

grid = np.arange(1,10).reshape((3,3))
print(grid)

#this will only work if initial array maches size of reshaped array. reshape uses no0-copy view of the original array whenever possible.

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [39]:
#reshaping one-D array into two-D row or column mantrix: Can be more easily done using newaxis
x=np.array([1,2,3])

#row vector via reshape
x.reshape((1,3))

#row vector via newaxis
x[np.newaxis,:]

#column vector via reshape
x.reshape((3,1))

#column vector via newaxis
x[:,np.newaxis]

array([[1],
       [2],
       [3]])

In [43]:
#ARRAY CONCATENATION AND SPLITTING:

x=np.array([1,2,3])
y = np.array([3,2,1])
np.concatenate([x,y])

#concattenating more than two arrays at once:
z=([99,99,99])
print(np.concatenate([x,y,z]))

[ 1  2  3  3  2  1 99 99 99]


In [None]:
#two dimensional arrays:
grid=np.array([[1,2,3],[4,5,6]])
np.concatenate([grid,grid]) #concatenate along first axis (rows) *aka, esentially 0 index so vertical stacking

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [None]:
#concatenate along the second axis (columns) (aka 1st axis):
np.concatenate([grid,grid], axis=1)

#for working with arrays of mixed dimensions, it can be clearer to use np.vstack and np.hstack functions. Splitting amongst a 3rd dimension, you can use np.dstack.

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [None]:
x = np.array([1,2,3])
grid = np.array([[9,8,7],[6,5,4]])

#vertically stack the arrays
np.vstack([grid, grid])

array([[1, 2, 3],
       [9, 8, 7],
       [6, 5, 4]])

In [52]:
#horizontally stack the arrays:
y = np.array([[99],[99]])
np.hstack([grid,y])

array([[ 9,  8,  7, 99],
       [ 6,  5,  4, 99]])

In [None]:
#SPLITTING OF ARRAYS:

x = [1,2,3,99,99,3,2,1]
x1,x2,x3 = np.split(x, [3,5])
print(x1,x2,x3)

#meaning splitting at index 3 and 5, so first array is up to index 3 (not including), second array is from index 3 to 5 (not including), third array is from index 5 to end

grid = np.arange(16).reshape((4, 4))
grid

upper,lower = np.vsplit(grid, [2])
print(upper)
print(lower)

left,right = np.hsplit(grid, [2])
print(left)
print(right)

#vertical = vertical stacking axis
#horizontal = horizontal stacking axis

[1 2 3] [99 99] [3 2 1]
[[0 1 2 3]
 [4 5 6 7]]
[[ 8  9 10 11]
 [12 13 14 15]]
[[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]]
[[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]]


In [63]:
#Array Arithmetic:

x = np.array(4)
print("x     =", x)
print("x + 5 =", x + 5)
print("x - 5 =", x - 5)
print("x * 2 =", x * 2)
print("x / 2 =", x / 2)
print("x // 2 =", x // 2)  # floor division

x     = 4
x + 5 = 9
x - 5 = -1
x * 2 = 8
x / 2 = 2.0
x // 2 = 2


In [64]:
#unatry ufunc for negation, and ** operator for exponentation, and % operator for modulus:
print("-x     = ", -x)
print("x ** 2 = ", x ** 2)
print("x % 2  = ", x % 2)

-x     =  -4
x ** 2 =  16
x % 2  =  0


however, all of these arithmetic operations are simply wrappers around specifi functions build into

+	np.add	Addition (e.g., 1 + 1 = 2)
-	np.subtract	Subtraction (e.g., 3 - 2 = 1)
-	np.negative	Unary negation (e.g., -2)
*	np.multiply	Multiplication (e.g., 2 * 3 = 6)
/	np.divide	Division (e.g., 3 / 2 = 1.5)
//	np.floor_divide	Floor division (e.g., 3 // 2 = 1)
**	np.power	Exponentiation (e.g., 2 ** 3 = 8)
%	np.mod	Modulus/remainder (e.g., 9 % 4 = 1)


In [66]:
#Absolute value:

x = np.array([-2,-1,0,1,2])
abs(x)

#or, np.absolute/np.abs

np.absolute(x)
np.abs(x)

array([2, 1, 0, 1, 2])

In [None]:
#Trigonometric functions & exponents and logarithms and aggregates for binary ufuncs and outer products not as useful but if needed: look here 

# https://jakevdp.github.io/PythonDataScienceHandbook/02.03-computation-on-arrays-ufuncs.html

In [None]:
#Specifying Output: #numpy has the out function to specify where to store result of ufunc

x = np.arange(5)
y = np.empty(5)
np.multiply(x,10,out=y)
print(y)

[ 0. 10. 20. 30. 40.]


In [None]:
#summing the values in an array:

L = np.random.random(100)
sum(L)

#OR WE CAN DO NP.SUM and get the same value. Numpy's calculation is much faster 

np.sum(L)

np.float64(52.139025744667336)

In [72]:
#min & max:

np.min(big_array)
np.max(big_aray)

NameError: name 'big_array' is not defined

Other aggregate functions:
most aggregates have a NaN-safe counterpart

Function Name	NaN-safe Version	Description
np.sum	np.nansum	Compute sum of elements
np.prod	np.nanprod	Compute product of elements
np.mean	np.nanmean	Compute mean of elements
np.std	np.nanstd	Compute standard deviation
np.var	np.nanvar	Compute variance
np.min	np.nanmin	Find minimum value
np.max	np.nanmax	Find maximum value
np.argmin	np.nanargmin	Find index of minimum value
np.argmax	np.nanargmax	Find index of maximum value
np.median	np.nanmedian	Compute median of elements
np.percentile	np.nanpercentile	Compute rank-based statistics of elements
np.any	N/A	Evaluate whether any elements are true
np.all	N/A	Evaluate whether all elements are true


In [None]:
#Example: What is the average heigh of US Presidents?

