# Numpy Arrays

* Similar to lists, e.g., [1,2,3], but every element has the same data type
* Much **faster** than lists
* MATLAB 'like'
* Num = numerical, py = python
* Scientific computing

In [None]:
import numpy as np

In [None]:
# Define an array (list)
x = [1,2,3]

In [None]:
type(x)

list

In [None]:
# Numpy array
np.array([1,2,3])

array([1, 2, 3])

### Two-dimensional Arrays

In [None]:
np.array(([1,2,3],[4,5,6]))

array([[1, 2, 3],
       [4, 5, 6]])

# MATLAB 'like' Functions

In [None]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [None]:
# Only the value of the LAST expression in a cell is echoed, so running this
# cell displays just the 2 x 5 array; the first two results are discarded.
# NOTE(review): the recorded output below shows the np.ones(3) result instead,
# so it looks stale — re-run the cell to refresh it.
np.zeros(5)         # 1-D array of five 0.0's
np.ones(3)          # 1-D array of three 1.0's (shape (3,), not 1 x 3)
np.ones((2,5))      # 2 x 5 array

array([1., 1., 1.])

In [None]:
np.full((3,5),17)   # 3 x 5 array of 17's

array([[17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17],
       [17, 17, 17, 17, 17]])

In [None]:
# 3 x 5 array of 1's. Same shape and values as np.ones((3,5)), but the result
# here is an integer array, whereas np.ones returns floats by default.
np.full((3,5),1)

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [None]:
np.linspace(0,1,10)

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

In [None]:
np.arange(1,20)     # x = 1:19

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

### Reshaping and Resizing an array
`reshape` takes an array of N elements and returns it with shape (m, n), provided N = m*n (i.e., m and n are factors of N). `resize` changes the shape of the array in place.


In [None]:
x = np.arange(1,21).reshape(4,5)
x

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20]])

In [None]:
x.resize(2,10)
x

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]])

In [None]:
x.resize(1,20)
x

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20]])

In [None]:
# Flatten
flat = x.flatten()
flat

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

In [None]:
# flatten() returns a copy, so writing to `flat` leaves `x` unchanged.
# NOTE(review): the recorded output for `x` already contains 77, which is only
# assigned in the ravel cell further down — looks like out-of-order execution;
# restart the kernel and run all cells to refresh the outputs.
flat[0] = 100
print(flat)
print(x)

[100   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20]
[[ 1 77  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]]


In [None]:
# ravel() returns a view of the original data when it can (as it does here),
# so a write through `raveled` is also visible in `x`.
raveled = x.ravel()
print(raveled)

raveled[1] = 77     # writes into the buffer shared with x
print(raveled)
print(x)

[ 1 77  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
[ 1 77  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
[[ 1 77  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]]


### Performance Test: List v. Array

In [1]:
import random
import numpy as np

n = 100000
%timeit [random.randint(1,6) for i in range(n)]
%timeit np.random.randint(1,6,n)

1 loop, best of 5: 179 ms per loop
100 loops, best of 5: 2.11 ms per loop


#### Customize the `%timeit` function 
See page 247

#### Descriptive Statistics

In [None]:
x = np.array([1,2,3,4,5,6,7,8,9,10])

In [None]:
# A cell echoes only its last expression, so only x.var() is displayed here;
# give each call its own cell (or wrap it in print()) to see the other values.
x.min()
x.max()
x.sum()
x.mean()
x.std()     # population standard deviation (NumPy default ddof=0)
x.var()     # population variance (NumPy default ddof=0)

8.25

# Indexing and Slicing 
Create subarrays and access part of the array

In [None]:
x = np.array([1,2,3,4,5,6,7,8,9,10])

In [None]:
x[4]

5

In [None]:
# Extract elements from index 3 (the 4th element) to the end
x[3:]

array([ 4,  5,  6,  7,  8,  9, 10])

In [None]:
# Extract elements indexed 3 - 7
x[3:8]

array([4, 5, 6, 7, 8])

In [None]:
# Extract elements indexed 2, 5, and 8 by passing a list of indices.
x[ [2,5,8] ]

array([3, 6, 9])

#### Slicing Two-dimensional Arrays

In [None]:
# 3 x 5 example matrix: each row holds five consecutive integers, 1 through 15.
y = np.array([list(range(first, first + 5)) for first in (1, 6, 11)])
y

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

In [None]:
# Access elements
y[1,3]            # 2nd row, 4th column

9

In [None]:
y[0:1]          # 1st Row

array([[1, 2, 3, 4, 5]])

In [None]:
y[1:3]          # 2nd and 3rd row

array([[ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

In [None]:
z = np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
z


array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

In [None]:
z[:,2]

array([ 3,  8, 13])

In [None]:
z[0:2,2:3]

array([[3],
       [8]])

In [None]:
z = np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
z[:,2:3]

array([[ 3],
       [ 8],
       [13]])

In [None]:
z[:,2:4]

array([[ 3,  4],
       [ 8,  9],
       [13, 14]])

In [None]:
z[:,[2,4]]

array([[ 3,  5],
       [ 8, 10],
       [13, 15]])

In [None]:
z = np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
z

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

#### Transpose
Swap rows and columns

In [None]:
z.T

array([[ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14],
       [ 5, 10, 15]])

---

# Pandas Library
* **Pan**el **Da**ta
* Data frames - can contain heterogeneous data, i.e., columns of different types (vs. NumPy arrays, which hold a single type)
* Data organization and manipulation
* NumPy can be applied to data frames


#### Series
* One-dimensional collections
* Can contain missing data
* Can be arbitrarily indexed

In [None]:
import pandas as pd

In [None]:
grades = pd.Series([87,100,94])
grades

0     87
1    100
2     94
dtype: int64

In [None]:
# Repeated Series
pd.Series(5, range(3))

0    5
1    5
2    5
dtype: int64

#### Accessing elements

In [None]:
grades[2]

94

#### Descriptive Statistics


In [None]:
grades.describe()   # summary statistics or descriptive statistics
# in R: summary(grades)

count      3.000000
mean      93.666667
std        6.506407
min       87.000000
25%       90.500000
50%       94.000000
75%       97.000000
max      100.000000
dtype: float64

In [None]:
grades.mean()

93.66666666666667

In [None]:
grades.min()

87

In [None]:
grades.max()

100

#### Series with Arbitrary Indices

In [None]:
grades = pd.Series([97,100,94], index=['Joe', 'Bill', 'Carry'])
grades

Joe       97
Bill     100
Carry     94
dtype: int64

In [None]:
grades['Bill']

100

In [None]:
grades.Joe

97

### Series of Strings


In [None]:
import pandas as pd
# Series of strings labelled with a custom integer index running from 1 to 5.
hardware = pd.Series(
    data=['Hammer','Saw', 'Wrench', 'Paint', 'Nails'],
    index=list(range(1, 6)),
)

In [None]:
# view it
hardware

1    Hammer
2       Saw
3    Wrench
4     Paint
5     Nails
dtype: object

#### Employ string methods

https://www.programiz.com/python-programming/methods/string

In [None]:
hardware.str.contains('ai')

1    False
2    False
3    False
4     True
5     True
dtype: bool

In [None]:
hardware.str.islower()

1    False
2    False
3    False
4    False
5    False
dtype: bool

In [None]:
x = hardware.str.upper()
x.str.isupper()

1    True
2    True
3    True
4    True
5    True
dtype: bool

#### Combine NumPy and Pandas

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Fix the seed so the same "random" temperatures are produced on every run.
np.random.seed(567)
temps = np.random.randint(60,101,6)  # low (inclusive), high (exclusive), size -> 6 integers in 60..100
temps

array([63, 82, 65, 69, 76, 93])

In [None]:
temperatures = pd.Series(temps)

In [None]:
temperatures

0    63
1    82
2    65
3    69
4    76
5    93
dtype: int64

In [None]:
temperatures.min()

63

In [None]:
temperatures.describe()

count     6.000000
mean     74.666667
std      11.430952
min      63.000000
25%      66.000000
50%      72.500000
75%      80.500000
max      93.000000
dtype: float64

## Data Frames in Pandas

**Data frames** are two-dimensional arrays (like tables in Excel).  
* Row headers (indices) can be customized.  
* Each column is a _series_
* Support missing data
* Highly used in data science
* Concept borrowed from R's data frames
 

In [None]:
import pandas as pd

#### Create DataFrame from dictionary

In [None]:
# One key per student; each list holds that student's four scores in order.
grades_dictionary = dict(
    Alice=[87, 62, 95, 77],
    Bob=[100, 99, 82, 75],
    Charles=[65, 86, 58, 88],
)

In [None]:
grades_dictionary

{'Alice': [87, 62, 95, 77],
 'Bob': [100, 99, 82, 75],
 'Charles': [65, 86, 58, 88]}

In [None]:
grades = pd.DataFrame(grades_dictionary)
grades

Unnamed: 0,Alice,Bob,Charles
0,87,100,65
1,62,99,86
2,95,82,58
3,77,75,88


In [None]:
grades.describe()

Unnamed: 0,Alice,Bob,Charles
count,4.0,4.0,4.0
mean,80.25,89.0,74.25
std,14.221463,12.463279,15.019432
min,62.0,75.0,58.0
25%,73.25,80.25,63.25
50%,82.0,90.5,75.5
75%,89.0,99.25,86.5
max,95.0,100.0,88.0


### Customizing Index (Row headers)

* Set at creation time:
`grades = pd.DataFrame(grades_dictionary, index = ['A','B','C','D'])`

* Set after creation (see below)

In [None]:
grades.index = ['Homework', 'Quizzes','Exam 1', 'Exam 2']
grades

Unnamed: 0,Alice,Bob,Charles
Homework,87,100,65
Quizzes,62,99,86
Exam 1,95,82,58
Exam 2,77,75,88


### Accessing data from DataFrame

In [None]:
grades['Alice']

Homework    87
Quizzes     62
Exam 1      95
Exam 2      77
Name: Alice, dtype: int64

In [None]:
grades.Bob

Homework    100
Quizzes      99
Exam 1       82
Exam 2       75
Name: Bob, dtype: int64

In [None]:
grades['Bob']

Homework    100
Quizzes      99
Exam 1       82
Exam 2       75
Name: Bob, dtype: int64

### Access Data 
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html

* `loc` selects rows and columns with specific labels
* `iloc` selects rows and columns at specific integer positions

In [None]:
#create DataFrame
df = pd.DataFrame({'team': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'points': [5, 7, 7, 9, 12, 9, 9, 4],
                   'assists': [11, 8, 10, 6, 6, 5, 9, 12]},
                   index=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])

#view DataFrame
df

Unnamed: 0,team,points,assists
A,A,5,11
B,A,7,8
C,A,7,10
D,A,9,6
E,B,12,6
F,B,9,5
G,B,9,9
H,B,4,12


In [None]:
# Select rows with index labels 'E' and 'F'
df.loc[['E', 'F']]

Unnamed: 0,team,points,assists
E,B,12,6
F,B,9,5


In [None]:
# df.loc[row_labels, column_labels]
# Rows from label 'E' through the last row; columns from the first column
# through 'assists'. Unlike positional slices, label slices include BOTH ends.
df.loc['E': , :'assists']

Unnamed: 0,team,points,assists
E,B,12,6
F,B,9,5
G,B,9,9
H,B,4,12


In [None]:
# Select 'E' and 'F' rows and 'team' and 'assists' columns
df.loc[['E', 'F'], ['team', 'assists']]

Unnamed: 0,team,assists
E,B,6
F,B,5


In [None]:
# Select rows at integer positions 4 and 5 (the stop position 6 is excluded)
df.iloc[4:6]

In [None]:
# Select rows at positions 4-5 and columns at positions 0-1
# (iloc slices exclude the stop position, like ordinary Python slices)
df.iloc[4:6, 0:2]

Unnamed: 0,team,points
E,B,12
F,B,9


In [None]:
# Select rows from 4 through end of rows and columns up to third column
df.iloc[4: , :3]

Unnamed: 0,team,points,assists
E,B,12,6
F,B,9,5
G,B,9,9
H,B,4,12


### Boolean Indexing

In [None]:
grades[grades >=85]

Unnamed: 0,Alice,Bob,Charles
Homework,87.0,100.0,
Quizzes,,99.0,86.0
Exam 1,95.0,,
Exam 2,,,88.0


In [None]:
grades[(grades >80) & (grades < 90) ]

Unnamed: 0,Alice,Bob,Charles
Homework,87.0,,
Quizzes,,,86.0
Exam 1,,82.0,
Exam 2,,,88.0


In [None]:
grades.loc['Homework']

Alice       87
Bob        100
Charles     65
Name: Homework, dtype: int64

In [None]:
grades.iloc[1:3,0:2]

Unnamed: 0,Alice,Bob
Quizzes,62,99
Exam 1,95,82


In [None]:
grades.iloc[[0,2],0:2]

Unnamed: 0,Alice,Bob
Homework,87,100
Exam 1,95,82


In [None]:
grades.iloc[[0,3],[0,2]]

Unnamed: 0,Alice,Charles
Homework,87,65
Exam 2,77,88


In [None]:
grades.Alice

Homework    87
Quizzes     62
Exam 1      95
Exam 2      77
Name: Alice, dtype: int64