# Intro to Numpy and Pandas

## 1. NumPy: Array Indexing and Slicing

### 1.1: Creating and Displaying Arrays

In [None]:
import numpy as np

In [3]:
arr2d = np.array([[1,2,3],[4,5,6],[7,8,9]])

In [4]:
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [5]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

- We leave the first value (row) as just ':' to indicate that we want to consider all rows
- The second value (column) is 0:2 and grabs indicis 0 and 1 (index up to but not including 2)

In [6]:
arr2d[:,0:2]

array([[1, 2],
       [4, 5],
       [7, 8]])

In [7]:
arr2d[1:2, :2]

array([[4, 5]])

- 1:2 selects rows 2 and 3
- :2 selects columns 1 and 2
    - Could also just use 1
- Used together they grab the values at row index 1,2 and column index 0,1

- Rows 0,1
- Columns 1,2

In [8]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

## 2. Pandas: Series

### 2.1: Simple Series

In [1]:
from pandas import Series, DataFrame

In [2]:
import pandas as pd

In [3]:
obj = Series ([4,7,-5,3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

Display the values and index:

In [5]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

### 2.2: Price Series

In [8]:
price = Series([92600, 92400, 92100, 94300, 92300])

In [9]:
price

0    92600
1    92400
2    92100
3    94300
4    92300
dtype: int64

In [None]:
# See price of index 0
price[0]

### 2.3: Specifying Index
- Replace index numbers 0-4 with our own index
- In this case we will use dates

In [13]:
GOOL = Series([1251.00, 1231.80, 1209.95, 1198.89, 1171.00], index=['09/03/2018','09/04/2018','09/05/2018','09/06/2018','09/07/2018',])

In [14]:
GOOL

09/03/2018    1251.00
09/04/2018    1231.80
09/05/2018    1209.95
09/06/2018    1198.89
09/07/2018    1171.00
dtype: float64

### 2.4: Merging Formats
- Create two series with similar data but mixed indicis 
- Add these series and notice that similar indicis are summed even if they aren't in the same order

In [16]:
mine = Series([10,20,30], index = ['GOOL','AMZN','MS'])
yours = Series([10,50,20], index = ['AMZN','GOOL','MS'])

In [19]:
merge = mine + yours
merge

AMZN    30
GOOL    60
MS      50
dtype: int64

### 2.5: Creating Dataframe

In [3]:
from pandas import Series, DataFrame

Create normal dictionary with list as values

In [2]:
XYZ = {'open': [1165, 1110, 1120,1100],
      'high': [1210, 1180, 1120,1115],
      'low': [1160, 1105, 1090,1090],
      'close': [1190, 1160, 1100,1110]}

#### Convert XYZ to DataFrame:

In [4]:
Daily_XYZ = DataFrame(XYZ)

In [5]:
Daily_XYZ

Unnamed: 0,open,high,low,close
0,1165,1210,1160,1190
1,1110,1180,1105,1160
2,1120,1120,1090,1100
3,1100,1115,1090,1110


#### Create list and set it as index:

In [6]:
date = ['09/03/2018','09/04/2018','09/05/2018','09/06/2018']

In [7]:
Daily_XYZ = DataFrame(XYZ, index = date)

In [8]:
Daily_XYZ

Unnamed: 0,open,high,low,close
09/03/2018,1165,1210,1160,1190
09/04/2018,1110,1180,1105,1160
09/05/2018,1120,1120,1090,1100
09/06/2018,1100,1115,1090,1110


#### Retrieve Specific Column or Row

In [9]:
Daily_XYZ['high']

09/03/2018    1210
09/04/2018    1180
09/05/2018    1120
09/06/2018    1115
Name: high, dtype: int64

- Use .loc for label-based index 
- Use .iloc for positional index

In [10]:
Daily_XYZ.loc['09/03/2018']

open     1165
high     1210
low      1160
close    1190
Name: 09/03/2018, dtype: int64

## 3. Pandas: Cleaning Data


In [11]:
import numpy as np

In [12]:
import pandas as pd

### 3.1: Intro to NaN
- Values 0-14
- Reshape to 5 rows and 3 columns
- Change index to state names
- Name columns c1-c3
- Experiment adding NaNs and manipulating DF

In [13]:
df = pd.DataFrame(np.arange(0,15).reshape(5,3),
     index = ['Texas', 'Ohio', 'Wisconsin', 'Colorado', 'Utah'],
     columns = ['c1','c2','c3'])

In [14]:
df

Unnamed: 0,c1,c2,c3
Texas,0,1,2
Ohio,3,4,5
WIsconsin,6,7,8
Colorado,9,10,11
Utah,12,13,14


Add column, make all values NaN with numpy.nan:

In [15]:
df['c4'] = np.nan

In [16]:
df

Unnamed: 0,c1,c2,c3,c4
Texas,0,1,2,
Ohio,3,4,5,
WIsconsin,6,7,8,
Colorado,9,10,11,
Utah,12,13,14,


Create new index "New Mexico" and add values 15, 16, 17, 18

In [17]:
df.loc['New Mexico'] = np.arange(15,19)

In [18]:
df

Unnamed: 0,c1,c2,c3,c4
Texas,0,1,2,
Ohio,3,4,5,
WIsconsin,6,7,8,
Colorado,9,10,11,
Utah,12,13,14,
New Mexico,15,16,17,18.0


Add value to Texas c4

In [20]:
df['c4']['Texas'] = 20

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
df

Unnamed: 0,c1,c2,c3,c4
Texas,0,1,2,20.0
Ohio,3,4,5,
WIsconsin,6,7,8,
Colorado,9,10,11,
Utah,12,13,14,
New Mexico,15,16,17,18.0


Create new row Oklahoma and make all values NaN

In [23]:
df.loc['Oklahoma'] = np.nan

In [24]:
df

Unnamed: 0,c1,c2,c3,c4
Texas,0.0,1.0,2.0,20.0
Ohio,3.0,4.0,5.0,
WIsconsin,6.0,7.0,8.0,
Colorado,9.0,10.0,11.0,
Utah,12.0,13.0,14.0,
New Mexico,15.0,16.0,17.0,18.0
Oklahoma,,,,


Add one more column of NaN values

In [25]:
df['c5'] = np.nan

In [26]:
df

Unnamed: 0,c1,c2,c3,c4,c5
Texas,0.0,1.0,2.0,20.0,
Ohio,3.0,4.0,5.0,,
WIsconsin,6.0,7.0,8.0,,
Colorado,9.0,10.0,11.0,,
Utah,12.0,13.0,14.0,,
New Mexico,15.0,16.0,17.0,18.0,
Oklahoma,,,,,


### 3.2: Finding Nulls
- Use .isnull function to return a value of TRUE if values is NaN
- Use .sum to sum up number of nulls
- Use .notnull to return only values that aren't null

In [28]:
df.isnull()

Unnamed: 0,c1,c2,c3,c4,c5
Texas,False,False,False,False,True
Ohio,False,False,False,True,True
WIsconsin,False,False,False,True,True
Colorado,False,False,False,True,True
Utah,False,False,False,True,True
New Mexico,False,False,False,False,True
Oklahoma,True,True,True,True,True


Sum the nulls for each column with **.sum**

In [29]:
df.isnull().sum()

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

Sum of null sums

In [30]:
df.isnull().sum().sum()

15

Select non-NaN items in c4 with **.notnull**

In [31]:
df.c4[df.c4.notnull()]

Texas         20.0
New Mexico    18.0
Name: c4, dtype: float64

### 3.3: Using .dropna to Drop Nulls
- Use .dropna(how = 'all') to drop all nulls from rows and columns
- These exercises return results from DF but don't edit it

Drop NaN values in c4 with **.dropna**

In [32]:
df.c4.dropna()

Texas         20.0
New Mexico    18.0
Name: c4, dtype: float64

Drop nulls from the entire dataframe
- No rows left to return, so it's empty

In [34]:
df.dropna()

Unnamed: 0,c1,c2,c3,c4,c5


Drop row if entire row is null

In [36]:
df.dropna(how = 'all')

Unnamed: 0,c1,c2,c3,c4,c5
Texas,0.0,1.0,2.0,20.0,
Ohio,3.0,4.0,5.0,,
WIsconsin,6.0,7.0,8.0,,
Colorado,9.0,10.0,11.0,,
Utah,12.0,13.0,14.0,,
New Mexico,15.0,16.0,17.0,18.0,


Drop column if entire column is null
- Remember axis = 1 refers to column

In [37]:
df.dropna(how = 'all', axis = 1)

Unnamed: 0,c1,c2,c3,c4
Texas,0.0,1.0,2.0,20.0
Ohio,3.0,4.0,5.0,
WIsconsin,6.0,7.0,8.0,
Colorado,9.0,10.0,11.0,
Utah,12.0,13.0,14.0,
New Mexico,15.0,16.0,17.0,18.0
Oklahoma,,,,


### Example 3.4: Editing Copy of DataFrame

Make copy of DF with **.copy**

In [44]:
df2 = df.copy()

In [45]:
df2

Unnamed: 0,c1,c2,c3,c4,c5
Texas,0.0,1.0,2.0,20.0,
Ohio,3.0,4.0,5.0,,
WIsconsin,6.0,7.0,8.0,,
Colorado,9.0,10.0,11.0,,
Utah,12.0,13.0,14.0,,
New Mexico,15.0,16.0,17.0,18.0,
Oklahoma,,,,,


Replace Oklahoma nulls in column 1 and 3 with 0

df2.loc['Oklahoma'].c1 = 0
df2.loc['Oklahoma'].c3 = 0

In [47]:
df2

Unnamed: 0,c1,c2,c3,c4,c5
Texas,0.0,1.0,2.0,20.0,
Ohio,3.0,4.0,5.0,,
WIsconsin,6.0,7.0,8.0,,
Colorado,9.0,10.0,11.0,,
Utah,12.0,13.0,14.0,,
New Mexico,15.0,16.0,17.0,18.0,
Oklahoma,0.0,,0.0,,


Drop columns with any nulls at all using how = any
- Notice only two columns survive

In [48]:
df2.dropna(how = 'any', axis = 1)

Unnamed: 0,c1,c3
Texas,0.0,2.0
Ohio,3.0,5.0
WIsconsin,6.0,8.0
Colorado,9.0,11.0
Utah,12.0,14.0
New Mexico,15.0,17.0
Oklahoma,0.0,0.0


### Example 3.5: Interpolation

#### Use **.fillna** to replace nulls with value in argument

In [49]:
filled = df.fillna(0)

In [50]:
filled

Unnamed: 0,c1,c2,c3,c4,c5
Texas,0.0,1.0,2.0,20.0,0.0
Ohio,3.0,4.0,5.0,0.0,0.0
WIsconsin,6.0,7.0,8.0,0.0,0.0
Colorado,9.0,10.0,11.0,0.0,0.0
Utah,12.0,13.0,14.0,0.0,0.0
New Mexico,15.0,16.0,17.0,18.0,0.0
Oklahoma,0.0,0.0,0.0,0.0,0.0


Be careful when replacing with zeroes
- May affect some analysis like finding the mean of a dataset

#### Filling Forward and Backward with Method Option
- Start at (first/last) non-null and fill it (backwards/forwards)
- Overwrite all nulls (forward/backward) with that value until you reach next non-null
- Continue filling with that value
- Forward: ffill
- Backward: bfill

In [51]:
df.c4.fillna(method = 'ffill')

Texas         20.0
Ohio          20.0
WIsconsin     20.0
Colorado      20.0
Utah          20.0
New Mexico    18.0
Oklahoma      18.0
Name: c4, dtype: float64

In [52]:
df.c4.fillna(method = 'bfill')

Texas         20.0
Ohio          18.0
WIsconsin     18.0
Colorado      18.0
Utah          18.0
New Mexico    18.0
Oklahoma       NaN
Name: c4, dtype: float64

### 3.6: Applying Custom Function with .apply()

Define frame with 4 rows, 3 columns, and index as state name:

In [56]:
frame = DataFrame(np.random.randn(4,3),columns = list('bde'),
                 index = ['Texas', 'Ohio', 'Wisconsin', 'Colorado'])

In [57]:
frame

Unnamed: 0,b,d,e
Texas,0.700814,0.153611,-0.848665
Ohio,0.167884,2.051426,0.23318
Wisconsin,0.65539,0.948974,-0.949737
Colorado,-1.064042,-0.222048,-1.031234


- Create custom function that finds the min and max and returns the series in Min and Max rows
- This will calculate min and max for each column and return

In [58]:
def f(x):
    return Series([x.min(), x.max()], index = ['Min', 'Max'])

Use .apply() to use your function on the dataframe

In [59]:
frame.apply(f)

Unnamed: 0,b,d,e
Min,-1.064042,-0.222048,-1.031234
Max,0.700814,2.051426,0.23318


### 3.7: Using .describe() for Summary Statistics

In [61]:
frame.describe()

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,0.115012,0.732991,-0.649114
std,0.82222,1.005435,0.592918
min,-1.064042,-0.222048,-1.031234
25%,-0.140098,0.059696,-0.970111
50%,0.411637,0.551293,-0.899201
75%,0.666746,1.224587,-0.578203
max,0.700814,2.051426,0.23318


## 4. Grouping and Aggregating

### 4.1: Concatenate in Numpy vs. Pandas

Create a simple 3x4 array of numbers 0-11 and concatenate with itself:

In [63]:
import numpy

In [65]:
arr = numpy.arange(12).reshape(3,4)

In [66]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [67]:
numpy.concatenate([arr, arr], axis = 1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

- Concatenate with axis = 1
- Notice that arrays are placed side by side 

Create three series for Pandas Concat demo:

In [68]:
s1 = Series([0,1], index = ['a','b'])
s2 = Series([2,3,4], index = ['c','d','e'])
s3 = Series([5,6], index = ['f','g'])

By default pandas.concat is row based, uses axis = 0:

In [69]:
pd.concat([s1,s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

Change to axis = 1:

In [70]:
pd.concat([s1,s2, s3], axis = 1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


### 4.2 Grouping

In [73]:
# Load data
cars = pd.read_csv('Datasets/07_car.csv')

In [74]:
print(cars)

                    New    Category  Evaluation
0                   New       Sedan           1
1   Certified Pre-Owned         SUV           2
2                  Used  Cross-Over           2
3                   New         Van           3
4   Certified Pre-Owned  Cross-Over           4
5                  Used         Van           5
6                   New       Sedan           5
7                   New         SUV           5
8                   New         SUV           4
9                   New         SUV           1
10                 Used       Sedan           1
11                 Used       Sedan           2
12                 Used         Van           3
13                 Used  Cross-Over           4
14  Certified Pre-Owned         SUV           4
15  Certified Pre-Owned         SUV           3
16  Certified Pre-Owned         Van           2
17  Certified Pre-Owned  Cross-Over           3
18                  New       Sedan           4
19                  New       Sedan     

In [75]:
# Group by 'Category' in CSV
grouped = cars.groupby('Category')

In [76]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8cd08b1690>

To actually look inside group, we need to use other methods:

In [79]:
# Display number of groups
grouped.ngroups

4

In [80]:
grouped.groups

{'Cross-Over': Int64Index([2, 4, 13, 17, 22], dtype='int64'),
 'SUV': Int64Index([1, 7, 8, 9, 14, 15], dtype='int64'),
 'Sedan': Int64Index([0, 6, 10, 11, 18, 19, 20], dtype='int64'),
 'Van': Int64Index([3, 5, 12, 16, 21], dtype='int64')}

Shows key as each group name and their indexes, a collection of named groups

In [81]:
# Using loop to see group details

def print_groups(groupobject):
    # Loop over all groups
    # Print group name and details
    for name, group in groupobject:
        print(name)
        print(group)

In [84]:
# Pass group to new function

print_groups(grouped)

Cross-Over
                    New    Category  Evaluation
2                  Used  Cross-Over           2
4   Certified Pre-Owned  Cross-Over           4
13                 Used  Cross-Over           4
17  Certified Pre-Owned  Cross-Over           3
22                 Used  Cross-Over           2
SUV
                    New Category  Evaluation
1   Certified Pre-Owned      SUV           2
7                   New      SUV           5
8                   New      SUV           4
9                   New      SUV           1
14  Certified Pre-Owned      SUV           4
15  Certified Pre-Owned      SUV           3
Sedan
     New Category  Evaluation
0    New    Sedan           1
6    New    Sedan           5
10  Used    Sedan           1
11  Used    Sedan           2
18   New    Sedan           4
19   New    Sedan           4
20   New    Sedan           5
Van
                    New Category  Evaluation
3                   New      Van           3
5                  Used      Van          

### 4.3 Aggregating

Using previous groupby() DF

In [85]:
# Find size of each category

grouped.size()

Category
Cross-Over    5
SUV           6
Sedan         7
Van           5
dtype: int64

In [86]:
# Grab head of grouped
# Remember grouped is grouping of categories
# In this case we are grabbing head of categories, not rows

grouped.head(3)

Unnamed: 0,New,Category,Evaluation
0,New,Sedan,1
1,Certified Pre-Owned,SUV,2
2,Used,Cross-Over,2
3,New,Van,3
4,Certified Pre-Owned,Cross-Over,4
5,Used,Van,5
6,New,Sedan,5
7,New,SUV,5
8,New,SUV,4
10,Used,Sedan,1


#### 4.3.1 Adding an extra column to group:

In [88]:
grouped2 = cars.groupby(['Category','New'])

In [89]:
print_groups(grouped2)

('Cross-Over', 'Certified Pre-Owned')
                    New    Category  Evaluation
4   Certified Pre-Owned  Cross-Over           4
17  Certified Pre-Owned  Cross-Over           3
('Cross-Over', 'Used')
     New    Category  Evaluation
2   Used  Cross-Over           2
13  Used  Cross-Over           4
22  Used  Cross-Over           2
('SUV', 'Certified Pre-Owned')
                    New Category  Evaluation
1   Certified Pre-Owned      SUV           2
14  Certified Pre-Owned      SUV           4
15  Certified Pre-Owned      SUV           3
('SUV', 'New')
   New Category  Evaluation
7  New      SUV           5
8  New      SUV           4
9  New      SUV           1
('Sedan', 'New')
    New Category  Evaluation
0   New    Sedan           1
6   New    Sedan           5
18  New    Sedan           4
19  New    Sedan           4
20  New    Sedan           5
('Sedan', 'Used')
     New Category  Evaluation
10  Used    Sedan           1
11  Used    Sedan           2
('Van', 'Certified Pre-Own

#### 4.3.2 Using .agg() Function

In [90]:
import numpy

In [91]:
grouped2.agg(np.mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,Evaluation
Category,New,Unnamed: 2_level_1
Cross-Over,Certified Pre-Owned,3.5
Cross-Over,Used,2.666667
SUV,Certified Pre-Owned,3.0
SUV,New,3.333333
Sedan,New,3.8
Sedan,Used,1.5
Van,Certified Pre-Owned,2.0
Van,New,4.0
Van,Used,4.0


Use NumPy .agg() to aggregate the means of the groups

In [92]:
grouped2.agg([np.sum, np.std])

Unnamed: 0_level_0,Unnamed: 1_level_0,Evaluation,Evaluation
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,std
Category,New,Unnamed: 2_level_2,Unnamed: 3_level_2
Cross-Over,Certified Pre-Owned,7,0.707107
Cross-Over,Used,8,1.154701
SUV,Certified Pre-Owned,9,1.0
SUV,New,10,2.081666
Sedan,New,19,1.643168
Sedan,Used,3,0.707107
Van,Certified Pre-Owned,2,
Van,New,8,1.414214
Van,Used,8,1.414214


Passing multiple functions (sum and std) as list to agg function