<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/dataframes_03_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataFrames Part 3, Data Extraction
- .iloc[], .loc[]

## First lesson; set up

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [2]:
# read the James Bond films CSV straight from GitHub; one row per bond movie
bond = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/jamesbond.csv'
)

# preview the first five rows
bond.head(5)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [3]:
# non-null counts and data type of every column
bond.info()
  # box office, budget, and actor salary are in millions
  # Bond Actor Salary is the only column with missing values (18 of 26 non-null)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Film               26 non-null     object 
 1   Year               26 non-null     int64  
 2   Actor              26 non-null     object 
 3   Director           26 non-null     object 
 4   Box Office         26 non-null     float64
 5   Budget             26 non-null     float64
 6   Bond Actor Salary  18 non-null     float64
dtypes: float64(3), int64(1), object(3)
memory usage: 1.5+ KB


In [4]:
# express the non-null counts reported by .info() as a fraction of all rows
(
    bond
      .notna()                  # True wherever a value is present
      .sum(axis = 0)            # per-column count of non-null values
      .div(len(bond))           # fraction (0 to 1) of rows that are non-null
)

Film                 1.000000
Year                 1.000000
Actor                1.000000
Director             1.000000
Box Office           1.000000
Budget               1.000000
Bond Actor Salary    0.692308
dtype: float64

## .set_index() and .reset_index() methods

In [5]:
# current axes: a list holding the row index first, then the column index
bond.axes

[RangeIndex(start=0, stop=26, step=1),
 Index(['Film', 'Year', 'Actor', 'Director', 'Box Office', 'Budget',
        'Bond Actor Salary'],
       dtype='object')]

In [6]:
# current row axis
bond.index

RangeIndex(start=0, stop=26, step=1)

In [7]:
# one row per Film
bond.head()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [8]:
# use the Film column as the row index; the result is a new DataFrame
# the same can be done at load time with the index_col parameter of pd.read_csv()
bond.set_index('Film')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,


In [9]:
# round trip: promote Film to the index, then move it back to a regular column
bond.set_index('Film').reset_index()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
5,You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
6,On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
7,Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
8,Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
9,The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,


In [10]:
# for .reset_index(), the boolean drop parameter (drop = True) discards the index entirely instead of restoring it as a column

In [11]:
# index the films by release Year instead of the default integer index
bond_year_index = bond.set_index(keys = 'Year')

# preview the first five rows
bond_year_index.head(5)

Unnamed: 0_level_0,Film,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962,Dr. No,Sean Connery,Terence Young,448.8,7.0,0.6
1963,From Russia with Love,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Goldfinger,Sean Connery,Guy Hamilton,820.4,18.6,3.2
1965,Thunderball,Sean Connery,Terence Young,848.1,41.9,4.7
1967,Casino Royale,David Niven,Ken Hughes,315.0,85.0,


## .loc[ ] is neither an attribute nor a method; it's a property
- loc stands for label location; with the default RangeIndex, the integer positions are themselves the labels
- Slices are inclusive of the last value, unlike .iloc[]
- Better to do .sort_index() first for optimization; label lookups and slices are faster on a sorted index

In [12]:
# reload the data so this section starts from the default integer index
bond = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/jamesbond.csv'
)

In [13]:
bond.head()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [14]:
# rows labelled 0 thru 5 (0:5; the end label 5 is included), all columns (:)
bond.loc[0:5,:]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
5,You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [15]:
# Film becomes the index, rows and columns are both put in alphabetical order,
# then every row from GoldenEye thru to the end is kept
(
    bond
      .set_index('Film')                  # Film is now the index, no longer a column
      .sort_index(axis = 'index')         # rows alphabetically by Film (ascending is the default)
      .sort_index(axis = 'columns')       # columns alphabetically by name
      .loc['GoldenEye':]                  # open-ended label slice; all columns are returned
)

Unnamed: 0_level_0,Actor,Bond Actor Salary,Box Office,Budget,Director,Year
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GoldenEye,Pierce Brosnan,5.1,518.5,76.9,Martin Campbell,1995
Goldfinger,Sean Connery,3.2,820.4,18.6,Guy Hamilton,1964
Licence to Kill,Timothy Dalton,7.9,250.9,56.7,John Glen,1989
Live and Let Die,Roger Moore,,460.3,30.8,Guy Hamilton,1973
Moonraker,Roger Moore,,535.0,91.5,Lewis Gilbert,1979
Never Say Never Again,Sean Connery,,380.0,86.0,Irvin Kershner,1983
Octopussy,Roger Moore,7.8,373.8,53.9,John Glen,1983
On Her Majesty's Secret Service,George Lazenby,0.6,291.5,37.3,Peter R. Hunt,1969
Quantum of Solace,Daniel Craig,8.1,514.2,181.4,Marc Forster,2008
Skyfall,Daniel Craig,14.5,943.5,170.2,Sam Mendes,2012


In [16]:
# integer position is also a label: with the default RangeIndex the rows are labelled 0 thru 25
# column labels can be sliced too; 'Film':'Actor' includes the end label Actor
bond.loc[20:, 'Film':'Actor']

Unnamed: 0,Film,Year,Actor
20,The World Is Not Enough,1999,Pierce Brosnan
21,Die Another Day,2002,Pierce Brosnan
22,Casino Royale,2006,Daniel Craig
23,Quantum of Solace,2008,Daniel Craig
24,Skyfall,2012,Daniel Craig
25,Spectre,2015,Daniel Craig


In [17]:
# Get back info about Goldfinger film
# Film is made the index, then the single label is looked up across all columns;
# one matching row comes back as a Series
bond.set_index('Film').loc['Goldfinger']

Year                         1964
Actor                Sean Connery
Director             Guy Hamilton
Box Office                  820.4
Budget                       18.6
Bond Actor Salary             3.2
Name: Goldfinger, dtype: object

In [22]:
# index labels don't need to be unique:
# two rows share the Film title Casino Royale (1967 and 2006)
(
    bond
      .loc[bond['Film'].eq('Casino Royale')]
      .set_index('Film')
)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3


In [20]:
# with Film as the index, .loc[] on a duplicated label returns every matching row as a DataFrame
bond.set_index('Film').loc['Casino Royale']

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3


In [42]:
# .loc[] supports list-style slicing, including a step
(
    bond
      .set_index('Film')
      .sort_index()                                # sort films alphabetically
      .loc['Die Another Day':'Skyfall':2]          # start:stop:step; a step of 2 keeps every other row (default step is 1)
)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1


In [44]:
# Film as an alphabetically sorted index; keep everything from the first film
# up through Die Another Day (the end label is included)
bond.set_index('Film').sort_index().loc[:'Die Another Day', :]


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [45]:
# set Film as index, sort alphabetically, extract all Films from Die Another Day to end
# the trailing semicolon suppresses the cell's displayed output
# (no dummy variable assignment is needed for that)
(
    bond
      .set_index('Film')
      .sort_index()
      .loc['Die Another Day':]
);

In [46]:
# pick specific films by passing a list of labels; rows come back in the order listed
(
    bond
      .set_index('Film')
      .sort_index()
      .loc[['Spectre', 'GoldenEye'], :]      # every label in the list must exist; otherwise error
)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1


In [50]:
# check if a Film is in dataset
# gotcha: `in` on a Series tests the index labels (here 0 thru 25), not the values,
# so this evaluates to False even though Dr. No is a film in the data
# to test the values instead: 'Dr. No' in bond['Film'].values
'Dr. No' in bond['Film']

False

In [51]:
# another way to do it: count the rows whose Film equals 'Dr. No'
# (a count above 0 means the film is present)
# .sum() is the vectorized Series method; the built-in sum() loops over the elements one by one
(bond['Film'] == 'Dr. No').sum()

1

In [52]:
# for comparison: `in` on a plain Python list does test the elements
'apple' in ['apple', 'banana', 'pear']

True

## .iloc[ ] accessor

In [54]:
# reload the data so this section starts from the default integer index
bond = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/jamesbond.csv'
)

# preview the first five rows
bond.head(5)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [56]:
# extract even index positions below 10 (start:stop:step -> positions 0, 2, 4, 6, 8)
bond.iloc[:10:2]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
6,On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
8,Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,


In [57]:
# extract data in index position 0; a single position returns that row as a Series
bond.iloc[0]

Film                        Dr. No
Year                          1962
Actor                 Sean Connery
Director             Terence Young
Box Office                   448.8
Budget                         7.0
Bond Actor Salary              0.6
Name: 0, dtype: object

In [58]:
# extract data in index positions 15, 21, and 7; rows come back in the order listed
bond.iloc[[15, 21, 7]]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
15,A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
21,Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
7,Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8


In [60]:
# extract data in index positions 4 thru 8, inclusive; the stop position 9 itself is excluded
bond.iloc[4:9]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
5,You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
6,On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
7,Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
8,Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,


In [61]:
# note that unlike .loc[], .iloc[] excludes the last/right side

In [62]:
# from index position 20 to the end (no stop position given)
bond.iloc[20:]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
20,The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5
21,Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
22,Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
23,Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
24,Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
25,Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


In [63]:
# from the beginning up to index position 4; position 4 itself is excluded (rows 0 thru 3)
bond.iloc[:4]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


In [69]:
# string index labels still have an integer position, so .iloc[] works on them
(
    bond
      .set_index('Film')
      .sort_index()
      .iloc[0, :]                   # position 0 holds the label 'A View to a Kill'
      #.loc['A View to a Kill']     # label-based lookup of the same row
)

Year                        1985
Actor                Roger Moore
Director               John Glen
Box Office                 275.2
Budget                      54.5
Bond Actor Salary            9.1
Name: A View to a Kill, dtype: object

## Second arguments in .loc[ ] and .iloc []

In [70]:
# reload the data so this section starts from the default integer index
bond = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/jamesbond.csv'
)

# preview the first five rows
bond.head(5)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
