# Section 5: DataFrames III: Data Extraction

In [2]:
import pandas as pd

### Intro to the DataFrames III Module + Import Dataset

In [4]:
#load the james bond csv; with no index_col the df gets the default numeric index
bond = pd.read_csv('./data/jamesbond.csv')

In [5]:
#preview the first five rows
bond.head()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [6]:
#preview the last five rows
bond.tail()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
21,Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
22,Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
23,Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
24,Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
25,Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


### Use the set_index and reset_index methods to define a new DataFrame index

In [7]:
#reload the csv so this section starts from the default numeric index
bond = pd.read_csv('./data/jamesbond.csv')
bond.head(3)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [8]:
#set_index and reset_index let you assign a new index to your df or reset the index back to the standard numeric index

In [9]:
#here we want to set the index of the df to an already existing column
bond.set_index(keys = 'Film', inplace = True)
#this gives the same result as bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film')
bond.head(3)
#the film column has now moved to the index and is no longer one of the regular columns of the df.
#note: re-running this cell raises a KeyError because 'Film' is no longer a column.

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [10]:
#can go back to the standard numeric index using reset_index
bond.reset_index(drop = True) #returns a NEW df with the film index thrown away; the result is not stored, so bond itself is unchanged
bond.reset_index(drop = False, inplace = True) #this pushes the film index back into the df as a regular column and restores the numeric index on bond itself
bond.head()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [11]:
#what if we have an existing index we want to replace with another column,
#but we dont want to lose the initial index?
#set up that situation first by making film the index again
bond.set_index('Film', inplace = True)
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [12]:
#want to replace the index with the Year column
#bond.set_index('Year') 
#calling this directly would overwrite the current Film index and discard its values from the df.

In [13]:
#to keep film, first move it back into the columns (reset_index restores the numeric index),
#then promote year to the index (set_index); chained so each step hands its result to the next
bond = bond.reset_index().set_index('Year')
bond.head(3)

Unnamed: 0_level_0,Film,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962,Dr. No,Sean Connery,Terence Young,448.8,7.0,0.6
1963,From Russia with Love,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Goldfinger,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### Retrieve Rows by Index Label using the .loc[] Accessor

In [14]:
#read the csv with film as the index and sort that index in one chained expression
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [15]:
#.loc[] is used to access a row by its index label
bond.loc['Goldfinger']
#returns a new series with the values from the goldfinger row (not displayed: only the last expression of a cell is shown)

bond.loc['GoldenEye']

Year                            1995
Actor                 Pierce Brosnan
Director             Martin Campbell
Box Office                     518.5
Budget                          76.9
Bond Actor Salary                5.1
Name: GoldenEye, dtype: object

In [16]:
bond.loc['Casino Royale']
#if more than one index label matches, .loc[] returns a new df (one row per match) instead of a series

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [17]:
#the .loc[] accessor supports a lot of the slicing operations you can do with a list.
#can select all rows from one movie to another movie
bond.loc['Diamonds Are Forever':'From Russia with Love']
#unlike list slicing, the end label of a .loc[] slice is included in the result

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6


In [18]:
bond.loc['Diamonds Are Forever':'From Russia with Love':2]
#a step of 2 takes every other row between diamonds are forever and from russia with love (both ends included)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6


In [19]:
bond.loc['GoldenEye':]
#pulls out every row from the goldeneye label through the end of the df

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [20]:
bond.loc[:'Octopussy'] #every row from the beginning of the df up to and including the octopussy label

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [21]:
bond.loc[['Moonraker','Skyfall', 'Die Another Day']]
#takes a list ['Moonraker', 'Skyfall', 'Die Another Day'] and returns the rows whose labels match
#the order the labels are passed in the list is the order the rows are returned in the new df


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [27]:
#every label in the input list must exist or it'll raise a KeyError
#to check whether a value is actually an index label we can use the in keyword
#if it returns False you can't pass that value into the .loc[] accessor
'Moonraker' in bond.index

True

### Retrieve Rows by Index Position using the .iloc[] Accessor

In [50]:
#reload with the default numeric index so labels and positions line up for the first .iloc[] examples
bond = pd.read_csv('./data/jamesbond.csv')
bond.head(3)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [30]:
bond.iloc[0] #creates a new series object from the first row (position 0) of the df

Film                        Dr. No
Year                          1962
Actor                 Sean Connery
Director             Terence Young
Box Office                   448.8
Budget                           7
Bond Actor Salary              0.6
Name: 0, dtype: object

In [31]:
bond.iloc[15]  #creates a new series object from the 16th film (position 15) in our df

Film                 A View to a Kill
Year                             1985
Actor                     Roger Moore
Director                    John Glen
Box Office                      275.2
Budget                           54.5
Bond Actor Salary                 9.1
Name: 15, dtype: object

In [33]:
bond.iloc[[15,20]]  #a list of positions creates a new df with the rows at index positions 15 and 20

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
15,A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
20,The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5


In [35]:
#providing an index position that doesn't exist raises an IndexError
#catch it and print the message so this demo doesn't halt a restart & run all
try:
    bond.iloc[222]
except IndexError as err:
    print('IndexError:', err)

IndexError: single positional indexer is out-of-bounds

In [38]:
#list slicing syntax works here too
bond.iloc[4:8]
#a position slice with .iloc[] does NOT include the end position (rows 4,5,6,7 here)
#while a label slice with .loc[] DOES include the end label

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
5,You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
6,On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
7,Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8


In [40]:
bond.iloc[20:] #returns a new df with every row from index position 20 to the end of the df

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
20,The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5
21,Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
22,Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
23,Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
24,Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
25,Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


In [43]:
bond.iloc[:4] #returns a new df with the rows from the beginning up to index position 3 (4 rows; position 4 is excluded)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


In [51]:
#index positions exist no matter what labels the index holds,
#so the .iloc[] accessor keeps working once the numeric index is replaced
#make film the index and sort it alphabetically in a single chain
bond = bond.set_index('Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [56]:
bond.iloc[[0,2,8]]
#can still use .iloc[] to access the first, third, and ninth index positions even though there's no numeric index
bond.iloc[2]
#single positions and position slices work as well; only this last expression is displayed
bond.iloc[:5]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


### Passing second arguments to the loc and iloc accessors

In [57]:
#read the csv with film as the index and sort that index in one chained expression
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [61]:
#a second argument picks out certain columns from the rows you select with loc/iloc
bond.loc['Moonraker', 'Actor']
#returns the value of the actor column in the moonraker row as a string

'Roger Moore'

In [76]:
bond.loc['Moonraker', 'Director']
#returns the director of moonraker as a string
bond.loc['Moonraker',['Director','Box Office']]
#returns a new series with the director and box office of moonraker
bond.loc[['Moonraker', 'GoldenEye'],['Director', 'Actor', 'Box Office']]
#returns a new df with the 2 films and their director, actor, and box office values

Unnamed: 0_level_0,Director,Actor,Box Office
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Moonraker,Lewis Gilbert,Roger Moore,535.0
GoldenEye,Martin Campbell,Pierce Brosnan,518.5


In [78]:
#can also pass in label slices as the second argument
bond.loc['Moonraker', 'Director':'Budget']
#returns a new series with the director, box office and budget as values
bond.loc['Moonraker':'Thunderball', 'Director':'Budget']
#returns the columns from director to budget for each row from moonraker to thunderball
bond.loc['Moonraker':, 'Director':]
#returns a new df with rows of moonraker to the end and columns of director to the end
bond.loc[:'Moonraker',:'Budget']
#returns new df of rows from start to (including) moonraker and columns from start to (including) budget

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6


In [80]:
#same logic applies to the .iloc[] accessor except both arguments take index positions rather than labels
bond.iloc[14]

Year                        1983
Actor                Roger Moore
Director               John Glen
Box Office                 373.8
Budget                      53.9
Bond Actor Salary            7.8
Name: Octopussy, dtype: object

In [82]:
bond.iloc[14,2] #returns the string at the intersection of the 15th row and 3rd column
#a position slice as the second argument returns a series of the columns at positions 2, 3 and 4 (5 is excluded)
bond.iloc[14,2:5]

Director      John Glen
Box Office        373.8
Budget             53.9
Name: Octopussy, dtype: object

In [84]:
bond.iloc[:7,:3]
#returns the films at index positions 0,1,2,3,4,5,6 with only the columns at positions 0, 1, and 2

Unnamed: 0_level_0,Year,Actor,Director
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A View to a Kill,1985,Roger Moore,John Glen
Casino Royale,2006,Daniel Craig,Martin Campbell
Casino Royale,1967,David Niven,Ken Hughes
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton
Die Another Day,2002,Pierce Brosnan,Lee Tamahori
Dr. No,1962,Sean Connery,Terence Young
For Your Eyes Only,1981,Roger Moore,John Glen


### Set New Value for a Specific Cell or Cells in a Row

In [85]:
#read the csv with film as the index and sort that index in one chained expression
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [88]:
#look up the current value of the cell before overwriting it
bond.loc['Dr. No', 'Actor']

'Sean Connery'

In [89]:
bond.loc['Dr. No', 'Actor'] = 'Sir Sean Connery'
#the assignment itself produces no output, but it changes the cell at the dr. no row and actor column
bond.loc['Dr. No']

Year                             1962
Actor                Sir Sean Connery
Director                Terence Young
Box Office                      448.8
Budget                              7
Bond Actor Salary                 0.6
Name: Dr. No, dtype: object

In [93]:
#here's how to overwrite multiple cells in a single row: pass a list of columns and a list of values of the same length
bond.loc['Dr. No', ['Box Office', 'Budget', 'Bond Actor Salary']] = [300, 2, 0.3]
bond.loc['Dr. No']
#now the box office value, budget, and actor salary for dr. no are set to 300, 2, and 0.3 respectively

Year                             1962
Actor                Sir Sean Connery
Director                Terence Young
Box Office                        300
Budget                              2
Bond Actor Salary                 0.3
Name: Dr. No, dtype: object

### Set Multiple Values in a DF

In [94]:
#read the csv with film as the index and sort that index in one chained expression
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [100]:
#every time 'sean connery' comes up in the actor column we want to change it to 'sir sean connery'
#need to filter the df down to the rows that have sean connery as the actor
#this is done by creating a boolean series like we used in section 4
bond['Actor'] == 'Sean Connery' #returns a boolean series

bond[bond['Actor'] == 'Sean Connery'] #returns a new df with rows in which the boolean series was True
bond[bond['Actor'] == 'Sean Connery']['Actor'] == 'Sir Sean Connery'
#careful: == on the line above only COMPARES, so the cell just displays a series of False and changes nothing.
#writing it with = (chained indexing) is still bad: the first [] returns a new copy of the df,
#so the assignment would land on that copy and the original bond df would stay unchanged.

Film
Diamonds Are Forever     False
Dr. No                   False
From Russia with Love    False
Goldfinger               False
Never Say Never Again    False
Thunderball              False
You Only Live Twice      False
Name: Actor, dtype: bool

In [101]:
bond.head()  #see, the initial bond df still doesn't have sir sean connery (diamonds are forever still shows sean connery)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [104]:
#better way is to feed the boolean series into the .loc[] accessor
actor_is_sean_connery = bond['Actor'] == 'Sean Connery'
bond.loc[actor_is_sean_connery]
#on its own this also just returns the matching rows as a df;
#the advantage of .loc[] is that it takes the row selector AND a column in one call,
#which lets an assignment write straight into the original bond df (next cell)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [108]:
#a single .loc[rows, column] = value call operates on bond itself, with no intermediate copy in between
bond.loc[actor_is_sean_connery, 'Actor'] = 'Sir Sean Connery'
#so every matching actor cell is set to the value we want
bond.head()
#we now see the actor has changed to sir sean connery
#because we used .loc we modified the initial df, rather than just a copy as chained indexing would.

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sir Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


### Rename Index Labels or Columns in a DF

In [109]:
#read the csv with film as the index and sort that index in one chained expression
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [111]:
#option 1 is the .rename() method
#can either use the mapper and axis parameters in combination
#or use the index and columns parameters independently
bond.rename(mapper = {'GoldenEye': 'Golden Eye', 'The World Is Not Enough':'Best Bond Movie Ever'}, axis = 0)
#mapper takes a dictionary where the key is the old name and the value is the new name; axis = 0 applies it to the index labels
#in the returned df goldeneye has changed to golden eye and the world is not enough has changed to best bond movie ever
#(no inplace here, so bond itself is untouched)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Golden Eye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [113]:
#can also use the index parameter, which takes the same kind of dict as above; no axis is needed in this scenario.
#inplace = True applies the new labels to bond itself
bond.rename(index = {'GoldenEye':'Golden Eye', 'The World Is Not Enough':'Best Bond Movie Ever'},inplace = True)
bond.head()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [114]:
#to rename column names we can either use mapper and specify axis = 1
#or just use the columns parameter (similar to using the index parameter)
#this call returns a renamed copy for display; bond itself is untouched
bond.rename(mapper = {'Year':'Release Date', 'Box Office': 'Revenue'}, axis = 1)

Unnamed: 0_level_0,Release Date,Actor,Director,Revenue,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Golden Eye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [115]:
#same rename via the columns parameter; inplace = True applies it to bond itself
bond.rename(columns = {'Year':'Release Date', 'Box Office':'Revenue'}, inplace = True)
bond.head()

Unnamed: 0_level_0,Release Date,Actor,Director,Revenue,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [116]:
#another strategy that's useful for renaming all columns at once
#overwrite bond.columns with a new list of names
#the new list must have the same length as bond.columns, and the names are applied by position
bond.columns = ['Year of Release', 'Actor', 'Director', 'Gross', 'Cost', 'Salary']
bond.head()

Unnamed: 0_level_0,Year of Release,Actor,Director,Gross,Cost,Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


### Delete Rows or Columns from a DF

In [127]:
#read the csv with film as the index and sort that index in one chained expression
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [118]:
#the .drop() method on rows: pass the index label of the row to remove
bond.drop('A View to a Kill') #returns a brand new df without a view to a kill; bond itself still has the row

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9


In [120]:
#can also pass a list of index labels into drop; inplace = True removes those rows from bond itself
bond.drop(['A View to a Kill', 'Die Another Day', 'From Russia with Love', 'Casino Royale'], inplace = True)
#note: running this cell a second time raises a KeyError because the labels are already gone
bond.head(1)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8


In [123]:
#remove columns using the .drop() method
bond.drop('Box Office', axis = 1) #axis = 1 targets columns; returns a new df (not displayed here) and leaves bond unchanged
bond.drop(columns = ['Box Office', 'Bond Actor Salary', 'Actor']) #columns = is the keyword form and accepts a list; also returns a new df

Unnamed: 0_level_0,Year,Director,Budget
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A View to a Kill,1985,John Glen,54.5
Casino Royale,2006,Martin Campbell,145.3
Casino Royale,1967,Ken Hughes,85.0
Diamonds Are Forever,1971,Guy Hamilton,34.7
Die Another Day,2002,Lee Tamahori,154.2
Dr. No,1962,Terence Young,7.0
For Your Eyes Only,1981,John Glen,60.2
From Russia with Love,1963,Terence Young,12.6
GoldenEye,1995,Martin Campbell,76.9
Goldfinger,1964,Guy Hamilton,18.6


In [124]:
#remove data using the .pop() method
#.pop() takes a single column label and permanently removes that column from the df. No need to set inplace = True
actor = bond.pop('Actor')
#pops the actor column off the bond df
#and returns it as a series (still indexed by film), stored here in actor

In [125]:
bond.head()

Unnamed: 0_level_0,Year,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A View to a Kill,1985,John Glen,275.2,54.5,9.1
Casino Royale,2006,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Lee Tamahori,465.4,154.2,17.9


In [126]:
actor

Film
A View to a Kill                      Roger Moore
Casino Royale                        Daniel Craig
Casino Royale                         David Niven
Diamonds Are Forever                 Sean Connery
Die Another Day                    Pierce Brosnan
Dr. No                               Sean Connery
For Your Eyes Only                    Roger Moore
From Russia with Love                Sean Connery
GoldenEye                          Pierce Brosnan
Goldfinger                           Sean Connery
Licence to Kill                    Timothy Dalton
Live and Let Die                      Roger Moore
Moonraker                             Roger Moore
Never Say Never Again                Sean Connery
Octopussy                             Roger Moore
On Her Majesty's Secret Service    George Lazenby
Quantum of Solace                    Daniel Craig
Skyfall                              Daniel Craig
Spectre                              Daniel Craig
The Living Daylights               Timothy Da

In [128]:
#can also delete columns using the del keyword
del bond['Director']
bond.head()
#now the existing bond df doesn't have the director column

Unnamed: 0_level_0,Year,Actor,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A View to a Kill,1985,Roger Moore,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,581.5,145.3,3.3
Casino Royale,1967,David Niven,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,465.4,154.2,17.9


In [131]:
del bond['Year']
bond.head()
#del removes the column from the original df directly, so there is no inplace = True to set

Unnamed: 0_level_0,Actor,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A View to a Kill,Roger Moore,275.2,54.5,9.1
Casino Royale,Daniel Craig,581.5,145.3,3.3
Casino Royale,David Niven,315.0,85.0,
Diamonds Are Forever,Sean Connery,442.5,34.7,5.8
Die Another Day,Pierce Brosnan,465.4,154.2,17.9


### Create Random Sample with the .sample() method

In [132]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [136]:
#extract 5 rows at random
#no random_state is passed, so a different set of rows comes back on every run
bond.sample(n = 5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,


In [137]:
bond.sample(frac = 0.25)
#returns a new df containing 25% of the rows at random (6 rows in the output below)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Tomorrow Never Dies,1997,Pierce Brosnan,Roger Spottiswoode,463.2,133.9,10.0


In [139]:
bond.sample(n = 3, axis = 1)
#axis = 1 samples columns instead of rows: returns all the rows with 3 columns chosen at random

Unnamed: 0_level_0,Actor,Year,Director
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A View to a Kill,Roger Moore,1985,John Glen
Casino Royale,Daniel Craig,2006,Martin Campbell
Casino Royale,David Niven,1967,Ken Hughes
Diamonds Are Forever,Sean Connery,1971,Guy Hamilton
Die Another Day,Pierce Brosnan,2002,Lee Tamahori
Dr. No,Sean Connery,1962,Terence Young
For Your Eyes Only,Roger Moore,1981,John Glen
From Russia with Love,Sean Connery,1963,Terence Young
GoldenEye,Pierce Brosnan,1995,Martin Campbell
Goldfinger,Sean Connery,1964,Guy Hamilton


### Use the .nsmallest()/.nlargest() methods to get rows with smallest/largest values in a specific column

In [140]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [141]:
#extract the 3 rows with the largest box office gross using sort_values
bond.sort_values('Box Office', ascending = False).head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [143]:
#nlargest() returns the same 3 rows as the sort_values + head approach above, in a single call
bond.nlargest(3, columns = 'Box Office')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [145]:
#return a new df with the 2 smallest box office values
bond.nsmallest(n = 2, columns = 'Box Office')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


In [146]:
# largest budgets
bond.nlargest(n = 3, columns = 'Budget')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [147]:
#six smallest bond actor salaries
bond.nsmallest(n = 6, columns = 'Bond Actor Salary')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [149]:
#can call nlargest directly on a series
#returns a new series with the film as the index and the box office as the value of the series
bond['Box Office'].nlargest(8)

Film
Skyfall                  943.5
Thunderball              848.1
Goldfinger               820.4
Spectre                  726.7
Casino Royale            581.5
From Russia with Love    543.8
Moonraker                535.0
The Spy Who Loved Me     533.0
Name: Box Office, dtype: float64

In [151]:
#extract a new series of the 2 earliest films
bond['Year'].nsmallest(2)

Film
Dr. No                   1962
From Russia with Love    1963
Name: Year, dtype: int64

### Filter a DF with the .where() method

In [3]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [4]:
#.where() returns a df with the same shape as the original: rows that meet the criteria keep their actual values
#while rows that do not meet the criteria are filled with null values
#start by creating a bool series
mask = bond['Actor'] == 'Sean Connery'
#for comparison: passing the mask into df[] returns only the rows where the bool series is True
bond[mask]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [5]:
#instead we use .where() to keep every row of the df
bond.where(mask)
#returns the whole df. rows where the mask was False have NaN values
#Year shows up as a float (e.g. 1971.0) because the NaN values force the integer column to float


Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,,,,,,
Casino Royale,,,,,,
Casino Royale,,,,,,
Diamonds Are Forever,1971.0,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,,,,,,
Dr. No,1962.0,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,,,,,,
From Russia with Love,1963.0,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,,,,,,
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [7]:
bond.where(bond['Box Office'] > 800)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,,,,,,
Casino Royale,,,,,,
Casino Royale,,,,,,
Diamonds Are Forever,,,,,,
Die Another Day,,,,,,
Dr. No,,,,,,
For Your Eyes Only,,,,,,
From Russia with Love,,,,,,
GoldenEye,,,,,,
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [8]:
#build one bool series per condition
mask = bond['Actor'] == 'Sean Connery'
mask2 = bond['Box Office'] > 800
#combine them with & : only the rows where both conditions are True keep their values
bond.where(mask & mask2)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,,,,,,
Casino Royale,,,,,,
Casino Royale,,,,,,
Diamonds Are Forever,,,,,,
Die Another Day,,,,,,
Dr. No,,,,,,
For Your Eyes Only,,,,,,
From Russia with Love,,,,,,
GoldenEye,,,,,,
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### Filter a DF with the .query() method

In [9]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [10]:
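#the input for the .query() method is a string
#.query() reads column names as Python identifiers, so a name containing spaces can't be written as-is
#(recent pandas versions also accept such names wrapped in backticks, e.g. `Box Office` > 600)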

In [13]:
#swap every space in the column names for an underscore so .query() can reference them directly
bond.columns = bond.columns.str.replace(' ', '_', regex = False)

In [16]:
#extract rows that have sean connery as the actor
#inplace = False is the default: query() returns a new df and leaves bond untouched
bond.query('Actor == "Sean Connery"', inplace = False)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [17]:
bond.query('Director == "Terence Young"')

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


In [19]:
#find the bond movies that don't star roger moore
bond.query('Actor != "Roger Moore"')

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,


In [21]:
bond.query('Box_Office > 600')

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


In [22]:
#combine two conditions with and: only rows that satisfy both are returned
#(the quote styles are swapped here: double quotes around the query, single quotes around the values)
bond.query("Actor == 'Roger Moore' and Director == 'John Glen'")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8


In [24]:
#not in excludes every row whose Actor appears in the given list
bond.query("Actor not in ['Timothy Dalton', 'George Lazenby']")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### A Review of the .apply() method on a pandas Series object

In [31]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [28]:
def convert_to_string_add_millions(number):
    """Return the value rendered as text with a ' Million' suffix, e.g. 275.2 -> '275.2 Million'.

    Missing values are not special-cased: NaN comes back as 'nan Million'.
    """
    return '{} Million'.format(number)

#apply() calls the function once per value and returns a new series, which overwrites the column
bond['Box Office'] = bond['Box Office'].apply(convert_to_string_add_millions)

In [30]:
#same conversion for the Budget column; both columns now hold strings instead of floats
bond['Budget']= bond['Budget'].apply(convert_to_string_add_millions)
bond.head()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2 Million,54.5 Million,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5 Million,145.3 Million,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0 Million,85.0 Million,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5 Million,34.7 Million,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4 Million,154.2 Million,17.9


In [32]:
#can do the .apply() method on each individual column as a series, like the 2 examples above, or
#use a for loop to iterate over all the columns you want to change.
#reload the data first: 'Box Office' and 'Budget' were already converted to strings by the cells above,
#so looping over that frame would append ' Million' a second time ('275.2 Million Million')
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film')
bond.sort_index(inplace = True)
columns = ['Box Office', 'Budget', 'Bond Actor Salary']
for i in columns:
    #each pass overwrites one column with its converted series
    bond[i] = bond[i].apply(convert_to_string_add_millions)
bond.head()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2 Million,54.5 Million,9.1 Million
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5 Million,145.3 Million,3.3 Million
Casino Royale,1967,David Niven,Ken Hughes,315.0 Million,85.0 Million,nan Million
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5 Million,34.7 Million,5.8 Million
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4 Million,154.2 Million,17.9 Million


### Apply a function to every DF Row with the .apply() method

In [37]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [34]:
#apply(axis = 1) hands each row to good_movie as a Series whose labels are the column names
def good_movie(row):
    """Rate one film from its lead actor and budget.

    row: a Series (or any mapping) with 'Actor' and 'Budget' entries.
    Returns 'The Best' for Pierce Brosnan, 'Enjoyable' for Roger Moore
    with a budget above 40, and 'I have no clue' for everything else.
    """
    #look the values up by column label rather than by position: integer keys on a
    #label-indexed Series rely on a deprecated fallback and break if the column order changes
    actor = row['Actor']
    budget = row['Budget']
    if actor == 'Pierce Brosnan':
        return 'The Best'
    elif actor == 'Roger Moore' and budget > 40:
        return 'Enjoyable'
    else:
        return 'I have no clue'

In [40]:
bond.apply(good_movie, axis = 1)

Film
A View to a Kill                        Enjoyable
Casino Royale                      I have no clue
Casino Royale                      I have no clue
Diamonds Are Forever               I have no clue
Die Another Day                          The Best
Dr. No                             I have no clue
For Your Eyes Only                      Enjoyable
From Russia with Love              I have no clue
GoldenEye                                The Best
Goldfinger                         I have no clue
Licence to Kill                    I have no clue
Live and Let Die                   I have no clue
Moonraker                               Enjoyable
Never Say Never Again              I have no clue
Octopussy                               Enjoyable
On Her Majesty's Secret Service    I have no clue
Quantum of Solace                  I have no clue
Skyfall                            I have no clue
Spectre                            I have no clue
The Living Daylights               I have no 

In [42]:
bond['Rating'] = bond.apply(good_movie, axis = 1)
bond.head()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary,Rating
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1,Enjoyable
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3,I have no clue
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,,I have no clue
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8,I have no clue
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9,The Best


In [None]:
#need to specify axis = 1 so it goes across the rows instead of down the columns.

### Create a Copy of a DF with the .copy() method

In [51]:
#load the csv with the Film column as the index and sort that index right away
#(a sorted index speeds up label lookups)
bond = pd.read_csv('./data/jamesbond.csv', index_col = 'Film').sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [44]:
#creates an exact copy of an already existing pandas object (df or series)
#but stores it completely separately in memory.

In [46]:
#selecting a single column returns a series (this bare expression is not displayed, since it isn't the last line of the cell)
bond['Director']
#assign this as a brand new variable
directors = bond['Director']
directors.head(3)

Film
A View to a Kill          John Glen
Casino Royale       Martin Campbell
Casino Royale            Ken Hughes
Name: Director, dtype: object

In [48]:
#directors was taken straight from bond, so changing the director of a view to a kill
#through it also changes the value in the original df (pandas warns about exactly this below)
directors['A View to a Kill'] = 'Mister John Glen'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [49]:
directors.head(3)

Film
A View to a Kill    Mister John Glen
Casino Royale        Martin Campbell
Casino Royale             Ken Hughes
Name: Director, dtype: object

In [50]:
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,Mister John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [52]:
#what if we don't want the initial df to be modified?
#we can create a copy of the column and work with that copy
directors_copy = bond['Director'].copy()
directors_copy['A View to a Kill'] = 'Mister John Glen'

In [53]:
directors_copy.head()

Film
A View to a Kill        Mister John Glen
Casino Royale            Martin Campbell
Casino Royale                 Ken Hughes
Diamonds Are Forever        Guy Hamilton
Die Another Day             Lee Tamahori
Name: Director, dtype: object

In [54]:
bond.head()  #we can see that the value has changed in the copy of the directors series but not
#in the original df

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [55]:
#we can also overwrite the actual director column in the original df after messing with the copy
bond['Director'] = directors_copy
bond.head()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,Mister John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
