In [17]:
import pandas as pd

In [18]:
# Suppose we have two dataframes that share the some
# population, e.g a set of students and a set of staff 
# with the population of interest being the set of all 
# students who are also members of the staff

# What if we wanted to get a list of all people regardless
# of whether they're students or members of staff? In database
# parlance, we would perfom a FULL OUTER JOIN, which is 
# equivalent to what it's refered to as an Union in set theory

# If instead we wanted those who are both students AND staff
# we would make use of an INNER JOIN, the database equivalent
# of an intesection in set theory

In [19]:
# To implement this in Pandas we make use of the merge()
# function

# Create a staff and students dataframe
staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},
{'Name': 'Sally', 'Role': 'Course liason'},
{'Name': 'James', 'Role': 'Grader'}])

students_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},
{'Name': 'Mike', 'School': 'Law'},
{'Name': 'Sally', 'School': 'Engineering'}])

# Index the dataframes by name
staff_df = staff_df.set_index('Name')
students_df = students_df.set_index('Name')

print(staff_df.head())
print(students_df.head())

                 Role
Name                 
Kelly  Director of HR
Sally   Course liason
James          Grader
            School
Name              
James     Business
Mike           Law
Sally  Engineering


In [20]:
# There's some overlap: James and Kelly are both students
# and staff, plus both dataframes are indexed along the value
# ('Name') we want top merge them on

# To get a full outer join, we use the merge() function like so
pd.merge(staff_df, students_df, how = 'outer', left_index = True, right_index = True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liason,Engineering


In [21]:
# To get an inner join, we just need to set how = 'inner'
pd.merge(staff_df, students_df, how = 'inner', left_index = True, right_index = True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liason,Engineering
James,Grader,Business


In [22]:
# We can also do set addition, called left and right joins
# in database lingo. In merge(), the left is the 1st and the 
# right is the 2nd one

# Get list of all staff regardless of their student status
pd.merge(staff_df, students_df, how = 'left', left_index = True, right_index = True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liason,Engineering
James,Grader,Business


In [23]:
# Get list of all students regardless of their staff status
pd.merge(staff_df, students_df, how = 'right', left_index = True, right_index = True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Mike,,Law
Sally,Course liason,Engineering


In [24]:
# You dont have to use index to join the dataframes, columns
# can be used as well by calling the parameter 'on'

# Remove index from dataframes
staff_df = staff_df.reset_index()
students_df = students_df.reset_index()

# Now merge using the 'on' parameter
pd.merge(staff_df, students_df, how = 'right', on = 'Name')

Unnamed: 0,Name,Role,School
0,James,Grader,Business
1,Mike,,Law
2,Sally,Course liason,Engineering


In [28]:
# What happens when there are conflicts?
staff_df = pd.DataFrame([
{'Name': 'Kelly', 'Role': 'Director of HR', 
'Location': 'State Street'},
{'Name': 'Sally', 'Role': 'Course Liason', 
'Location': 'Washington Avenue'},
{'Name': 'James', 'Role': 'Grader', 
'Location': 'Washington Avenue'}
])

students_df = pd.DataFrame([
{'Name': 'James', 'School': 'Business',
'Location': '1024 Billiard Avenue'},
{'Name': 'Mike', 'School': 'Law',
'Location': 'Fraternity house #22'},
{'Name': 'Sally', 'School': 'Engineering',
'Location': '512 Wilson Crescent'}
])

print(staff_df, )
print(students_df)

    Name            Role           Location
0  Kelly  Director of HR       State Street
1  Sally   Course Liason  Washington Avenue
2  James          Grader  Washington Avenue 

    Name       School              Location
0  James     Business  1024 Billiard Avenue
1   Mike          Law  Fraternity house #22
2  Sally  Engineering   512 Wilson Crescent


In [29]:
# For staff, the location column indicates their office location,
# while for students it indicates their home adress.

# merge() preserves this info but appends an _x or _y to help
# differentiate the index. Underscore _x is always for the left
# dataframe, and _y is always for the right one

# Do a left join for Staff
pd.merge(staff_df, students_df, how = 'left', on = 'Name') 

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course Liason,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


In [31]:
# It's possible that the first name of students and staff
# overlap, but the last name might not.
# In this case we need to use a list of multiple columns to
# be used for joining keys from both dataframes on the parameter

staff_df = pd.DataFrame([
{'Name': 'Kelly', 'Last Name': 'Desjardins',
'Role': 'Director of HR'},
{'Name': 'Sally', 'Last Name': 'Brooks',
 'Role': 'Course Liason'},
{'Name': 'James', 'Last Name': 'Wilde',
 'Role': 'Grader'}
])

students_df = pd.DataFrame([
{'Name': 'James', 'Last Name': 'Hammond',
 'School': 'Business'},
{'Name': 'Mike', 'Last Name': 'Smith',
 'School': 'Law'},
{'Name': 'Sally', 'Last Name': 'Brooks',
 'School': 'Engineering'}
])

print(staff_df, '\n')
print(students_df)

    Name   Last Name            Role
0  Kelly  Desjardins  Director of HR
1  Sally      Brooks   Course Liason
2  James       Wilde          Grader 

    Name Last Name       School
0  James   Hammond     Business
1   Mike     Smith          Law
2  Sally    Brooks  Engineering


In [32]:
# Notice that there are two James both wth different 
# lastnames, so an inner join would not include them
# in the output when we pass a list to 'on

# With no list, pandas treats it as one individual with
# different last names
pd.merge(staff_df, students_df, how = 'inner', on = 'Name')

Unnamed: 0,Name,Last Name_x,Role,Last Name_y,School
0,Sally,Brooks,Course Liason,Brooks,Engineering
1,James,Wilde,Grader,Hammond,Business


In [33]:
# With a list, we tell pandas they have to agree on all
# those columns and it correctly notices they're different
# people
pd.merge(staff_df, students_df, how = 'inner', on = ['Name', 'Last Name'])

Unnamed: 0,Name,Last Name,Role,School
0,Sally,Brooks,Course Liason,Engineering


In [42]:
# If we think of merging as joining 'horizontally' then
# concatenating is joining 'vertically', putting dataframes
# on top or below each other

# Let's take a dataset, tracking certain info over the years
# and storing each year in a separate .csv file, and concatenate it
# into a single dataframe.

# 2011-2013 records on US universities data on students completetion,
# student debt, after-graduation income, etc
df_2011 = pd.read_csv('datasets/MERGED2011_12_PP.csv', on_bad_lines = 'skip')
df_2012 = pd.read_csv('datasets/MERGED2012_13_PP.csv', on_bad_lines = 'skip')
df_2013 = pd.read_csv('datasets/MERGED2013_14_PP.csv', on_bad_lines = 'skip')

df_2011.head()

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
1,size 309076268


In [44]:
# It has more than 1900 columns!

print(len(df_2011))
print(len(df_2012))
print(len(df_2013))

2
2
2


In [45]:
# Let's put all three dataframes in a list and
# use the concat() function to join them
frames = [df_2011, df_2012, df_2013]
pd.concat(frames)

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
1,size 309076268
0,oid sha256:55cfd6746fdcc1cb5a29350c5a39c742ae8...
1,size 157050855
0,oid sha256:dbef09960b9dd4392f144a05562af3639d8...
1,size 157811280


In [46]:
# To show what observations are from what year we
# can use the 'keys' parameter to set an extra level
# of indices
pd.concat(frames, keys = ['2011', '2012', '2013']) 

Unnamed: 0,Unnamed: 1,version https://git-lfs.github.com/spec/v1
2011,0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
2011,1,size 309076268
2012,0,oid sha256:55cfd6746fdcc1cb5a29350c5a39c742ae8...
2012,1,size 157050855
2013,0,oid sha256:dbef09960b9dd4392f144a05562af3639d8...
2013,1,size 157811280


In [47]:
# Concatenation also has inner and outer join methods

# Outer is for dataframes that don't have identical columns
# with this, some cells will be NaN

# With inner, some observations will be dropped due to NaNs

# This is similar to left and righ joins with merge()