In [1]:
import numpy as np
import pandas as pd

## Merging and joining data frames

In [2]:
# Build the first example frame: five subjects with IDs '1' through '5'.
# The IDs are stored as strings, not integers.
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches'],
}
# Passing `columns` explicitly pins the column order of the resulting frame.
df_a = pd.DataFrame(
    data=raw_data,
    columns=['subject_id', 'first_name', 'last_name'],
)
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


In [3]:
# Build the second example frame: five subjects with IDs '4' through '8'.
# IDs '4' and '5' also occur in df_a, which matters for the join examples.
raw_data = {
    'subject_id': ['4', '5', '6', '7', '8'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
}
df_b = pd.DataFrame(
    data=raw_data,
    columns=['subject_id', 'first_name', 'last_name'],
)
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [4]:
# With its default settings, pd.concat() stacks the frames vertically (axis=0):
# df_b's rows are placed beneath df_a's. Each frame keeps its own index labels,
# so the labels 0-4 appear twice in the result.
df_new = pd.concat(objs=[df_a, df_b], axis=0)
df_new

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [5]:
# The result of concat() has the union of the input frames' columns.
# Here the second frame lacks 'subject_id', so that column is NaN for its rows.
pd.concat(objs=[df_a, df_b[['first_name', 'last_name']]], axis=0)

Unnamed: 0,first_name,last_name,subject_id
0,Alex,Anderson,1.0
1,Amy,Ackerman,2.0
2,Allen,Ali,3.0
3,Alice,Aoni,4.0
4,Ayoung,Atiches,5.0
0,Billy,Bonder,
1,Brian,Black,
2,Bran,Balwner,
3,Bryce,Brice,
4,Betty,Btisan,


In [6]:
# Show df_a again for reference before the horizontal concatenation examples
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


In [7]:
# Show df_b again; note that it uses the same index labels (0-4) as df_a
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [8]:
# Concatenating along the column axis (axis=1, also spelled 'columns') places
# the frames side by side instead of stacking them.
pd.concat(objs=[df_a, df_b], axis='columns')

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner
3,4,Alice,Aoni,7,Bryce,Brice
4,5,Ayoung,Atiches,8,Betty,Btisan


In [9]:
# When one frame has fewer rows than the other, the cells with no counterpart
# are filled with NaN.
# IMPORTANT: rows are paired up by index label, not by position. Only the first
# two rows of df_b (labels 0 and 1) are used here, so labels 2-4 get NaNs.
pd.concat(objs=[df_a, df_b.iloc[:2]], axis='columns')

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4.0,Billy,Bonder
1,2,Amy,Ackerman,5.0,Brian,Black
2,3,Allen,Ali,,,
3,4,Alice,Aoni,,,
4,5,Ayoung,Atiches,,,


In [10]:
# Side-by-side concatenation is arguably of limited use. More often we want to
# bring together all the data that belongs to the same subject ID.
# To demonstrate, create a frame that maps subject IDs to test IDs.
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16],
}
df_n = pd.DataFrame(data=raw_data, columns=['subject_id', 'test_id'])
df_n

Unnamed: 0,subject_id,test_id
0,1,51
1,2,15
2,3,15
3,4,61
4,5,16
5,7,14
6,8,15
7,9,1
8,10,61
9,11,16


In [11]:
# Recall df_new (df_a stacked on top of df_b): subject IDs 4 and 5 each appear twice
df_new

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [12]:
# Look up each person's test ID by joining the two frames on 'subject_id'.
# The join is an inner join by default, so a person with no entry in df_n
# (here, subject 6) is left out of the result.
df_new.merge(right=df_n, on='subject_id')

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,4,Billy,Bonder,61
5,5,Ayoung,Atiches,16
6,5,Brian,Black,16
7,7,Bryce,Brice,14
8,8,Betty,Btisan,15


In [13]:
# pd.merge() supports several types of join, selected with the `how` argument:
# outer joins, inner joins, left joins and right joins.
# The examples below join df_a with df_b, so here is df_a again for reference.
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


In [14]:
# ...and df_b for reference; only subject IDs 4 and 5 appear in both frames
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [15]:
# Outer join: keep every record from both frames, pairing up rows whose
# 'subject_id' matches. Where one side has no match, its columns are NaN.
# Here df_b has no subject IDs 1, 2, 3 and df_a has no subject IDs 6, 7, 8.
df_a.merge(right=df_b, how='outer', on='subject_id')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black
5,6,,,Bran,Balwner
6,7,,,Bryce,Brice
7,8,,,Betty,Btisan


In [16]:
# Inner join (what you get when `how` is omitted): keep only the records
# whose 'subject_id' is present in both frames.
# Here the only subject IDs the two frames share are 4 and 5.
df_a.merge(right=df_b, how='inner', on='subject_id')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


In [17]:
# Left join: keep every record from the left frame (df_a) and attach the
# matching record from the right frame (df_b) where one exists. Rows with no
# match get NaN in the right-hand columns.
# Here df_b has no subject IDs 1, 2, 3.
df_a.merge(right=df_b, how='left', on='subject_id')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black


In [18]:
# Right join: keep every record from the right frame (df_b) and attach the
# matching record from the left frame (df_a) where one exists. Rows with no
# match get NaN in the left-hand columns.
# Here df_a has no subject IDs 6, 7, 8.
df_a.merge(right=df_b, how='right', on='subject_id')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black
2,6,,,Bran,Balwner
3,7,,,Bryce,Brice
4,8,,,Betty,Btisan


In [19]:
# Exercise data: the first group of people (person IDs '1'-'5') and their names
raw_data = {
    'person_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches'],
}
people_df = pd.DataFrame(data=raw_data)

In [20]:
# Exercise data: the second group of people (person IDs '6'-'10') and their names
raw_data = {
    'person_id': ['6', '7', '8', '9', '10'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
}
people_df_2 = pd.DataFrame(data=raw_data)

In [21]:
# Exercise data: one row per car, giving the owner's person ID, the model and a car ID.
# Owner IDs '11' and '12' do not occur in either people frame.
raw_data = {
    'person_id': ['1', '2', '12', '6', '7', '11'],
    'car_model': [
        'Ford Focus', 'BMW M3', 'Lotus Elise',
        'Toyota Camry', 'Kia Picanto', 'Honda Accord',
    ],
    'car_id': ['1', '2', '3', '4', '5', '6'],
}
car_owners_df = pd.DataFrame(data=raw_data)

## Exercises

1. Put both sets of people (`people_df` and `people_df_2`) into a single data frame `all_people_df` and print it.
2. Create a data frame showing the car owned by each person. Don't include any person who doesn't own a car and don't include any car for which we don't have the owner's name.
3. Create a data frame showing the car owned by each person. This time, include people who don't own a car but don't include any car for which we don't have the owner's name.
4. Create a data frame showing the car owned by each person. This time, don't include people who don't own a car but do include any car for which we don't have the owner's name.
5. Create a data frame showing the car owned by each person. This time, include people who don't own a car and also include any car for which we don't have the owner's name.