# Merging DataFrames

In [None]:
# Full outer join = union
# Inner join = intersection

In [2]:
# Example of merging DataFrames using the different types of join
import pandas as pd

# Staff records, indexed by each person's name.
staff_df = pd.DataFrame(
    [
        ('Kelly', 'Dicrector of HR'),
        ('Sally', 'Course liasion'),
        ('James', 'Grader'),
    ],
    columns=['Name', 'Role'],
).set_index('Name')

# Student records, indexed by name in the same way.
student_df = pd.DataFrame(
    [
        ('James', 'Business'),
        ('Mike', 'Law'),
        ('Sally', 'Engineering'),
    ],
    columns=['Name', 'School'],
).set_index('Name')

# Show the staff frame first, then the student frame.
print(staff_df.head(), student_df.head(), sep='\n')

                  Role
Name                  
Kelly  Dicrector of HR
Sally   Course liasion
James           Grader
            School
Name              
James     Business
Mike           Law
Sally  Engineering


In [3]:
# An outer join returns the union of both frames: every name found in either
# one gets a row, with missing values on the side that has no data for it.
# The staff frame is the left side and the student frame the right side; both
# are indexed by name, so their indices are used as the join keys.
staff_df.merge(student_df, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Dicrector of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [4]:
# For the intersection (only names present in both frames), set the how
# parameter to 'inner'.
staff_df.merge(student_df, how='inner', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Business


In [5]:
# A left join lists every staff member, whether or not they are also a
# student; for those who are, their student details are included as well.
staff_df.merge(student_df, how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Dicrector of HR,
Sally,Course liasion,Engineering
James,Grader,Business


In [None]:
# We could do the same from the students' side with a right join (how='right')

In [6]:
# DataFrames can also be merged on columns instead of indices: pass the name
# of the column both frames share through the 'on' parameter.

# Move 'Name' out of the index and back into a regular column in both frames.
staff_df, student_df = staff_df.reset_index(), student_df.reset_index()

# Right join on the shared 'Name' column.
staff_df.merge(student_df, how='right', on='Name')

Unnamed: 0,Name,Role,School
0,Sally,Course liasion,Engineering
1,James,Grader,Business
2,Mike,,Law


In [7]:
# The two DataFrames may conflict, i.e. both have a column with the same label
# but different data. merge() keeps both columns and appends a suffix to tell
# them apart: _x for the left DataFrame's column, _y for the right DataFrame's.
staff_df['Location'] = [
    'State Street',
    'Washington Avenue',
    'Washington Avenue',
]
student_df['Location'] = [
    '1024 Billiard Avenue',
    'Fraternity House #22',
    '512 Wilson Crescent',
]

staff_df.merge(student_df, how='left', on='Name')

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Dicrector of HR,State Street,,
1,Sally,Course liasion,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


__Multi-indexing and multiple columns__

In [8]:
# Staff records, now with separate first-name and last-name columns.
staff_df = pd.DataFrame(
    [
        ('Kelly', 'Desjardins', 'Dicrector of HR'),
        ('Sally', 'Brooks', 'Course liasion'),
        ('James', 'Wilde', 'Grader'),
    ],
    columns=['First Name', 'Last Name', 'Role'],
)

# Student records with the same two name columns.
student_df = pd.DataFrame(
    [
        ('James', 'Hammond', 'Business'),
        ('Mike', 'Smith', 'Law'),
        ('Sally', 'Brooks', 'Engineering'),
    ],
    columns=['First Name', 'Last Name', 'School'],
)

In [9]:
# Merging on multiple columns: a row matches only when both the first name
# and the last name agree.
staff_df.merge(student_df, how='inner', on=['First Name', 'Last Name'])

Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


__Concatenation__

In [11]:
# Concatenate both DataFrames; the keys label each row so we can still tell
# which data came from staff and which from students.
pd.concat(
    [staff_df, student_df],
    keys=['Staff', 'Students'],
)

Unnamed: 0,Unnamed: 1,First Name,Last Name,Role,School
Staff,0,Kelly,Desjardins,Dicrector of HR,
Staff,1,Sally,Brooks,Course liasion,
Staff,2,James,Wilde,Grader,
Students,0,James,Hammond,,Business
Students,1,Mike,Smith,,Law
Students,2,Sally,Brooks,,Engineering


In [None]:
# concat() also supports inner and outer joins through its join parameter.
# With an outer join, every column from either DataFrame is kept, and rows from a DataFrame that lacks a column get NaN there.
# With an inner join, only the columns present in all the DataFrames are kept; the others are dropped.
# We can think of this as analogous to the outer and inner joins of the merge() function, applied to the columns.

In [15]:
# Outer concatenation, labelling the rows of each frame with its key.
pd.concat(
    [staff_df, student_df],
    join='outer',
    keys=['Staff', 'Students'],
)

Unnamed: 0,Unnamed: 1,First Name,Last Name,Role,School
Staff,0,Kelly,Desjardins,Dicrector of HR,
Staff,1,Sally,Brooks,Course liasion,
Staff,2,James,Wilde,Grader,
Students,0,James,Hammond,,Business
Students,1,Mike,Smith,,Law
Students,2,Sally,Brooks,,Engineering


In [16]:
# Inner concatenation, labelling the rows of each frame with its key.
pd.concat(
    [staff_df, student_df],
    join='inner',
    keys=['Staff', 'Students'],
)

Unnamed: 0,Unnamed: 1,First Name,Last Name
Staff,0,Kelly,Desjardins
Staff,1,Sally,Brooks
Staff,2,James,Wilde
Students,0,James,Hammond
Students,1,Mike,Smith
Students,2,Sally,Brooks
