# Merge


In [None]:
import pandas as pd

In [None]:
# Toy input tables, kept as plain column -> values dictionaries.
# Subject ids are strings. raw_data_1 and raw_data_2 share the same three
# columns and overlap on subject ids "4" and "5".
raw_data_1 = {}
raw_data_1["subject_id"] = ["1", "2", "3", "4", "5"]
raw_data_1["first_name"] = ["Alex", "Amy", "Allen", "Alice", "Ayoung"]
raw_data_1["last_name"] = ["Anderson", "Ackerman", "Ali", "Aoni", "Atiches"]

raw_data_2 = {}
raw_data_2["subject_id"] = ["4", "5", "6", "7", "8"]
raw_data_2["first_name"] = ["Billy", "Brian", "Bran", "Bryce", "Betty"]
raw_data_2["last_name"] = ["Bonder", "Black", "Balwner", "Brice", "Btisan"]

# Test ids keyed by subject id: contains "6" (absent from raw_data_1) and
# has no entry for "5".
raw_data_3 = {}
raw_data_3["subject_id"] = ["1", "2", "3", "4", "6"]
raw_data_3["test_id"] = [11, 12, 13, 14, 16]

In [None]:
# Turn each raw dictionary into a DataFrame, spelling out the column order.
data1 = pd.DataFrame(data=raw_data_1, columns=["subject_id", "first_name", "last_name"])
data2 = pd.DataFrame(data=raw_data_2, columns=["subject_id", "first_name", "last_name"])
data3 = pd.DataFrame(data=raw_data_3, columns=["subject_id", "test_id"])

# Render the three tables one after another in the cell output.
display(data1, data2, data3)

In [None]:
# Export the three tables to one workbook, "merge_raw.xlsx", one sheet each.
names = ["data1", "data2", "data3"]
dataframes = [data1, data2, data3]
with pd.ExcelWriter("merge_raw.xlsx") as writer:
    for sheet, table in zip(names, dataframes):
        # index=False leaves the row index out of the written sheet.
        table.to_excel(excel_writer=writer, sheet_name=sheet, index=False)

In [None]:
# Stack data2 underneath data1 (row-wise concatenation); each frame keeps
# its own row labels in the result.
all_data = pd.concat([data1, data2], axis="index")
all_data

In [None]:
# Reset index after concatenation: pd.concat kept each frame's own row labels,
# so all_data carries duplicate labels. drop=True discards those old labels
# instead of inserting them as a new column, leaving a fresh 0..n-1 index.
all_data = all_data.reset_index(drop=True)
all_data

In [None]:
# Place data2 to the right of data1 (column-wise concatenation); rows are
# aligned on the row index, not on subject_id.
pd.concat([data1, data2], axis="columns")

### Join diagram

[Diagram of inner, outer, left and right joins](https://data36.com/wp-content/uploads/2018/08/4-pandas-merge-inner-outer-left-right-768x579.png)


In [None]:
# Merge data1 and data3 along the subject_id value (inner join): only the
# subject ids present in both frames are kept.
# A notebook cell renders only its final bare expression, so each result is
# passed to display() explicitly; otherwise the first merge would be computed
# and silently thrown away.
display(pd.merge(data1, data3, on="subject_id", how="inner"))

# Alternate syntax for merging data1 and data3 (method form, same result)
display(data1.merge(data3, on="subject_id", how="inner"))

In [None]:
# Left join on subject_id: every row of data1 is kept; subjects without a
# match in data3 get a missing test_id.
pd.merge(data1, data3, how="left", on="subject_id")

In [None]:
# Right join on subject_id: every row of data3 is kept; subjects without a
# match in data1 get missing name columns.
pd.merge(data1, data3, how="right", on="subject_id")

In [None]:
# Outer join on subject_id: the union of subject ids from data1 and data3 is
# kept, with missing values wherever one side has no match.
pd.merge(data1, data3, how="outer", on="subject_id")

In [None]:
# Give the key column a different name in each frame so the keys no longer
# match by name.
# NOTE: this rebinds data1 and data3; the earlier cells that merge on
# "subject_id" will raise a KeyError if they are re-run after this cell.
data1 = data1.rename({"subject_id": "id1"}, axis="columns")
data3 = data3.rename({"subject_id": "id3"}, axis="columns")

# Inner join of data1 and data3 where the key is called "id1" on the left and
# "id3" on the right; both key columns appear in the result.
pd.merge(data1, data3, how="inner", left_on="id1", right_on="id3")

### Join `dummy_data1` and `dummy_data2`, then merge the result with `dummy_data3`


In [None]:
# Exercise data, kept as column -> values dictionaries with string ids.
# dummy_data1 and dummy_data2 have the same columns and share ids "1" and "2".
dummy_data1 = {}
dummy_data1["id"] = ["1", "2", "3", "4", "5"]
dummy_data1["Feature1"] = ["A", "C", "E", "G", "I"]
dummy_data1["Feature2"] = ["B", "D", "F", "H", "J"]

dummy_data2 = {}
dummy_data2["id"] = ["1", "2", "6", "7", "8"]
dummy_data2["Feature1"] = ["K", "M", "O", "Q", "S"]
dummy_data2["Feature2"] = ["L", "N", "P", "R", "T"]

# Feature3 values per id; note there is no entry for id "6".
dummy_data3 = {}
dummy_data3["id"] = ["1", "2", "3", "4", "5", "7", "8", "9", "10", "11"]
dummy_data3["Feature3"] = [12, 13, 14, 15, 16, 17, 15, 12, 13, 23]

# Build one DataFrame per dummy dictionary, spelling out the column order.
dummy_df1 = pd.DataFrame(data=dummy_data1, columns=["id", "Feature1", "Feature2"])
dummy_df2 = pd.DataFrame(data=dummy_data2, columns=["id", "Feature1", "Feature2"])
dummy_df3 = pd.DataFrame(data=dummy_data3, columns=["id", "Feature3"])

# Render the three tables one after another in the cell output.
display(dummy_df1, dummy_df2, dummy_df3)