# Merging datasets

- pd.concat()

In [8]:
# Setup
import pandas as pd
import numpy as np

# Build two small example frames that share the column labels A-D.
# np.arange(16) yields the integers 0..15; reshape(4, 4) arranges them as a 4x4 matrix.
df1 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    columns=list("ABCD"),
)
print("df1:", df1, sep="\n")

# np.zeros takes the shape as (n_rows, n_columns) and fills it with 0.0 (floats).
df2 = pd.DataFrame(
    data=np.zeros((3, 4)),
    columns=list("ABCD"),
)

print("\ndf2:", df2, sep="\n")



df1:
    A   B   C   D
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15

df2:
     A    B    C    D
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0


## Concat

- Concatenates along a particular axis.

- Set logic (`join="outer"` or `join="inner"`) decides which labels on the other axis are kept.

In [13]:
# pd.concat takes the frames to combine as a list.
# The default is axis=0 (rows): df2 is stacked below df1 and columns are matched by name.
stacked = pd.concat([df1, df2])
print(stacked)
# df1's integers are shown as floats here because df2 holds floats.

# The original row labels are kept, so 0-2 appear twice.
# reset_index renumbers the rows; drop=True discards the old labels
# instead of keeping them as an extra column.
print("")
print(stacked.reset_index(drop=True))

# With axis="columns" (axis=1) the frames are placed side by side, aligned on the row labels.
print("")
print(pd.concat([df1, df2], axis=1))
# df2 has no row 3, so its columns are filled with NaN on that row.


      A     B     C     D
0   0.0   1.0   2.0   3.0
1   4.0   5.0   6.0   7.0
2   8.0   9.0  10.0  11.0
3  12.0  13.0  14.0  15.0
0   0.0   0.0   0.0   0.0
1   0.0   0.0   0.0   0.0
2   0.0   0.0   0.0   0.0

      A     B     C     D
0   0.0   1.0   2.0   3.0
1   4.0   5.0   6.0   7.0
2   8.0   9.0  10.0  11.0
3  12.0  13.0  14.0  15.0
4   0.0   0.0   0.0   0.0
5   0.0   0.0   0.0   0.0
6   0.0   0.0   0.0   0.0

    A   B   C   D    A    B    C    D
0   0   1   2   3  0.0  0.0  0.0  0.0
1   4   5   6   7  0.0  0.0  0.0  0.0
2   8   9  10  11  0.0  0.0  0.0  0.0
3  12  13  14  15  NaN  NaN  NaN  NaN


In [17]:
# join= chooses the set logic applied to the labels of the other axis (here: the row labels).
# The default "outer" used above keeps the union of the labels and fills gaps with NaN.
# "inner" keeps only the labels present in both frames, so row 3 (only in df1) is dropped.
pd.concat(objs=[df1, df2], axis=1, join="inner")


Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,0,1,2,3,0.0,0.0,0.0,0.0
1,4,5,6,7,0.0,0.0,0.0,0.0
2,8,9,10,11,0.0,0.0,0.0,0.0


## Merge

- Combines two DataFrames on one or more key columns (database-style join).

See "pandas merge join concat":

<https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>



In [22]:
# Example frames from the pandas documentation.
# Note that `left` contains the key K0 twice and has no K1, while `right` has each key once.

left = pd.DataFrame(
    dict(
        key=["K0", "K0", "K2", "K3"],
        A=["A0", "A1", "A2", "A3"],
        B=["B0", "B1", "B2", "B3"],
    )
)

right = pd.DataFrame(
    dict(
        key=["K0", "K1", "K2", "K3"],
        C=["C0", "C1", "C2", "C3"],
        D=["D0", "D1", "D2", "D3"],
    )
)

print("left:", left, sep="\n")
print("\nright:", right, sep="\n")


left:
  key   A   B
0  K0  A0  B0
1  K0  A1  B1
2  K2  A2  B2
3  K3  A3  B3

right:
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3


In [24]:
# The default how="inner" (spelled out here) keeps only keys found in both frames.
# K1 exists only in `right`, so its row (C1, D1) is dropped.
# K0 appears twice in `left`, so right's K0 row (C0, D0) is matched to both of them.
# indicator=True adds a _merge column telling which frame(s) each row came from.
pd.merge(left, right, how="inner", on="key", indicator=True)

Unnamed: 0,key,A,B,C,D,_merge
0,K0,A0,B0,C0,D0,both
1,K0,A1,B1,C0,D0,both
2,K2,A2,B2,C2,D2,both
3,K3,A3,B3,C3,D3,both


In [25]:
# how="outer" keeps every key that occurs in either frame.
# K1 exists only in `right`: it gets a row with missing values for A and B,
# and _merge labels it right_only.
pd.merge(
    left,
    right,
    how="outer",
    on="key",
    indicator=True,
)

Unnamed: 0,key,A,B,C,D,_merge
0,K0,A0,B0,C0,D0,both
1,K0,A1,B1,C0,D0,both
2,K2,A2,B2,C2,D2,both
3,K3,A3,B3,C3,D3,both
4,K1,,,C1,D1,right_only
