## Concat 和 Append

### 回顾：numpy的矩阵连接（Concatenation）

In [1]:
import numpy as np

x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [2]:
x = [[1, 2], [3, 4]]
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

### Pandas的数据连接

In [4]:
import pandas as pd

ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [5]:
def make_df(cols, ind):
    """Build a small demo DataFrame for the concat examples.

    Every cell is the column label followed by the row label, both
    converted with ``str``; e.g. ``make_df('AB', [1, 2])`` holds ``'A1'``
    at row ``1`` / column ``'A'``.

    Parameters
    ----------
    cols : iterable
        Column labels (a string is iterated character by character).
    ind : iterable
        Row labels, also used as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
    """
    columns = {}
    for col in cols:
        prefix = str(col)
        columns[col] = [prefix + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [6]:
df1 = make_df('AB', [1, 2])
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [7]:
df2 = make_df('AB', [3, 4])
df2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4


In [8]:
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [10]:
df3 = make_df('AB', [0, 1])
df3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [11]:
df4 = make_df('CD', [0, 1])
df4

Unnamed: 0,C,D
0,C0,D0
1,C1,D1


In [13]:
pd.concat([df3, df4], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


#### 处理相同索引

In [14]:
df_x = make_df('AB', [0, 1])
df_x

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [17]:
df_y = make_df('AB', [0, 1])
df_y

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [19]:
pd.concat([df_x, df_y]) # by default, overlapping index labels are kept and no error is raised

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A0,B0
1,A1,B1


##### 让其报错

In [20]:
# verify_integrity=True makes concat raise when the inputs share index labels,
# instead of silently keeping the duplicates.
try:
    pd.concat([df_x, df_y], verify_integrity=True)
except ValueError as err:
    print("ValueError:", err)

ValueError: Indexes have overlapping values: [0, 1]


##### 忽略相同索引

In [21]:
pd.concat([df_x, df_y], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


##### 制作多层索引

In [23]:
pd.concat([df_x, df_y], keys=['x', 'y'])

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A0,B0
y,1,A1,B1


#### join

In [24]:
df5 = make_df('ABC', [1, 2])
df5

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2


In [25]:
df6 = make_df('BCD', [3, 4])
df6

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4


In [26]:
pd.concat([df5, df6]) # the default is an outer join (union of the columns)

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [27]:
pd.concat([df5, df6], join='inner') # inner join

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [28]:
# The join_axes argument was deprecated in pandas 0.25 and removed in 1.0.
# Reindexing the concatenated result to df5's columns gives the same table:
# only columns A, B, C are kept, with NaN where df6 has no value.
pd.concat([df5, df6]).reindex(columns=df5.columns)

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


#### ``append()``

In [29]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# df1.append(df2) was shorthand for the call below, which works on every version.
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


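注意：和 Python 中 list 的 append 方法不同，pandas 中的 append 不会修改原有的数据，而是返回一个包含合并结果的新对象。另外，`DataFrame.append` 自 pandas 1.4 起已被弃用，并在 pandas 2.0 中移除；在新版本中请使用 `pd.concat`。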