1.数据预处理包括数据的清洗、合并、重塑和转换。

### 一、数据清洗

In [79]:
import pandas as pd
import numpy as np

#### 1、空值和缺失值的处理

一般空值使用 None 表示，缺失值使用 NaN 表示；pandas 会把二者都视为缺失值。

In [6]:
# isnull()函数:用来检查缺失值和空值，如果是返回True 否则返回False
ser1 = pd.Series([1,None,np.nan])

In [7]:
ser1

0    1.0
1    NaN
2    NaN
dtype: float64

In [8]:
pd.isnull(ser1)

0    False
1     True
2     True
dtype: bool

In [9]:
# notnull()函数:用来检查缺失值和空值，如果是返回False 否则返回True
ser1 = pd.Series([1,None,np.nan])

In [10]:
pd.notnull(ser1)

0     True
1    False
2    False
dtype: bool

In [11]:
# dropna()函数：作用是删除含有空值或缺失值的行或列。

In [12]:
df1 = pd.DataFrame({"类别":["小说", "散文", "传记", "青春文学"],
                   "书名":["《A2》", np.nan, "《B》", "《C》"],
                   "作者":[None, "张三","王武", "里斯"]})
df1 

Unnamed: 0,类别,书名,作者
0,小说,《A2》,
1,散文,,张三
2,传记,《B》,王武
3,青春文学,《C》,里斯


In [13]:
df1.dropna()

Unnamed: 0,类别,书名,作者
2,传记,《B》,王武
3,青春文学,《C》,里斯


In [14]:
df1

Unnamed: 0,类别,书名,作者
0,小说,《A2》,
1,散文,,张三
2,传记,《B》,王武
3,青春文学,《C》,里斯


In [15]:
# fillna()函数：填充空值或缺失值
df2 = pd.DataFrame({"A":[1,2,3,np.nan],
                   "B":[np.nan,4,np.nan,6],
                   "C":['c',7,8,9],
                   "D":[None,2,3,np.nan]})
df2

Unnamed: 0,A,B,C,D
0,1.0,,c,
1,2.0,4.0,7,2.0
2,3.0,,8,3.0
3,,6.0,9,


In [16]:
df2.fillna('0')

Unnamed: 0,A,B,C,D
0,1,0,c,0
1,2,4,7,2
2,3,0,8,3
3,0,6,9,0


In [17]:
df2.fillna({"A":4,"B":5,"D":1})

Unnamed: 0,A,B,C,D
0,1.0,5.0,c,1.0
1,2.0,4.0,7,2.0
2,3.0,5.0,8,3.0
3,4.0,6.0,9,1.0


In [18]:
df2.fillna(method="ffill")

Unnamed: 0,A,B,C,D
0,1.0,,c,
1,2.0,4.0,7,2.0
2,3.0,4.0,8,3.0
3,3.0,6.0,9,3.0


In [20]:
df2.fillna(method="bfill").fillna(method="ffill")

Unnamed: 0,A,B,C,D
0,1.0,4.0,c,2.0
1,2.0,4.0,7,2.0
2,3.0,6.0,8,3.0
3,3.0,6.0,9,3.0


#### 2、重复值处理

duplicated()函数:判断是否有重复值
drop_duplicates()函数:用于删除重复值

In [24]:
df3 = pd.DataFrame({"id":[1,2,4,3,4],
                   "name":["张三","李四","王五","赵六","王五"],
                   "sex":["男","男","女","男","女"]})
df3

Unnamed: 0,id,name,sex
0,1,张三,男
1,2,李四,男
2,4,王五,女
3,3,赵六,男
4,4,王五,女


In [25]:
# 默认从前向后查重
df3.duplicated()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [26]:
# 指定从后向前查找重复值
df3.duplicated(keep="last")

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [27]:
# 所有相同的重复值都被标记
df3.duplicated(keep=False)

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [28]:
df3.drop_duplicates()

Unnamed: 0,id,name,sex
0,1,张三,男
1,2,李四,男
2,4,王五,女
3,3,赵六,男


In [29]:
df3.drop_duplicates(keep="last")

Unnamed: 0,id,name,sex
0,1,张三,男
1,2,李四,男
3,3,赵六,男
4,4,王五,女


### 二、更改数据类型

#### 1、明确指定数据的类型

In [33]:
df1 = pd.DataFrame({'A':['1','2','3'],'B':['4','5','6']})
df1

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [35]:
# 查看数据类型
df1.dtypes

A    object
B    object
dtype: object

In [39]:
df2 = pd.DataFrame({'A':['1','2','3'],'B':['4','5','6']},dtype='int')
df2

  return bool(asarray(a1 == a2).all())


Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [41]:
df2.dtypes

A    int32
B    int32
dtype: object

#### 2、通过astype()函数强制转换数据的类型

In [46]:
df3 = df1.astype(dtype='int')
df3

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [45]:
df3.dtypes

A    int32
B    int32
dtype: object

In [47]:
# astype 函数存在一些局限性，只要待转换的数据中存在数字以外的字符，转换就会出现错误。
df4 = pd.DataFrame({'A':['a','b','3'],'B':['4','5','6']},dtype='int')
# df5 = df4.astype(dtype='int')

  return bool(asarray(a1 == a2).all())


#### 3、通过 to_numeric()函数强制转换数据类型

注意：to_numeric()函数不能直接操作 DataFrame 对象，只能处理一维数据（如 Series、列表）。

In [52]:
df4 = pd.Series(['1','a','2','c'])
df4

0    1
1    a
2    2
3    c
dtype: object

In [53]:
# 通过 errors 参数来忽略一些错误，比如数据中存在数字以外的字符（errors="ignore" 时转换失败会原样返回输入）。注意：pandas 2.2 起 errors="ignore" 已被弃用，建议改用 errors="coerce" 或显式捕获异常。
df5 = pd.to_numeric(df4, errors="ignore")
df5

0    1
1    a
2    2
3    c
dtype: object

In [54]:
df6 = pd.Series(['1','4.2','3.8'])
df6

0      1
1    4.2
2    3.8
dtype: object

In [55]:
# 将包含数字的字符串类型转换为浮点型
df7 = pd.to_numeric(df6, errors="ignore")
df7

0    1.0
1    4.2
2    3.8
dtype: float64

### 数据合并

#### 一、轴向堆叠数据

理解NumPy中维度、轴、秩（rank）：

1、维度称为轴。

2、秩指的是轴的数量，或者维度的数量。

In [56]:
# 一维数组[1, 2, 3]
arr1 = np.array([1,2,3])
arr1

array([1, 2, 3])

In [57]:
# 数组的轴的数量，或者维度的数量
arr1.ndim

1

In [58]:
# 数组的形状（每个轴上的元素个数）
arr1.shape

(3,)

In [59]:
# 二维数组[[1,2,3],[4,5,6]]
arr2 = np.array([[1,2,3],[4,5,6]])
arr2

array([[1, 2, 3],
       [4, 5, 6]])

In [60]:
arr2.ndim

2

In [61]:
arr2.shape

(2, 3)

In [62]:
# 三维数组[[[1,2,3],[4,5,6]],[[11,12,13],[14,15,16]]]
arr3 = np.array([[[1,2,3],[4,5,6]],[[11,12,13],[14,15,16]]])
arr3

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[11, 12, 13],
        [14, 15, 16]]])

In [64]:
arr3.ndim

3

In [65]:
arr3.shape

(2, 2, 3)

In [67]:
# 计算 axis 0 (轴0) 的和
arr3.sum(axis=0)

array([[12, 14, 16],
       [18, 20, 22]])

In [68]:
arr3.sum(axis=1)

array([[ 5,  7,  9],
       [25, 27, 29]])

In [69]:
arr3.sum(axis=2)

array([[ 6, 15],
       [36, 45]])

In [71]:
# 为便于理解，把三维数组看作由一维数组 A、B、C、D 组成的 2×2 结构（new_arr3 本身仍是三维数组）
A = np.array([1,2,3])
B = np.array([4,5,6])
C = np.array([11,12,13])
D = np.array([14,15,16])
new_arr3 = np.array([[A,B],[C,D]])
new_arr3

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[11, 12, 13],
        [14, 15, 16]]])

In [72]:
# 计算 轴0 的和： A+C, B+D
new_arr3.sum(axis=0)

array([[12, 14, 16],
       [18, 20, 22]])

In [73]:
A+C

array([12, 14, 16])

In [74]:
B+D

array([18, 20, 22])

##### 1、横向堆叠与外连接

In [76]:
df1 = pd.DataFrame({"A":['a0','a1','a2'],
                    "B":['b0','b1','b2'],
                   })
df1

Unnamed: 0,A,B
0,a0,b0
1,a1,b1
2,a2,b2


In [84]:
df2 = pd.DataFrame({"C":['c0','c1','c2','c3'],
                    "D":['d0','d1','d2','d3'],
                   })
df2

Unnamed: 0,C,D
0,c0,d0
1,c1,d1
2,c2,d2
3,c3,d3


In [86]:
# 横向堆叠合并，采用外连接
pd.concat([df1,df2], axis=1, join="outer")

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,,,c3,d3


##### 2、纵向堆叠与内连接

In [87]:
df1 = pd.DataFrame({"A":['a0','a1','a2'],
                    "B":['b0','b1','b2'],
                    "C":['c0','c1','c2']
                   })
df1

Unnamed: 0,A,B,C
0,a0,b0,c0
1,a1,b1,c1
2,a2,b2,c2


In [88]:
df2 = pd.DataFrame({"B":['b3','b4','b5'],
                    "C":['c3','c4','c5'],
                    "D":['d0','d1','d2'],
                   })
df2

Unnamed: 0,B,C,D
0,b3,c3,d0
1,b4,c4,d1
2,b5,c5,d2


In [90]:
pd.concat([df1,df2],axis=0,join='inner')

Unnamed: 0,B,C
0,b0,c0
1,b1,c1
2,b2,c2
0,b3,c3
1,b4,c4
2,b5,c5


#### 二、主键合并数据

1.合并单个重叠列

In [91]:
left = pd.DataFrame({"A":['a0','a1','a2'],
                    "B":['b0','b1','b2'],
                    "key":['k0','k1','k2']})
left

Unnamed: 0,A,B,key
0,a0,b0,k0
1,a1,b1,k1
2,a2,b2,k2


In [92]:
right = pd.DataFrame({"C":['c0','c1','c2','c3'],
                     "D":['d0','d1','d2','d3'],
                     "key":['k0','k1','k2','k3']})
right

Unnamed: 0,C,D,key
0,c0,d0,k0
1,c1,d1,k1
2,c2,d2,k2
3,c3,d3,k3


In [93]:
pd.merge(left,right,on='key')

Unnamed: 0,A,B,key,C,D
0,a0,b0,k0,c0,d0
1,a1,b1,k1,c1,d1
2,a2,b2,k2,c2,d2


In [94]:
# 外连接
pd.merge(left,right,on='key',how='outer')

Unnamed: 0,A,B,key,C,D
0,a0,b0,k0,c0,d0
1,a1,b1,k1,c1,d1
2,a2,b2,k2,c2,d2
3,,,k3,c3,d3


2.合并多个重叠列

In [95]:
left = pd.DataFrame({"A":['a0','a1','a2'],
                    "B":['b0','b1','b2'],
                    "key":['k0','k1','k2']})
left

Unnamed: 0,A,B,key
0,a0,b0,k0
1,a1,b1,k1
2,a2,b2,k2


In [98]:
right = pd.DataFrame({"A":['a0','a1','a2','a3'],
                     "C":['c0','c1','c2','c3'],  
                     "D":['d0','d1','d2','d3'],
                     "key":['k0','k1','k2','k3']})
right

Unnamed: 0,A,C,D,key
0,a0,c0,d0,k0
1,a1,c1,d1,k1
2,a2,c2,d2,k2
3,a3,c3,d3,k3


In [102]:
# 默认内连接 合并 A，key 两列
pd.merge(left,right,on=['A','key'])

Unnamed: 0,A,B,key,C,D
0,a0,b0,k0,c0,d0
1,a1,b1,k1,c1,d1
2,a2,b2,k2,c2,d2


3、左连接和右连接

In [104]:
# 左连接：以左表为基准进行连接
pd.merge(left,right,on=['A','key'], how='left')


Unnamed: 0,A,B,key,C,D
0,a0,b0,k0,c0,d0
1,a1,b1,k1,c1,d1
2,a2,b2,k2,c2,d2


In [105]:
# 右连接：以右表为基准进行连接
pd.merge(left,right,on=['A','key'], how='right')

Unnamed: 0,A,B,key,C,D
0,a0,b0,k0,c0,d0
1,a1,b1,k1,c1,d1
2,a2,b2,k2,c2,d2
3,a3,,k3,c3,d3


4、行索引合并

In [106]:
left = pd.DataFrame({"A":['a0','a1','a2'],
                    "B":['b0','b1','b2'],
                    })
left

Unnamed: 0,A,B
0,a0,b0
1,a1,b1
2,a2,b2


In [107]:
right = pd.DataFrame({"C":['c0','c1','c2'],
                    "D":['d0','d1','d2'],
                    })
right

Unnamed: 0,C,D
0,c0,d0
1,c1,d1
2,c2,d2


In [109]:
pd.merge(left,right,how='outer',left_index=True,right_index=True)

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2


#### 三、根据行索引合并数据

1、行索引和列索引没有重叠

In [116]:
left = pd.DataFrame({"A":['a0','a1'],
                    "B":['b0','b1'],},
                   index=['a','b'])
left

Unnamed: 0,A,B
a,a0,b0
b,a1,b1


In [117]:
right = pd.DataFrame({"C":['c0','c1'],
                    "D":['d0','d1'],},
                    index=['c','d'])
right

Unnamed: 0,C,D
c,c0,d0
d,c1,d1


In [121]:
left.join(right, how='outer')

Unnamed: 0,A,B,C,D
a,a0,b0,,
b,a1,b1,,
c,,,c0,d0
d,,,c1,d1


 2、假设两个表行索引和列索引重叠

In [124]:
left = pd.DataFrame({"A":['a0','a1','a2'],
                    "B":['b0','b1','b2'],
                    "key":['k0','k1','k2']}
                   )
left

Unnamed: 0,A,B,key
0,a0,b0,k0
1,a1,b1,k1
2,a2,b2,k2


In [125]:
right = pd.DataFrame({"C":['c0','c1','c2'],
                    "D":['d0','d1','d2']},
                     index=['k0','k1','k2']
                   )
right

Unnamed: 0,C,D
k0,c0,d0
k1,c1,d1
k2,c2,d2


In [126]:
# on 指定 left 中用于连接的列名，该列的值与 right 的行索引进行匹配
left.join(right,on='key', how='left')

Unnamed: 0,A,B,key,C,D
0,a0,b0,k0,c0,d0
1,a1,b1,k1,c1,d1
2,a2,b2,k2,c2,d2


#### 四、合并重叠数据

注意：使用 combine_first 函数合并两个DataFrame对象时，必须确保它们的行索引和列索引有重叠的部分。

In [127]:
left = pd.DataFrame({'A':[np.nan, 'a1', 'a2', 'a3'],
                    'B':[np.nan, 'b1', np.nan, 'b3'],
                    'key':['k0', 'k1', 'k2', 'k3']})
left

Unnamed: 0,A,B,key
0,,,k0
1,a1,b1,k1
2,a2,,k2
3,a3,b3,k3


In [129]:
right = pd.DataFrame({'A':['c0', 'c1', 'c2'],
                    'B':['d0', 'd1', 'd2']},
                    index=[1, 0, 2])
right

Unnamed: 0,A,B
1,c0,d0
0,c1,d1
2,c2,d2


In [132]:
# 用right的数据填充left缺失的部分
left.combine_first(right)

Unnamed: 0,A,B,key
0,c1,d1,k0
1,a1,b1,k1
2,a2,d2,k2
3,a3,b3,k3
