# 数据的转换
使用stack and unstack

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.DataFrame(np.arange(6).reshape(2,3), index = ['xiaoming','xiaoli'], columns = ['one','two','three'] )

In [3]:
data

Unnamed: 0,one,two,three
xiaoming,0,1,2
xiaoli,3,4,5


In [4]:
# 多级的索引
data.unstack()

one    xiaoming    0
       xiaoli      3
two    xiaoming    1
       xiaoli      4
three  xiaoming    2
       xiaoli      5
dtype: int32

In [5]:
data.stack()

xiaoming  one      0
          two      1
          three    2
xiaoli    one      3
          two      4
          three    5
dtype: int32

# 移除重复数据
1. deplicated 找出重复的数据
2. 丢掉重复的数据 drop_deplicated
3. 替换操作


In [6]:
data = pd.DataFrame({ 'k1': ['one']*3 +['two']*2,
                      'k2':[1,1,2,3,4]    
})

In [7]:
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,4


In [8]:
data.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [9]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
4,two,4


In [10]:
# 根据哪些列去掉重复项
data.drop_duplicates('k1')

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [11]:
# 替换一个数
data.replace(2, np.nan)

Unnamed: 0,k1,k2
0,one,1.0
1,one,1.0
2,one,
3,two,3.0
4,two,4.0


In [12]:
# 替换几个数字，以列表的形式传入 []
data.replace([1,3], np.nan)

Unnamed: 0,k1,k2
0,one,
1,one,
2,one,2.0
3,two,
4,two,4.0


# 字符串操作
1. data.k2.str.replace() 只能对单个列操作
2. 是否包含 contains
3. 变为大写 upper
4. 分割方法 split
5. 

In [13]:
data = pd.DataFrame({ 'k1':['beijing,haidian,chushou','beijing,chaoyang,chushou','beijing,feitai,chuzu'],
                     'k2':['beijing,xicheng,chuzu','beijing,shijingshan,chushou',np.nan]
})

In [14]:
data

Unnamed: 0,k1,k2
0,"beijing,haidian,chushou","beijing,xicheng,chuzu"
1,"beijing,chaoyang,chushou","beijing,shijingshan,chushou"
2,"beijing,feitai,chuzu",


In [15]:
# map方法替换
data.k1.str.replace('beijing','shanghai')

0     shanghai,haidian,chushou
1    shanghai,chaoyang,chushou
2        shanghai,feitai,chuzu
Name: k1, dtype: object

In [16]:
data.k1.str.contains('chushou')

0     True
1     True
2    False
Name: k1, dtype: bool

In [17]:
data.k1.str.upper()

0     BEIJING,HAIDIAN,CHUSHOU
1    BEIJING,CHAOYANG,CHUSHOU
2        BEIJING,FEITAI,CHUZU
Name: k1, dtype: object

In [18]:
data.k1.str.split(',')
#一个元素是一个列表

0     [beijing, haidian, chushou]
1    [beijing, chaoyang, chushou]
2        [beijing, feitai, chuzu]
Name: k1, dtype: object

In [19]:
data.k1

0     beijing,haidian,chushou
1    beijing,chaoyang,chushou
2        beijing,feitai,chuzu
Name: k1, dtype: object

# 数据离散化

In [20]:
data = pd.DataFrame(np.random.randint(1,50,(20,2)), columns = ['k1','k2'])

In [21]:
data

Unnamed: 0,k1,k2
0,7,21
1,16,13
2,28,3
3,13,15
4,5,44
5,35,42
6,17,40
7,23,9
8,37,44
9,16,12


In [22]:
bins = [1,10,20,30,40,50]

In [23]:
cats = pd.cut(data.k1,bins)

In [24]:
cats

0      (1, 10]
1     (10, 20]
2     (20, 30]
3     (10, 20]
4      (1, 10]
5     (30, 40]
6     (10, 20]
7     (20, 30]
8     (30, 40]
9     (10, 20]
10     (1, 10]
11    (40, 50]
12    (30, 40]
13    (20, 30]
14    (30, 40]
15    (20, 30]
16    (30, 40]
17    (40, 50]
18     (1, 10]
19    (10, 20]
Name: k1, dtype: category
Categories (5, interval[int64]): [(1, 10] < (10, 20] < (20, 30] < (30, 40] < (40, 50]]

In [25]:
cats.value_counts()

(30, 40]    5
(10, 20]    5
(20, 30]    4
(1, 10]     4
(40, 50]    2
Name: k1, dtype: int64

In [27]:
# 根据数据范围分成四等分
cats = pd.cut(data.k1, 4)

In [28]:
cats

0     (4.958, 15.5]
1      (15.5, 26.0]
2      (26.0, 36.5]
3     (4.958, 15.5]
4     (4.958, 15.5]
5      (26.0, 36.5]
6      (15.5, 26.0]
7      (15.5, 26.0]
8      (36.5, 47.0]
9      (15.5, 26.0]
10    (4.958, 15.5]
11     (36.5, 47.0]
12     (26.0, 36.5]
13     (26.0, 36.5]
14     (26.0, 36.5]
15     (15.5, 26.0]
16     (26.0, 36.5]
17     (36.5, 47.0]
18    (4.958, 15.5]
19    (4.958, 15.5]
Name: k1, dtype: category
Categories (4, interval[float64]): [(4.958, 15.5] < (15.5, 26.0] < (26.0, 36.5] < (36.5, 47.0]]

In [29]:
cats.value_counts()

(26.0, 36.5]     6
(4.958, 15.5]    6
(15.5, 26.0]     5
(36.5, 47.0]     3
Name: k1, dtype: int64

In [30]:
# 按照数据的个数分成4等份
cats = pd.qcut(data.k1,4)

In [31]:
cats

0     (4.999, 13.0]
1      (13.0, 24.0]
2      (24.0, 34.0]
3     (4.999, 13.0]
4     (4.999, 13.0]
5      (34.0, 47.0]
6      (13.0, 24.0]
7      (13.0, 24.0]
8      (34.0, 47.0]
9      (13.0, 24.0]
10    (4.999, 13.0]
11     (34.0, 47.0]
12     (24.0, 34.0]
13     (24.0, 34.0]
14     (24.0, 34.0]
15     (24.0, 34.0]
16     (24.0, 34.0]
17     (34.0, 47.0]
18    (4.999, 13.0]
19    (4.999, 13.0]
Name: k1, dtype: category
Categories (4, interval[float64]): [(4.999, 13.0] < (13.0, 24.0] < (24.0, 34.0] < (34.0, 47.0]]

In [32]:
cats.value_counts()

(24.0, 34.0]     6
(4.999, 13.0]    6
(34.0, 47.0]     4
(13.0, 24.0]     4
Name: k1, dtype: int64