In [3]:
import numpy as np
import pandas as pd

##### 处理缺失数据

In [22]:
# Five-element Series in which positions 1 and 3 are missing (NaN).
data1 = pd.Series([1, np.nan, 3.5, np.nan, 7])

# Show the Series with the NaN entries removed; the surviving rows keep
# their original index labels.
data1.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

##### 数据转化（duplicated，map，replace）

In [2]:
# Seven-row frame for the de-duplication examples; the last two rows
# ('two', 4) are identical to each other.
data2 = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data2

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [None]:
# Boolean Series flagging every row that repeats an earlier row
# (the first occurrence is the one that is not flagged).
data2.duplicated(keep='first')

In [None]:
# Keep only the first row for each distinct value of 'k1'.
# (Calling drop_duplicates() with no arguments would compare whole rows instead.)
data2.drop_duplicates(subset=['k1'])

In [3]:
# Sample frame for the map() examples. The food names deliberately mix
# upper and lower case ('bacon' / 'Bacon', 'Pastrami' / 'pastrami').
data3 = pd.DataFrame(
    {
        'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
                 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
# Display the frame built in this cell.
data3

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [5]:
# Lookup table: kind of meat -> animal it comes from.
# Every key is lowercase, so names must be lower-cased before the lookup.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [None]:
# map, approach 1 (step 1): normalise the case of the food names, because
# the keys of meat_to_animal are all lowercase.
food_names = data3['food']
lowercased = food_names.str.lower()
lowercased

In [None]:
# map, approach 1 (step 2): the right-hand side is evaluated first, then the
# result is stored as the new 'animal' column.
animal_by_row = lowercased.map(meat_to_animal)
data3['animal'] = animal_by_row
data3

In [6]:
# map, approach 2: lower-case and look up each food name inside one function.
# NOTE: a food name missing from meat_to_animal raises KeyError here.
data3['animal'] = data3['food'].map(lambda food: meat_to_animal[food.lower()])
data3

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [10]:
# Float Series containing the values -999 and -1000, which the following
# cells replace with NaN.
data4 = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data4

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [16]:
# Replace one value: every -999 becomes NaN (a new Series is shown).
data4.replace(to_replace=-999, value=np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [18]:
# Replace several values at once: both -999 and -1000 become NaN.
data4.replace(to_replace=[-999, -1000], value=np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

##### 重命名轴索引

In [24]:
# 3x4 frame holding the integers 0..11, with string labels on both axes,
# used by the axis-renaming examples.
data5 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data5

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [25]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    return x[:4].upper()


# Preview the transformed row labels; data5 is not modified by this cell.
data5.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [27]:
# Overwrite the row labels of data5 with their transformed versions
# (this changes data5 itself).
upper_labels = data5.index.map(transform)
data5.index = upper_labels
data5

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [28]:
# rename() returns a new frame on each call; neither call below modifies data5.

# Function form: apply str.title to every row label and str.upper to every
# column label.
data5_title_case = data5.rename(index=str.title, columns=str.upper)

# Mapping form: relabel only the listed entries and leave the rest as they are.
data5_relabelled = data5.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})

# Only the last expression of a cell is rendered automatically, so the first
# result is shown explicitly instead of being silently discarded.
display(data5_title_case)
data5_relabelled

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


##### 离散化和面元划分

In [29]:
# Ages to be grouped, and the bin edges that define four intervals:
# (18, 25], (25, 35], (35, 60], (60, 100].
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41]
bins = [18, 25, 35, 60, 100]

# Assign every age to the interval it falls into.
cats = pd.cut(ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (25, 35], (60, 100], (35, 60], (35, 60]]
Length: 11
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [4]:
# 5x4 frame holding the integers 0..19 with default integer labels,
# used by the permutation / sampling examples.
df = pd.DataFrame(np.arange(20).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [5]:
# Use a seeded generator so the shuffled order is identical on every
# Restart & Run All; an unseeded permutation changes from run to run.
rng = np.random.default_rng(42)

# Random ordering of the integers 0..4.
sampler = rng.permutation(5)
sampler

array([3, 0, 4, 2, 1])

In [7]:
# Select the rows of df by integer position, in the order given by sampler.
df.take(sampler, axis=0)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
1,4,5,6,7


In [8]:
# Draw three rows at random (without replacement). random_state pins the
# draw so the same rows are selected on every run of the notebook.
df.sample(n=3, random_state=42)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7
