# Pandas实用示例

1. 如何导入Pandas以及确认版本？

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
print(pd.__version__)
print(pd.show_versions(as_json=True))

0.23.4
{'system': {'commit': None, 'python': '3.6.7.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 158 Stepping 10, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.23.4', 'pytest': '3.8.0', 'pip': '19.3.1', 'setuptools': '39.0.1', 'Cython': '0.28.5', 'numpy': '1.16.2', 'scipy': '1.0.0', 'pyarrow': '0.12.1', 'xarray': None, 'IPython': '7.6.1', 'sphinx': '1.8.3', 'patsy': None, 'dateutil': '2.7.3', 'pytz': '2017.3', 'blosc': None, 'bottleneck': '1.2.1', 'tables': None, 'numexpr': '2.6.9', 'feather': None, 'matplotlib': '2.2.3', 'openpyxl': '2.5.9', 'xlrd': '1.2.0', 'xlwt': None, 'xlsxwriter': None, 'lxml': '4.2.5', 'bs4': '4.6.3', 'html5lib': '1.0.1', 'sqlalchemy': '0.7.10', 'pymysql': '0.9.3', 'psycopg2': None, 'jinja2': '2.10', 's3fs': None, 'fastparquet': None, 'pandas_gbq': None, 'pandas_datareader': None}}
None


2. 如何从list，Numpy Array以及字典创建Series?

In [2]:
# Inputs
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# Solution
ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)
print(ser1.head())
print(ser2.head())
print(ser3.head())

0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64


3. 如何将Series的索引转换为Dataframe的一列数据?

In [3]:
# Input
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
# 通过reset_index方法，可以将原有的索引备份为dataframe的一列
df = ser.to_frame().reset_index()
display(df.head())
df.columns

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


Index(['index', 0], dtype='object')

4. 如何将很多Series合并为一个dataframe？

In [6]:
# Input
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Solution 1
df = pd.concat([ser1, ser2], axis=1)
display(df)
# Solution 2
df = pd.DataFrame({'col1': ser1, 'col2': ser2})
display(df.head())

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


In [6]:
list('abcedfghijklmnopqrstuvwxyz')

['a',
 'b',
 'c',
 'e',
 'd',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

5. 如何设置Series索引的名字？

In [7]:
# Input
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# Solution
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

6. 如何从series A获取不存在于series B的items？

In [8]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution 1
display(ser1[~ser1.isin(ser2)])

# Solution 2
display(np.setdiff1d(ser1, ser2))

# display(np.intersect1d(ser1, ser2))

0    1
1    2
2    3
dtype: int64

array([1, 2, 3], dtype=int64)

array([4, 5], dtype=int64)

7. 如何从series A与series B获取差异项？

In [9]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser_u = pd.Series(np.union1d(ser1, ser2))  # union
display(ser_u)
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect
display(ser_i)
display(ser_u[~ser_u.isin(ser_i)])

print(len(ser_i)/len(ser_u))

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

0    4
1    5
dtype: int64

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

0.25


8. 如何从一个数字Series获取minium, 25%, median (50%，即中位数), 75%以及max的项？

高斯分布的概率密度函数如下：

<img src='./image/gause.png' /> <br>

Numpy中可以通过如下方式获取符合正态分布的随机数：```numpy.random.normal(loc=0.0, scale=1.0, size=None)```, 参数的意义为：<br>
loc:float
概率分布的均值，对应着整个分布的中心center

scale:float
概率分布的标准差，对应于分布的宽度，scale越大越矮胖，scale越小，越瘦高

size:int or tuple of ints
输出的shape，默认为None，只输出一个值

我们更经常会用到np.random.randn(size)所谓标准正态分布（μ=0, σ=1），对应于np.random.normal(loc=0, scale=1, size)

In [11]:
# Input
np.random.seed(100)
ser = pd.Series(np.random.normal(0, 1, 25))
display(ser)
# Solution
np.percentile(ser, q=[0, 25, 50, 75, 100])

0    -1.749765
1     0.342680
2     1.153036
3    -0.252436
4     0.981321
5     0.514219
6     0.221180
7    -1.070043
8    -0.189496
9     0.255001
10   -0.458027
11    0.435163
12   -0.583595
13    0.816847
14    0.672721
15   -0.104411
16   -0.531280
17    1.029733
18   -0.438136
19   -1.118318
20    1.618982
21    1.541605
22   -0.251879
23   -0.842436
24    0.184519
dtype: float64

array([-1.74976547, -0.45802699,  0.18451869,  0.67272081,  1.61898166])

9. 如何获得series中唯一项的出现次数？

In [12]:
# Input
np.random.seed(100)
randint = np.random.randint(8, size=30)
print(randint)
ser = pd.Series(np.take(list('abcdefgh'), randint))
print(ser)
# Solution
ser.value_counts()

[0 0 3 7 7 7 0 2 6 4 2 5 2 2 6 2 1 0 0 7 4 3 4 2 0 3 1 5 6 2]
0     a
1     a
2     d
3     h
4     h
5     h
6     a
7     c
8     g
9     e
10    c
11    f
12    c
13    c
14    g
15    c
16    b
17    a
18    a
19    h
20    e
21    d
22    e
23    c
24    a
25    d
26    b
27    f
28    g
29    c
dtype: object


c    7
a    6
h    4
g    3
e    3
d    3
f    2
b    2
dtype: int64

10. 如何仅仅保留前两位出现频率最高的项，并且替代其他的值为'Other'?

In [15]:
# Input
np.random.seed(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
print(ser)
# Solution
print("Top 2 Freq:")
top2 = ser.value_counts()[:2]
print(top2)
print(~ser.isin(ser.value_counts().index[:2]))
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

0     1
1     1
2     4
3     4
4     4
5     4
6     1
7     3
8     3
9     1
10    3
11    2
dtype: int32
Top 2 Freq:
4    4
1    4
dtype: int64
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7      True
8      True
9     False
10     True
11     True
dtype: bool


0         1
1         1
2         4
3         4
4         4
5         4
6         1
7     Other
8     Other
9         1
10    Other
11    Other
dtype: object

11. 如何将数字series从小到大分为10个相等数量的分组，并且将每一项替换为分组名称？

In [16]:
# Input
np.random.seed(100)
ser = pd.Series(np.random.random(20))
print(ser)

# Solution
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], 
        labels=['1st', '2nd', '3rd', '4th', '5th', 
                '6th', '7th', '8th', '9th', '10th'])

0     0.543405
1     0.278369
2     0.424518
3     0.844776
4     0.004719
5     0.121569
6     0.670749
7     0.825853
8     0.136707
9     0.575093
10    0.891322
11    0.209202
12    0.185328
13    0.108377
14    0.219697
15    0.978624
16    0.811683
17    0.171941
18    0.816225
19    0.274074
dtype: float64


0      6th
1      5th
2      6th
3      9th
4      1st
5      2nd
6      7th
7      9th
8      2nd
9      7th
10    10th
11     4th
12     3rd
13     1st
14     4th
15    10th
16     8th
17     3rd
18     8th
19     5th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

12. 如何将numpy array转换为给定shape的dataframe？

In [13]:
np.random.seed(100)
# Input
ser = pd.Series(np.random.randint(1, 10, 35))

# Solution
df = pd.DataFrame(ser.values.reshape(7,5))
display(df)

Unnamed: 0,0,1,2,3,4
0,9,9,4,8,8
1,1,5,3,6,3
2,3,3,2,1,9
3,5,1,7,3,5
4,2,6,4,5,5
5,4,8,2,2,8
6,8,1,3,4,3


13. 如何从series中找到公约数为3的项目的索引？

In [14]:
np.random.seed(99)
# Input
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# Solution
np.argwhere(ser % 3==0)

0    2
1    4
2    9
3    9
4    3
5    5
6    6
dtype: int32


array([[2],
       [3],
       [4],
       [6]], dtype=int64)

14. 如何从series获得给定位置的items?

In [15]:
# Input
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Solution 1
print(ser[pos])

# Solution 2
print(ser.take(pos))

0     a
4     e
8     i
14    o
20    u
dtype: object
0     a
4     e
8     i
14    o
20    u
dtype: object


15. 如何将两个series垂直与水平叠加?

In [16]:
# Input
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Output
# Vertical
# Solution 1
display(ser1.append(ser2))
# Solution 2
vt = pd.concat([ser1, ser2], axis=0)
display(vt)

# Horizontal
df = pd.concat([ser1, ser2], axis=1)
display(df)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


16. 如何获得series A与series B的共同项的索引（相对A来说）？

In [17]:
# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
print([np.where(i == ser1)[0].tolist()[0] for i in ser2])

print(pd.Index(ser1))
# Solution 2
print([pd.Index(ser1).get_loc(i) for i in ser2])

[5, 4, 0, 8]
Int64Index([10, 9, 6, 5, 3, 1, 12, 8, 13], dtype='int64')
[5, 4, 0, 8]


17. 如何对series计算均方差 (mean squared error)?

In [19]:
# Input
np.random.seed(100)
truth = pd.Series(range(10))
print(truth)
pred = pd.Series(range(10)) + np.random.random(10)
print(pred)
# Solution
print('均方差为: {0}'.format(np.mean((truth-pred)**2)))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
0    0.543405
1    1.278369
2    2.424518
3    3.844776
4    4.004719
5    5.121569
6    6.670749
7    7.825853
8    8.136707
9    9.575093
dtype: float64
均方差为: 0.2762799799636315


18. 如何将字符串series的每一个单词的第一个字母转换为大写字母？

In [20]:
# Input
ser = pd.Series(['how', 'to', 'convert', 'to', 'uppercase?'])

# Solution 1
print(ser.map(lambda x: x.title()))

# Solution 2
print(ser.map(lambda x: x[0].upper() + x[1:]))

# Solution 3
print(pd.Series([i.title() for i in ser]))

0           How
1            To
2       Convert
3            To
4    Uppercase?
dtype: object
0           How
1            To
2       Convert
3            To
4    Uppercase?
dtype: object
0           How
1            To
2       Convert
3            To
4    Uppercase?
dtype: object


19. 如何获得series中每个单词中的字符长度？

In [23]:
# Input
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution 1
print(ser.map(lambda x: len(x)))

# Solution 2
print(ser.apply(len))

0    3
1    2
2    4
3    4
dtype: int64
0    3
1    2
2    4
3    4
dtype: int64


20. 如何计算series中序列数的差异的差异？

In [83]:
# Input
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Solution
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


21. 如何将series中的日期字符串，替换为timeseries类型？

In [24]:
# Input
ser = pd.Series(['01 Jan 2010', 
                 'January 1, 2018', 
                 '2018-11-05', 
                 '02-02-2011', 
                 '20120303', 
                 '2013/04/04', 
                 '2014-05-05', 
                 '2015-06-06T12:20'])

# Solution 1
from dateutil.parser import parse
print(ser.map(lambda x: parse(x)))

# Solution 2
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2018-01-01 00:00:00
2   2018-11-05 00:00:00
3   2011-02-02 00:00:00
4   2012-03-03 00:00:00
5   2013-04-04 00:00:00
6   2014-05-05 00:00:00
7   2015-06-06 12:20:00
dtype: datetime64[ns]


0   2010-01-01 00:00:00
1   2018-01-01 00:00:00
2   2018-11-05 00:00:00
3   2011-02-02 00:00:00
4   2012-03-03 00:00:00
5   2013-04-04 00:00:00
6   2014-05-05 00:00:00
7   2015-06-06 12:20:00
dtype: datetime64[ns]

22. 如何从日期字符串series获得day of month, week number, day of year以及day of week?

In [28]:
# Input
ser = pd.Series(['1 Jan 2020', 
                 '02-02-2011', 
                 '20120303', 
                 '2013/04/04', 
                 '2014-05-05', 
                 '2015-06-06T12:20'])

# Solution
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))
print(ser_ts)
# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number
# 注意：因为2010-01-01是周五，所以计算为上一年的第53周
print("Week number: ", ser_ts.dt.weekofyear.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week
print("Day of week: ", ser_ts.dt.weekday_name.tolist())

0   2020-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]
Date:  [1, 2, 3, 4, 5, 6]
Week number:  [1, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Wednesday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


23. 如何将year-month字符串，转换为对应月的第四天的日期？

In [34]:
import pandas as pd
# Input
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012', 'Jan 2020'])

# Solution 1
from dateutil.parser import parse
# Parse the date
ser_ts = ser.map(lambda x: parse(x))
print(ser_ts)
# Construct date string with date as 4
ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '05'
print(ser_datestr)
# Format it.
print([parse(i).strftime('%Y-%m-%d') for i in ser_datestr])

# Solution 2
print(ser.map(lambda x: parse('04 ' + x)))

0   2010-01-04
1   2011-02-04
2   2012-03-04
3   2020-01-04
dtype: datetime64[ns]
0    2010-1-05
1    2011-2-05
2    2012-3-05
3    2020-1-05
dtype: object
['2010-01-05', '2011-02-05', '2012-03-05', '2020-01-05']
0   2010-01-04
1   2011-02-04
2   2012-03-04
3   2020-01-04
dtype: datetime64[ns]


24. 如何从一个字符串series获得至少含有两个元音字母的单词？

In [35]:
# Input
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) 
                              for i in list('aeiou')]) >= 2)
print(mask)
print(ser[mask])

0     True
1     True
2    False
3    False
4     True
dtype: bool
0     Apple
1    Orange
4     Money
dtype: object


25. 如何从邮件地址series获得合法的邮件地址？

In [36]:
# Input
emails = pd.Series(['buying books at amazom.com', 
                    'rameses@egypt.com', 
                    'matt@t.co', 
                    'narendra@modi.com'])

import re
pattern =r'[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}'

# Solution 1 (as series of strings)
mask = emails.map(lambda x: bool(re.match(pattern, x)))
print(mask)
print(emails[mask])

# Solution 2 (as series of list)
print(emails.str.findall(pattern, flags=re.IGNORECASE))

# Solution 3 (as list)
print([x[0] for x in [re.findall(pattern, email) 
                for email in emails] if len(x) > 0])

0    False
1     True
2     True
3     True
dtype: bool
1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object
0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object
['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']


26. 如何根据另一个series进行分组，并获得均值？

In [37]:
np.linspace(1, 10, 10)

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [226]:
# Input
np.random.seed(100)
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
print(fruit)
weights = pd.Series(np.linspace(1, 10, 10))
print(weights)
# Solution
# 这里分组的概念为：apple的items为0, 1, 2, 5; 
# banana的items为7, 
# carrot的items为3, 4, 6, 8, 9
# 求均值，即将对应items的weights的items求均值
print(weights.groupby(fruit).mean())

0     apple
1     apple
2     apple
3    carrot
4    carrot
5     apple
6    carrot
7    banana
8    carrot
9    carrot
dtype: object
0     1.0
1     2.0
2     3.0
3     4.0
4     5.0
5     6.0
6     7.0
7     8.0
8     9.0
9    10.0
dtype: float64
apple     3.0
banana    8.0
carrot    7.0
dtype: float64


27. 如何基于2个series获得[欧式距离](https://en.wikipedia.org/wiki/Euclidean_distance)？

<img src='./image/euclidean.svg' />

In [38]:
# Input
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
# Solution 
print(sum((p - q)**2)**.5)
# Solution (using func)
print(np.linalg.norm(p-q))

18.16590212458495
18.16590212458495


28. 如何在series中获得局部极值(local maxima)的索引?

In [39]:
# Input
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
print(np.diff(ser))
# 负数标注为-1，正数标注为1
print(np.sign(np.diff(ser)))
dd = np.diff(np.sign(np.diff(ser)))
print(dd)
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

[ 8 -7  1  5  1 -8  5 -4]
[ 1 -1  1  1  1 -1  1 -1]
[-2  2  0  0 -2  2 -2]


array([1, 5, 7], dtype=int64)

29. 如何将字符串的空格，用出现频率最小的字符进行替换？

In [229]:
# Input
my_str = 'dbc deb abed gade'

# Solution
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
print(freq)
# 因为频率次数是降序排列，所以取最后一个item的方法，直接用index[-1]即可。
least_freq = freq.dropna().index[-1]

"".join(ser.replace(' ', least_freq))

d    4
     3
b    3
e    3
a    2
c    1
g    1
dtype: int64


'dbcgdebgabedggade'

30. 生成开始于2000-01-01之后的10个周末(周六)的TimeSeries对象作为index，并且附带10个随机整数，作为values？

In [40]:
# Solution
np.random.seed(88)
ser = pd.Series(np.random.randint(1,100,10), 
                pd.date_range('2000-01-01', 
                              periods=10, 
                              freq='W-SAT'))
ser

2000-01-01    89
2000-01-08    33
2000-01-15    80
2000-01-22    50
2000-01-29    21
2000-02-05    63
2000-02-12    98
2000-02-19    70
2000-02-26    49
2000-03-04    35
Freq: W-SAT, dtype: int32

31. 如何将缺失的值，通过上一个非nan数值进行填充？

In [3]:
# Input
ser = pd.Series([1,10,np.nan,3,np.nan ], 
                index=pd.to_datetime(['2000-01-01', 
                                      '2000-01-03', 
                                      '2000-01-06', 
                                      '2000-01-08',
                                      '2000-01-10']))

print(ser)
# resample，目的是将日期索引补全
print("ser.resample('D')")
print(ser.resample('D'))
print(ser)
# Solution
print("ser.resample('D').ffill()")
print(ser.resample('D').ffill())  # fill with previous value

# Alternatives
print("ser.resample('D').bfill()")
print(ser.resample('D').bfill())  # fill with next value
print("ser.resample('D').bfill().ffill()")
print(ser.resample('D').bfill().ffill())  # fill next else prev value

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     NaN
2000-01-08     3.0
2000-01-10     NaN
dtype: float64
ser.resample('D')
DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]
2000-01-01     1.0
2000-01-03    10.0
2000-01-06     NaN
2000-01-08     3.0
2000-01-10     NaN
dtype: float64
ser.resample('D').ffill()
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     NaN
2000-01-07     NaN
2000-01-08     3.0
2000-01-09     3.0
2000-01-10     NaN
Freq: D, dtype: float64
ser.resample('D').bfill()
2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     NaN
2000-01-05     NaN
2000-01-06     NaN
2000-01-07     3.0
2000-01-08     3.0
2000-01-09     NaN
2000-01-10     NaN
Freq: D, dtype: float64
ser.resample('D').bfill().ffill()
2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06    10.0
2000-01-07     3.0
2000-01-08     3.0
2

In [4]:
data = [['fund','share', 'ter'], 
        ['fund_a', 'A', 1.0], 
        [np.nan, '', 1.5], 
        [np.nan, 'C', 1.0], 
        ['fund_b', 'A', 1.23], 
        [np.nan, 'B', 3.5], 
        ['fund_c', 'A', 1.23],
        [np.nan, 'B', 1.0],]
test_df = pd.DataFrame(data)
display(test_df)
display(test_df.ffill())

Unnamed: 0,0,1,2
0,fund,share,ter
1,fund_a,A,1
2,,,1.5
3,,C,1
4,fund_b,A,1.23
5,,B,3.5
6,fund_c,A,1.23
7,,B,1


Unnamed: 0,0,1,2
0,fund,share,ter
1,fund_a,A,1
2,fund_a,,1.5
3,fund_a,C,1
4,fund_b,A,1.23
5,fund_b,B,3.5
6,fund_c,A,1.23
7,fund_c,B,1


仅仅对value进行填充：

In [5]:
ser = pd.Series([1,10,np.nan, 3, np.nan, 
                 33, 25, 83, np.nan, 
                 22, 45, np.nan,np.nan])
# print(ser)
# 仅仅根据缺失值的上一个非nan值进行填充
# print(ser.ffill())
# 根据缺失值的下一个非nan值进行填充，如果找不到，
# 则用上一个非nan值进行填充
print(ser.bfill().ffill())

0      1.0
1     10.0
2      3.0
3      3.0
4     33.0
5     33.0
6     25.0
7     83.0
8     22.0
9     22.0
10    45.0
11    45.0
12    45.0
dtype: float64


32. 计算自相关的前10个滞后的series，找出哪一个滞后有最大的相关性。

自相关性是指随机误差项的各期望值之间存在着相关关系，称随机误差项之间存在自相关性（auto-correlation）或序列相关，于1972年提出。

更详细的名词解释：

自相关（Auto-correlation）
是对信号相关程度的一种度量，也就是说自相关可以看作是信号与自身的延迟信号相乘后的乘积进行积分运算。
在某些领域，自相关函数等同于自协方差。
随机信号的自相关函数与其功率谱是傅氏变换对（随机信号无法得到具体的函数表达式，只有其统计信息），通过对接受信号的自相关运算可以进行频谱分析。同时，自相关在信号检测中也有很重要的作用，是在误码最小原则下的最佳接收准则。

同一时间函数在瞬时t和t+a的两个值相乘积的平均值作为延迟时间t的函数，是信号与延迟后信号之间相似性的度量。延迟时间为零时，即为信号的均方值，此时它的值最大。

更详细的解释： [自相关性](https://www.zybuluo.com/evilking/note/753058)

In [6]:
np.random.seed(80)
# Input
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))
print(ser)
# Solution
autocorrelations = [ser.autocorr(i).round(2) for i in range(11)]
print(autocorrelations[1:])
print('Lag having highest correlation: ', 
      np.argmax(np.abs(autocorrelations[1:]))+1)

0     19.998529
1      4.088030
2     11.952110
3     -7.807292
4      7.735325
5     12.028672
6      0.386326
7     -2.436107
8      3.188510
9      6.436470
10    13.940738
11    -5.969706
12     9.105098
13    18.491409
14    10.261007
15    27.638082
16     4.701879
17    27.867192
18    10.838846
19     2.418162
dtype: float64
[-0.08, 0.22, 0.06, 0.02, 0.35, -0.52, -0.26, 0.19, -0.36, -0.06]
Lag having highest correlation:  6


33. 导入csv时，如何每次仅仅读取50行数据？

read_csv的函数中有个参数：chunksize

通过指定一个chunksize分块大小来读取文件，返回的是一个可迭代的对象TextFileReader

In [7]:
# Solution 1: Use chunks and for-loop
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 chunksize=50)
df2 = pd.DataFrame()
for chunk in df:
    # 仅导入公约数为50的行号，如0， 50， 100...
    df2 = df2.append(chunk.iloc[0,:])
    # 导入所有数据的方式
#     df2 = df2.append(chunk)
display(df2)

Unnamed: 0,age,b,chas,crim,dis,indus,lstat,medv,nox,ptratio,rad,rm,tax,zn
0,65.2,396.9,0.0,0.00632,4.09,2.31,4.98,24.0,0.538,15.3,1.0,6.575,296.0,18.0
50,45.7,395.56,0.0,0.08873,6.8147,5.64,13.45,19.7,0.439,16.8,4.0,5.963,243.0,21.0
100,79.9,394.76,0.0,0.14866,2.7778,8.56,9.42,27.5,0.52,20.9,5.0,6.727,384.0,0.0
150,97.3,372.8,0.0,1.6566,1.618,19.58,14.1,21.5,0.871,14.7,5.0,6.122,403.0,0.0
200,13.9,384.3,0.0,0.01778,7.6534,1.47,4.45,32.9,0.403,17.0,3.0,7.135,402.0,95.0
250,13.0,396.28,0.0,0.1403,7.3967,5.86,5.9,24.4,0.431,19.1,7.0,6.487,330.0,22.0
300,47.4,390.86,0.0,0.04417,7.8278,2.24,6.07,24.8,0.4,14.8,5.0,6.871,358.0,70.0
350,44.4,396.9,0.0,0.06211,8.7921,1.25,5.98,22.9,0.429,19.7,1.0,6.49,335.0,40.0
400,100.0,396.9,0.0,25.0461,1.5888,18.1,26.77,5.6,0.693,20.2,24.0,5.987,666.0,0.0
450,92.6,0.32,0.0,6.71772,2.3236,18.1,17.44,13.4,0.713,20.2,24.0,6.749,666.0,0.0


In [9]:
# Solution 2: Use chunks and list comprehension
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 chunksize=50)
df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
display(df2.head())
df2 = df2.T
display(df2.head())

Unnamed: 0,0,50,100,150,200,250,300,350,400,450,500
crim,0.00632,0.08873,0.14866,1.6566,0.01778,0.1403,0.04417,0.06211,25.0461,6.71772,0.22438
zn,18.0,21.0,0.0,0.0,95.0,22.0,70.0,40.0,0.0,0.0,0.0
indus,2.31,5.64,8.56,19.58,1.47,5.86,2.24,1.25,18.1,18.1,9.69
chas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nox,0.538,0.439,0.52,0.871,0.403,0.431,0.4,0.429,0.693,0.713,0.585


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
50,0.08873,21.0,5.64,0.0,0.439,5.963,45.7,6.8147,4.0,243.0,16.8,395.56,13.45,19.7
100,0.14866,0.0,8.56,0.0,0.52,6.727,79.9,2.7778,5.0,384.0,20.9,394.76,9.42,27.5
150,1.6566,0.0,19.58,0.0,0.871,6.122,97.3,1.618,5.0,403.0,14.7,372.8,14.1,21.5
200,0.01778,95.0,1.47,0.0,0.403,7.135,13.9,7.6534,3.0,402.0,17.0,384.3,4.45,32.9


34. 将csv导入dataframe时，如何根据条件改变列的值？

In [10]:
# Solution 1: Using converter parameter
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 converters={'medv': lambda x: 'High' 
                             if float(x) > 25 else 'Low'})
display(df.head())

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,High
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,High


35. 如何从给定的series，创建行之间具有特定步长的dataframe？

这个例子，事实上是numpy技巧相关

In [11]:
L = pd.Series(range(15))

def gen_strides(a, stride_len=5, window_len=5):
    n_strides = ((a.size-window_len)//stride_len) + 1
    return np.array([a[s:(s+window_len)] 
                     for s in 
                     np.arange(0, a.size, stride_len)[:n_strides]])

gen_strides(L, stride_len=2, window_len=4)

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13]], dtype=int64)

36. 如何从csv文件导入特定的列？

In [12]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 usecols=['crim', 'medv'])
display(df.head())

Unnamed: 0,crim,medv
0,0.00632,24.0
1,0.02731,21.6
2,0.02729,34.7
3,0.03237,33.4
4,0.06905,36.2


37. 如何从dataframe每列中获取nrows, ncolumns, datatype, summary stats等属性？并获取numpy array以及等价的列表？

In [13]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

#  number of rows and columns
print('Shape:')
print(df.shape)
print()

# datatypes
print('Data Types:')
print(df.dtypes)
print()

# how many columns under each dtype
print('get_dtype_counts')
print(df.get_dtype_counts())
print()

print('dtypes value_counts')
print(df.dtypes.value_counts())
print()

# summary statistics
print('summary statistics')
df_stats = df.describe()
display(df_stats)
print()

# numpy array 
df_arr = df.values
print(df_arr[:5])
print()

# list
df_list = df.values.tolist()
print(df_list[:5])
print()

Shape:
(93, 27)

Data Types:
Manufacturer           object
Model                  object
Type                   object
Min.Price             float64
Price                 float64
Max.Price             float64
MPG.city              float64
MPG.highway           float64
AirBags                object
DriveTrain             object
Cylinders              object
EngineSize            float64
Horsepower            float64
RPM                   float64
Rev.per.mile          float64
Man.trans.avail        object
Fuel.tank.capacity    float64
Passengers            float64
Length                float64
Wheelbase             float64
Width                 float64
Turn.circle           float64
Rear.seat.room        float64
Luggage.room          float64
Weight                float64
Origin                 object
Make                   object
dtype: object

get_dtype_counts
float64    18
object      9
dtype: int64

dtypes value_counts
float64    18
object      9
dtype: int64

summary statistics


Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight
count,86.0,91.0,88.0,84.0,91.0,91.0,86.0,90.0,87.0,85.0,91.0,89.0,92.0,87.0,88.0,89.0,74.0,86.0
mean,17.118605,19.616484,21.459091,22.404762,29.065934,2.658242,144.0,5276.666667,2355.0,16.683529,5.076923,182.865169,103.956522,69.448276,38.954545,27.853933,13.986486,3104.593023
std,8.82829,9.72428,10.696563,5.84152,5.370293,1.045845,53.455204,605.554811,486.916616,3.375748,1.045953,14.792651,6.856317,3.778023,3.304157,3.018129,3.120824,600.129993
min,6.7,7.4,7.9,15.0,20.0,1.0,55.0,3800.0,1320.0,9.2,2.0,141.0,90.0,60.0,32.0,19.0,6.0,1695.0
25%,10.825,12.35,14.575,18.0,26.0,1.8,100.75,4800.0,2017.5,14.5,4.0,174.0,98.0,67.0,36.0,26.0,12.0,2647.5
50%,14.6,17.7,19.15,21.0,28.0,2.3,140.0,5200.0,2360.0,16.5,5.0,181.0,103.0,69.0,39.0,27.5,14.0,3085.0
75%,20.25,23.5,24.825,25.0,31.0,3.25,170.0,5787.5,2565.0,19.0,6.0,192.0,110.0,72.0,42.0,30.0,16.0,3567.5
max,45.4,61.9,80.0,46.0,50.0,5.7,300.0,6500.0,3755.0,27.0,8.0,219.0,119.0,78.0,45.0,36.0,22.0,4105.0



[['Acura' 'Integra' 'Small' 12.9 15.9 18.8 25.0 31.0 'None' 'Front' '4'
  1.8 140.0 6300.0 2890.0 'Yes' 13.2 5.0 177.0 102.0 68.0 37.0 26.5 nan
  2705.0 'non-USA' 'Acura Integra']
 [nan 'Legend' 'Midsize' 29.2 33.9 38.7 18.0 25.0 'Driver & Passenger'
  'Front' '6' 3.2 200.0 5500.0 2335.0 'Yes' 18.0 5.0 195.0 115.0 71.0
  38.0 30.0 15.0 3560.0 'non-USA' 'Acura Legend']
 ['Audi' '90' 'Compact' 25.9 29.1 32.3 20.0 26.0 'Driver only' 'Front'
  '6' 2.8 172.0 5500.0 2280.0 'Yes' 16.9 5.0 180.0 102.0 67.0 37.0 28.0
  14.0 3375.0 'non-USA' 'Audi 90']
 ['Audi' '100' 'Midsize' nan 37.7 44.6 19.0 26.0 'Driver & Passenger' nan
  '6' nan 172.0 5500.0 2535.0 nan 21.1 6.0 193.0 106.0 nan 37.0 31.0 17.0
  3405.0 'non-USA' 'Audi 100']
 ['BMW' '535i' 'Midsize' nan 30.0 nan 22.0 30.0 nan 'Rear' '4' 3.5 208.0
  5700.0 2545.0 'Yes' 21.1 4.0 186.0 109.0 69.0 39.0 27.0 13.0 3640.0
  'non-USA' 'BMW 535i']]

[['Acura', 'Integra', 'Small', 12.9, 15.9, 18.8, 25.0, 31.0, 'None', 'Front', '4', 1.8, 140.0, 6300.0,

38. 如何根据给定条件，从特定cell中获取行与列的number？

In [14]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
# Get Manufacturer with highest price
print('Get Manufacturer with highest price')
display(df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']])

# Get Row and Column number
print('Get Row and Column number')
row, col = np.where(df.values == np.max(df.Price))
print(row, col)
print()

print('Get the value')
# Get the value
print(df.iat[row[0], col[0]])
print(df.iloc[row[0], col[0]])

# Alternates
print(df.at[row[0], 'Price'])
print(df.get_value(row[0], 'Price'))

# The difference between `iat` - `iloc` vs `at` - `loc` is:
# `iat` snd `iloc` accepts row and column numbers. 
# Whereas `at` and `loc` accepts index and column names.

Get Manufacturer with highest price


Unnamed: 0,Manufacturer,Model,Type
58,Mercedes-Benz,300E,Midsize


Get Row and Column number
[58] [4]

Get the value
61.9
61.9
61.9
61.9




39. 如何重命名dataframe的某一列？

Rename the column Type as CarType in df and replace the ‘.’ in column names with ‘_’.

In [15]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
print('raw column names')
display(df.columns)
print()
# Solution
# Step 1:
df=df.rename(columns = {'Type':'CarType'})
# or
df.columns.values[2] = "CarType"

# Step 2:
df.columns = df.columns.map(lambda x: x.replace('.', '_'))
print('Ranamed column names')
display(df.columns)

raw column names


Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price',
       'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin',
       'Make'],
      dtype='object')


Ranamed column names


Index(['Manufacturer', 'Model', 'CarType', 'Min_Price', 'Price', 'Max_Price',
       'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
       'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
       'Make'],
      dtype='object')

40. 如何获得每列缺失值的数目？

Count the number of missing values in each column of df. Which column has the maximum number of missing values?

In [16]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
n_missings_each_col = df.apply(lambda x: x.isnull().sum())
print('每列缺失值的数目')
display(n_missings_each_col)

print()
print('缺失值最多的列')
print(n_missings_each_col.idxmax())

每列缺失值的数目


Manufacturer           4
Model                  1
Type                   3
Min.Price              7
Price                  2
Max.Price              5
MPG.city               9
MPG.highway            2
AirBags                6
DriveTrain             7
Cylinders              5
EngineSize             2
Horsepower             7
RPM                    3
Rev.per.mile           6
Man.trans.avail        5
Fuel.tank.capacity     8
Passengers             2
Length                 4
Wheelbase              1
Width                  6
Turn.circle            5
Rear.seat.room         4
Luggage.room          19
Weight                 7
Origin                 5
Make                   3
dtype: int64


缺失值最多的列
Luggage.room


41. 如何检查DataFrame是否有missing value?

In [17]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
df.isnull().values.any()

True

42. 如何将缺失值替换为均值？

In [18]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
display(df[['Min.Price', 'Max.Price']].head())
# Solution
df_out = df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x: x.fillna(x.mean()))
display(df_out.head())

Unnamed: 0,Min.Price,Max.Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,,44.6
4,,


Unnamed: 0,Min.Price,Max.Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,17.118605,44.6
4,17.118605,21.459091


43.  如果通过apply方法，将全局变量作为附加参数，为已经存在的列的空值进行填充？

In [22]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
display(df.head())
# Solution
# 对Min.Price填充为均值；对Max.Price填充为中位数
d = {'Min.Price': np.nanmean, 'Max.Price': np.nanmedian}
print(d)
df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x, d: x.fillna(d[x.name](x)), args=(d, ))
display(df[['Min.Price', 'Max.Price']][:20])

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i


{'Min.Price': <function nanmean at 0x000001F1DCACABF8>, 'Max.Price': <function nanmedian at 0x000001F1DCACAE18>}


Unnamed: 0,Min.Price,Max.Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,17.118605,44.6
4,17.118605,19.15
5,14.2,17.3
6,19.9,19.15
7,22.6,24.9
8,26.3,26.3
9,33.0,36.3


44. 如何从dataframe获取特定列，且作为dataframe输出，而不是series？

In [19]:
# Input
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
display(df)
# Solution
print(type(df[['a']]))
display(df[['a']])
print(type(df.loc[:, ['a']]))
print(type(df.iloc[:, [0]]))

# Alternately the following returns a Series
print(type(df.a))
print(type(df['a']))
print(type(df.loc[:, 'a']))
print(type(df.iloc[:, 1]))

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,a
0,0
1,5
2,10
3,15


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


45. 如何更改dataframe的列的顺序？

In [20]:
# Input
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
display(df)
# Solution Q1
display(df[list('cbade')])



# Solution Q2 - No hard coding
def switch_columns(df, col1=None, col2=None):
    colnames = df.columns.tolist()
    i1, i2 = colnames.index(col1), colnames.index(col2)
    colnames[i2], colnames[i1] = colnames[i1], colnames[i2]
    return df[colnames]

display(switch_columns(df, 'a', 'c'))

# 这个例子是如何将column names降序排列
df.sort_index(axis=1, ascending=False, inplace=True)
display(df)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


Unnamed: 0,e,d,c,b,a
0,4,3,2,1,0
1,9,8,7,6,5
2,14,13,12,11,10
3,19,18,17,16,15


In [21]:
my_list = list('abcde')
my_list

['a', 'b', 'c', 'd', 'e']

In [24]:
new_list = []
for count, i in enumerate(range(len(my_list))[::-1]):
    new_list.append(my_list[i])
    print(count, i)
print(new_list)

0 4
1 3
2 2
3 1
4 0
['e', 'd', 'c', 'b', 'a']


46. 如何设置行与列的输出显示结果数目？

In [42]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
display(df)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 50)
display(df)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i
...,...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,16.6,19.7,...,34.0,,3960.0,,Volkswagen Eurovan
89,Volkswagen,Passat,Compact,17.6,20.0,...,31.5,14.0,2985.0,non-USA,Volkswagen Passat
90,Volkswagen,Corrado,Sporty,22.9,23.3,...,26.0,15.0,2810.0,non-USA,Volkswagen Corrado
91,Volvo,240,Compact,21.8,22.7,...,29.5,14.0,2985.0,non-USA,Volvo 240


Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i
5,Buick,Century,Midsize,14.2,15.7,17.3,22.0,31.0,Driver only,,...,6.0,189.0,105.0,69.0,41.0,28.0,16.0,,USA,Buick Century
6,Buick,LeSabre,Large,19.9,20.8,,19.0,28.0,Driver only,Front,...,6.0,200.0,111.0,74.0,42.0,30.5,17.0,3470.0,USA,Buick LeSabre
7,Buick,Roadmaster,Large,22.6,23.7,24.9,16.0,25.0,Driver only,Rear,...,6.0,216.0,116.0,78.0,45.0,30.5,21.0,4105.0,USA,Buick Roadmaster
8,Buick,Riviera,Midsize,26.3,26.3,26.3,19.0,27.0,Driver only,Front,...,5.0,198.0,108.0,,41.0,26.5,14.0,3495.0,USA,Buick Riviera
9,Cadillac,DeVille,Large,33.0,34.7,36.3,16.0,25.0,Driver only,Front,...,6.0,206.0,114.0,73.0,43.0,35.0,18.0,3620.0,USA,Cadillac DeVille


47. 如何对dataframe格式化或者不显示科学计数法？

In [25]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.random(4)**10, columns=['random'])
display(df)
# Solution 1: Rounding
display(df.round(4))

# Solution 2: Use apply to change format
display(df.apply(lambda x: '%.4f' % x, axis=1))
# or
display(df.applymap(lambda x: '%.4f' % x))

# Solution 3: Use set_option
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Solution 4: Assign display.float_format
pd.options.display.float_format = '{:.4f}'.format
display(df)

# Reset/undo float formatting
pd.options.display.float_format = None

Unnamed: 0,random
0,0.002245
1,3e-06
2,0.00019
3,0.185104


Unnamed: 0,random
0,0.0022
1,0.0
2,0.0002
3,0.1851


0    0.0022
1    0.0000
2    0.0002
3    0.1851
dtype: object

Unnamed: 0,random
0,0.0022
1,0.0
2,0.0002
3,0.1851


Unnamed: 0,random
0,0.0022
1,0.0
2,0.0002
3,0.1851


48. 如何将dataframe中的值格式化为百分比形式？

In [26]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.random(4), columns=['random'])
display(df)
# Solution
out = df.style.format({
    'random': '{0:.2%}'.format,
})

display(out)

Unnamed: 0,random
0,0.543405
1,0.278369
2,0.424518
3,0.844776


Unnamed: 0,random
0,54.34%
1,27.84%
2,42.45%
3,84.48%


49. 如何按照公约数的行号输出？

In [27]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
display(df.iloc[::20, :][['Manufacturer', 'Model', 'Type']])

Unnamed: 0,Manufacturer,Model,Type
0,Acura,Integra,Small
20,Chrysler,LeBaron,Compact
40,Honda,Prelude,Sporty
60,Mercury,Cougar,Midsize
80,Subaru,Loyale,Small


50. 如何合并关联列，并创建primary key的索引？

In [56]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', usecols=[0,1,2,3,5])
display(df.head())
# Solution
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
# display(df.index)
display(df.index.is_unique)
display(df.head())

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
0,Acura,Integra,Small,12.9,18.8
1,,Legend,Midsize,29.2,38.7
2,Audi,90,Compact,25.9,32.3
3,Audi,100,Midsize,,44.6
4,BMW,535i,Midsize,,


True

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
Acura_Integra_Small,Acura,Integra,Small,12.9,18.8
missing_Legend_Midsize,missing,Legend,Midsize,29.2,38.7
Audi_90_Compact,Audi,90,Compact,25.9,32.3
Audi_100_Midsize,Audi,100,Midsize,,44.6
BMW_535i_Midsize,BMW,535i,Midsize,,


51. 如何获得某列第几大的值的行号？

In [38]:
# Input
np.random.seed(99)
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
display(df)
# Solution
n = 7
# arg开头的函数，得到的列表的值，都是该列的索引值
# 其实应该说是得到该列对应的第几大的值的索引，更贴切
display(df['a'].argsort()[::-1][n])

Unnamed: 0,a,b,c
0,2,4,26
1,9,10,9
2,19,5,6
3,21,2,24
4,4,24,18
5,2,17,28
6,7,12,21
7,24,3,1
8,13,9,9
9,21,26,28


3

52. 如何找到比给定值大的第几个最大值的索引？如找到比均值大的第二个值的，位于Series数值顺序的索引？

In [47]:
# Input
np.random.seed(100)
ser = pd.Series(np.random.randint(1, 100, 15))

# Solution
print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
# where得到的是tuple，索引列表，在第一个成员中
print(np.where(ser > ser.mean())[0][1])
# argwhere直接得到索引列表，所以可以直接将1，即第二个项，获取值
np.argwhere(ser > ser.mean())[1]

ser:  [9, 25, 68, 88, 80, 49, 11, 95, 53, 99, 54, 67, 99, 15, 35] mean:  56
3


array([3], dtype=int64)

53. 如何获得dataframe中，行How to get the last n rows of a dataframe with row sum > 100?

In [101]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
display(df)
# Solution
# print row sums
rowsums = df.apply(np.sum, axis=1)
display(rowsums)
# last two rows with row sum greater than 100
display(np.where(rowsums > 100)[0][-2:])
last_two_rows = df.iloc[np.where(rowsums > 100)[0][-2:], :]
display(last_two_rows)

Unnamed: 0,0,1,2,3
0,18,34,13,17
1,33,25,26,20
2,30,12,31,12
3,12,24,12,27
4,26,34,25,14
5,21,38,36,26
6,37,19,39,32
7,12,37,22,14
8,11,23,31,29
9,14,14,37,37


0      82
1     104
2      85
3      75
4      99
5     121
6     127
7      85
8      94
9     102
10     82
11    107
12     92
13     95
14    126
dtype: int64

array([11, 14], dtype=int64)

Unnamed: 0,0,1,2,3
11,11,39,24,33
14,31,37,34,24


54. 如何将Series中小于5%分位数的数替换为5%分位数以及将大于95%分位数的数，替换为95%分位数？

In [115]:
# Input
ser = pd.Series(np.logspace(-2, 2, 30))
display(ser)
print(ser.quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]))
# Solution
def cap_outliers(ser, low_perc, high_perc):
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc*100, '%ile: ', low, '|', high_perc*100, '%ile: ', high)
    ser[ser < low] = low
    ser[ser > high] = high
    return(ser)

capped_ser = cap_outliers(ser, .05, .95)
display(capped_ser)

0       0.010000
1       0.013738
2       0.018874
3       0.025929
4       0.035622
5       0.048939
6       0.067234
7       0.092367
8       0.126896
9       0.174333
10      0.239503
11      0.329034
12      0.452035
13      0.621017
14      0.853168
15      1.172102
16      1.610262
17      2.212216
18      3.039195
19      4.175319
20      5.736153
21      7.880463
22     10.826367
23     14.873521
24     20.433597
25     28.072162
26     38.566204
27     52.983169
28     72.789538
29    100.000000
dtype: float64

0.0      0.010000
0.1      0.025224
0.2      0.063575
0.3      0.160102
0.4      0.402835
0.5      1.012635
0.6      2.543008
0.7      6.379446
0.8     15.985536
0.9     40.007901
1.0    100.000000
dtype: float64
5.0 %ile:  0.016049294076965887 | 95.0 %ile:  63.876672220183934


0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64

55. 如何去除dataframe中的负数，并reshape为最大可能平方？<br>（最大可能平方：举例：如果矩阵中的元素有70个，那么取其平方根的整数为8，然后将矩阵中的元素进行降序排列，并获得前8^2=64个元素，之后按照元矩阵的元素顺序，构建8x8的矩阵。）

In [127]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
print(df)

# Solution
# Step 1: remove negative values from arr
arr = df[df > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]
print(arr_qualified)
print(arr_qualified.shape[0])
# Step 2: find side-length of largest possible square
n = int(np.floor(arr_qualified.shape[0]**.5))
print(n)
# Step 3: Take top n^2 items without changing positions
top_indexes = np.argsort(arr_qualified)[::-1]
print(top_indexes[:n**2])
#对top_indexes[:n**2]排序，是为了维持矩阵中元素原有的顺序
output = np.take(arr_qualified, sorted(top_indexes[:n**2])).reshape(n, -1)
display(output)

    0   1   2   3   4   5   6   7   8   9
0 -12   4  47  28 -10  32  33  46  -6  14
1   4  -5  40  38  -4 -11 -18   7 -16  11
2 -19  -7 -16  39  47 -13  29  27  45  41
3  -6  35 -18  -1  43  33   7  36  10  28
4  27  19  18  24  -2  44  36  14  33  -3
5  -7  10  -3  33  48  30  33 -20  -7  37
6 -17 -17 -10  40 -17  28  32  23  16 -15
7  18  22  13  38  22   2 -20  35  -1  33
8  48  42  30  48  15   3 -11  28   1   5
9  34 -14  17  38  19  31  10  46   4  35
[ 4. 47. 28. 32. 33. 46. 14.  4. 40. 38.  7. 11. 39. 47. 29. 27. 45. 41.
 35. 43. 33.  7. 36. 10. 28. 27. 19. 18. 24. 44. 36. 14. 33. 10. 33. 48.
 30. 33. 37. 40. 28. 32. 23. 16. 18. 22. 13. 38. 22.  2. 35. 33. 48. 42.
 30. 48. 15.  3. 28.  1.  5. 34. 17. 38. 19. 31. 10. 46.  4. 35.]
70
8
[55 35 52  1 13 67  5 16 29 19 53 17  8 39 12 63 47  9 38 22 30 50 18 69
 61 20 32  4 37 34 51  3 41 65 36 54 14 24  2 58 40 25 15 28 42 45 48 26
 64 27 44 62 43 56 31  6 46 11 66 33 23 21 10 60]


array([[47., 28., 32., 33., 46., 14., 40., 38.],
       [ 7., 11., 39., 47., 29., 27., 45., 41.],
       [35., 43., 33.,  7., 36., 10., 28., 27.],
       [19., 18., 24., 44., 36., 14., 33., 10.],
       [33., 48., 30., 33., 37., 40., 28., 32.],
       [23., 16., 18., 22., 13., 38., 22., 35.],
       [33., 48., 42., 30., 48., 15., 28.,  5.],
       [34., 17., 38., 19., 31., 10., 46., 35.]])

56. 如何交换dataframe中的两行？

In [129]:
# Input
df = pd.DataFrame(np.arange(25).reshape(5, -1))
display(df)
# Solution
def swap_rows(df, i1, i2):
    a, b = df.iloc[i1, :].copy(), df.iloc[i2, :].copy()
    df.iloc[i1, :], df.iloc[i2, :] = b, a
    return df

display(swap_rows(df, 1, 2))

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,10,11,12,13,14
2,5,6,7,8,9
3,15,16,17,18,19
4,20,21,22,23,24


57. 如何翻转dataframe中的行？

In [48]:
# Input
df = pd.DataFrame(np.arange(25).reshape(5, -1))
display(df)
# Solution 1
display(df.iloc[::-1, :])

# Solution 2
display(df.loc[df.index[::-1], :])

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
3,15,16,17,18,19
2,10,11,12,13,14
1,5,6,7,8,9
0,0,1,2,3,4


Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
3,15,16,17,18,19
2,10,11,12,13,14
1,5,6,7,8,9
0,0,1,2,3,4


58. 如何对枚举型数据，进行One-Hot编码？(Dummy化)<br>此例，将对列a，进行one-hot编码

In [132]:
# Input
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
display(df)
# Solution
df_onehot = pd.concat([pd.get_dummies(df['a']), df[list('bcde')]], axis=1)
display(df_onehot)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


Unnamed: 0,0,5,10,15,20,b,c,d,e
0,1,0,0,0,0,1,2,3,4
1,0,1,0,0,0,6,7,8,9
2,0,0,1,0,0,11,12,13,14
3,0,0,0,1,0,16,17,18,19
4,0,0,0,0,1,21,22,23,24


59. 每一行获取最大值，哪一列命中最大值的数目最多？

In [142]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
display(df)
print(df.apply(np.argmax, axis=1))
print(df.apply(np.argmax, axis=1).value_counts())
# Solution
print('Column with highest row maxes: ', df.apply(np.argmax, axis=1).value_counts().index[0])

Unnamed: 0,0,1,2,3
0,9,25,68,88
1,80,49,11,95
2,53,99,54,67
3,99,15,35,25
4,16,61,59,17
5,10,94,87,3
6,28,5,32,2
7,14,84,5,92
8,60,68,8,50
9,48,66,62,15


0    3
1    3
2    1
3    0
4    1
5    1
6    2
7    3
8    1
9    1
dtype: int64
1    5
3    3
2    1
0    1
dtype: int64
Column with highest row maxes:  1


60. 如何创建包含基于欧式距离计算的最近行的行号的新列？（选学）

In [49]:
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1), columns=list('pqrs'), index=list('abcdefghij'))
display(df)
# init outputs
nearest_rows = []
nearest_distance = []

# iterate rows.
for i, row in df.iterrows():
    curr = row
    rest = df.drop(i)
    e_dists = {}  # init dict to store euclidean dists for current row.
    # iterate rest of rows for current row
    for j, contestant in rest.iterrows():
        # compute euclidean dist and update e_dists
        e_dists.update({j: round(np.linalg.norm(curr.values - contestant.values))})
    # update nearest row to current row and the distance value
    nearest_rows.append(max(e_dists, key=e_dists.get))
    nearest_distance.append(max(e_dists.values()))

df['nearest_row'] = nearest_rows
df['dist'] = nearest_distance
display(df)

Unnamed: 0,p,q,r,s
a,9,25,68,88
b,80,49,11,95
c,53,99,54,67
d,99,15,35,25
e,16,61,59,17
f,10,94,87,3
g,28,5,32,2
h,14,84,5,92
i,60,68,8,50
j,48,66,62,15


Unnamed: 0,p,q,r,s,nearest_row,dist
a,9,25,68,88,d,115.0
b,80,49,11,95,f,145.0
c,53,99,54,67,g,119.0
d,99,15,35,25,f,132.0
e,16,61,59,17,b,112.0
f,10,94,87,3,b,145.0
g,28,5,32,2,h,124.0
h,14,84,5,92,d,132.0
i,60,68,8,50,f,108.0
j,48,66,62,15,h,103.0


61. 如何获得每一列相对其他列的最大可能相关值？

In [50]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), columns=list('pqrstuvwxy'), index=list('abcdefgh'))
display(df)

# Solution
abs_corrmat = np.abs(df.corr())
display(abs_corrmat)
# 因为自相关的最大值始终为1，所以取第二大数为最大可能相关值
display(abs_corrmat.apply(lambda x: sorted(x)[-2]))
max_corr = abs_corrmat.apply(lambda x: sorted(x)[-2])
print('Maximum Correlation possible for each column: ', np.round(max_corr.tolist(), 2))

Unnamed: 0,p,q,r,s,t,u,v,w,x,y
a,9,25,68,88,80,49,11,95,53,99
b,54,67,99,15,35,25,16,61,59,17
c,10,94,87,3,28,5,32,2,14,84
d,5,92,60,68,8,50,48,66,62,15
e,56,72,81,3,95,20,99,64,54,28
f,57,31,49,48,40,39,45,19,65,57
g,35,54,75,18,73,14,31,18,54,69
h,51,92,92,84,54,79,1,14,58,77


Unnamed: 0,p,q,r,s,t,u,v,w,x,y
p,1.0,0.14101,0.227098,0.265108,0.324185,0.057459,0.196782,0.228487,0.454752,0.299886
q,0.14101,1.0,0.511982,0.20415,0.446381,0.028767,0.071733,0.339051,0.374007,0.308756
r,0.227098,0.511982,1.0,0.354766,0.10893,0.130976,0.27276,0.12922,0.355986,0.031784
s,0.265108,0.20415,0.354766,1.0,0.086377,0.905234,0.512726,0.303129,0.449848,0.315986
t,0.324185,0.446381,0.10893,0.086377,1.0,0.085898,0.261239,0.25349,0.111025,0.317856
u,0.057459,0.028767,0.130976,0.905234,0.085898,1.0,0.428679,0.172237,0.54713,0.113304
v,0.196782,0.071733,0.27276,0.512726,0.261239,0.428679,1.0,0.14476,0.038545,0.504748
w,0.228487,0.339051,0.12922,0.303129,0.25349,0.172237,0.14476,1.0,0.394792,0.281705
x,0.454752,0.374007,0.355986,0.449848,0.111025,0.54713,0.038545,0.394792,1.0,0.441223
y,0.299886,0.308756,0.031784,0.315986,0.317856,0.113304,0.504748,0.281705,0.441223,1.0


p    0.454752
q    0.511982
r    0.511982
s    0.905234
t    0.446381
u    0.905234
v    0.512726
w    0.394792
x    0.547130
y    0.504748
dtype: float64

Maximum Correlation possible for each column:  [0.45 0.51 0.51 0.91 0.45 0.91 0.51 0.39 0.55 0.5 ]


62. 如何基于每行的minimum-by-maximum值，创建一个新列？

In [51]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
display(df)
# Solution 1
min_by_max = df.apply(lambda x: np.min(x)/np.max(x), axis=1)
display(min_by_max)
# Solution 2
min_by_max = np.min(df, axis=1)/np.max(df, axis=1)
display(min_by_max)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,25,68,88,80,49,11,95,53,99
1,54,67,99,15,35,25,16,61,59,17
2,10,94,87,3,28,5,32,2,14,84
3,5,92,60,68,8,50,48,66,62,15
4,56,72,81,3,95,20,99,64,54,28
5,57,31,49,48,40,39,45,19,65,57
6,35,54,75,18,73,14,31,18,54,69
7,51,92,92,84,54,79,1,14,58,77


0    0.090909
1    0.151515
2    0.021277
3    0.054348
4    0.030303
5    0.292308
6    0.186667
7    0.010870
dtype: float64

0    0.090909
1    0.151515
2    0.021277
3    0.054348
4    0.030303
5    0.292308
6    0.186667
7    0.010870
dtype: float64

63. 如何基于每行第二大的数，创建一个新列？

In [152]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
display(df)
# Solution
out = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)
df['penultimate'] = out
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,25,68,88,80,49,11,95,53,99
1,54,67,99,15,35,25,16,61,59,17
2,10,94,87,3,28,5,32,2,14,84
3,5,92,60,68,8,50,48,66,62,15
4,56,72,81,3,95,20,99,64,54,28
5,57,31,49,48,40,39,45,19,65,57
6,35,54,75,18,73,14,31,18,54,69
7,51,92,92,84,54,79,1,14,58,77


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,penultimate
0,9,25,68,88,80,49,11,95,53,99,95
1,54,67,99,15,35,25,16,61,59,17,67
2,10,94,87,3,28,5,32,2,14,84,87
3,5,92,60,68,8,50,48,66,62,15,68
4,56,72,81,3,95,20,99,64,54,28,95
5,57,31,49,48,40,39,45,19,65,57,57
6,35,54,75,18,73,14,31,18,54,69,73
7,51,92,92,84,54,79,1,14,58,77,84


64. 如何求得所有列的z-score 标准化(zero-mean normalization)以及归一化(每列最小值为0，最大值为1)？

In [52]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
display(df)
# z-score,标准化
out1 = df.apply(lambda x: ((x - x.mean())/x.std()).round(2))
print('z-score,标准化\n')
display(out1)

# 归一化
out2 = df.apply(lambda x: ((x- x.min())/(x.max() - x.min())).round(2))
print('归一化\n')  
display(out2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,25,68,88,80,49,11,95,53,99
1,54,67,99,15,35,25,16,61,59,17
2,10,94,87,3,28,5,32,2,14,84
3,5,92,60,68,8,50,48,66,62,15
4,56,72,81,3,95,20,99,64,54,28
5,57,31,49,48,40,39,45,19,65,57
6,35,54,75,18,73,14,31,18,54,69
7,51,92,92,84,54,79,1,14,58,77


z-score,标准化



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.11,-1.5,-0.5,1.32,0.97,0.58,-0.8,1.58,0.04,1.35
1,0.84,0.04,1.35,-0.72,-0.57,-0.42,-0.64,0.56,0.41,-1.21
2,-1.07,1.03,0.63,-1.06,-0.81,-1.25,-0.11,-1.22,-2.39,0.88
3,-1.28,0.96,-0.97,0.76,-1.49,0.62,0.41,0.71,0.6,-1.27
4,0.92,0.22,0.28,-1.06,1.48,-0.63,2.09,0.65,0.1,-0.86
5,0.97,-1.28,-1.63,0.2,-0.4,0.16,0.32,-0.7,0.79,0.04
6,0.02,-0.43,-0.08,-0.64,0.73,-0.88,-0.14,-0.73,0.1,0.41
7,0.71,0.96,0.93,1.21,0.08,1.83,-1.13,-0.85,0.35,0.66


归一化



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.08,0.0,0.38,1.0,0.83,0.59,0.1,1.0,0.76,1.0
1,0.94,0.61,1.0,0.14,0.31,0.27,0.15,0.63,0.88,0.02
2,0.1,1.0,0.76,0.0,0.23,0.0,0.32,0.0,0.0,0.82
3,0.0,0.97,0.22,0.76,0.0,0.61,0.48,0.69,0.94,0.0
4,0.98,0.68,0.64,0.0,1.0,0.2,1.0,0.67,0.78,0.15
5,1.0,0.09,0.0,0.53,0.37,0.46,0.45,0.18,1.0,0.5
6,0.58,0.42,0.52,0.18,0.75,0.12,0.31,0.17,0.78,0.64
7,0.88,0.97,0.86,0.95,0.53,1.0,0.0,0.13,0.86,0.74


65. 如何计算每一行相对下一行的相关性？

In [53]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
display(df)
print(range(df.shape[0]-1))
# Solution
[df.iloc[i].corr(df.iloc[i+1]).round(2) for i in range(df.shape[0]-1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,25,68,88,80,49,11,95,53,99
1,54,67,99,15,35,25,16,61,59,17
2,10,94,87,3,28,5,32,2,14,84
3,5,92,60,68,8,50,48,66,62,15
4,56,72,81,3,95,20,99,64,54,28
5,57,31,49,48,40,39,45,19,65,57
6,35,54,75,18,73,14,31,18,54,69
7,51,92,92,84,54,79,1,14,58,77


range(0, 7)


[-0.1, 0.36, 0.14, -0.08, -0.22, 0.35, 0.35]

66. 如何将dataframe双对角线的值都替换为0？

In [185]:
# Input
np.random.seed(100)
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))
display(df)
print(df.shape)
# Solution
for i in range(df.shape[0]):
    df.iat[i, i] = 0
    df.iat[df.shape[0]-i-1, i] = 0
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9,25,68,88,80,49,11,95,53,99
1,54,67,99,15,35,25,16,61,59,17
2,10,94,87,3,28,5,32,2,14,84
3,5,92,60,68,8,50,48,66,62,15
4,56,72,81,3,95,20,99,64,54,28
5,57,31,49,48,40,39,45,19,65,57
6,35,54,75,18,73,14,31,18,54,69
7,51,92,92,84,54,79,1,14,58,77
8,4,71,4,85,80,11,88,61,4,49
9,53,44,37,6,72,39,87,95,99,43


(10, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,25,68,88,80,49,11,95,53,0
1,54,0,99,15,35,25,16,61,0,17
2,10,94,0,3,28,5,32,0,14,84
3,5,92,60,0,8,50,0,66,62,15
4,56,72,81,3,0,0,99,64,54,28
5,57,31,49,48,0,0,45,19,65,57
6,35,54,75,0,73,14,0,18,54,69
7,51,92,0,84,54,79,1,0,58,77
8,4,0,4,85,80,11,88,61,0,49
9,0,44,37,6,72,39,87,95,99,0


67. 如何根据key获得特定分组的数据？

In [192]:
# Input
np.random.seed(100)
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})
display(df)

df_grouped = df.groupby(['col1'])

# Solution 1
display(df_grouped.get_group('apple'))

# Solution 2
for i, dff in df_grouped:
    if i == 'apple':
        display(dff)

Unnamed: 0,col1,col2,col3
0,apple,0.543405,8
1,banana,0.278369,4
2,orange,0.424518,11
3,apple,0.844776,12
4,banana,0.004719,10
5,orange,0.121569,0
6,apple,0.670749,11
7,banana,0.825853,9
8,orange,0.136707,13


Unnamed: 0,col1,col2,col3
0,apple,0.543405,8
3,apple,0.844776,12
6,apple,0.670749,11


Unnamed: 0,col1,col2,col3
0,apple,0.543405,8
3,apple,0.844776,12
6,apple,0.670749,11


68. 当根据另一列分组时，如何获取一列第2大的值？

In [54]:
# Input
np.random.seed(100)
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'taste': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

display(df)

# Solution
df_grpd = df['taste'].groupby(df.fruit)
display(df_grpd.get_group('banana').sort_values().iloc[-2])

Unnamed: 0,fruit,taste,price
0,apple,0.543405,8
1,banana,0.278369,4
2,orange,0.424518,11
3,apple,0.844776,12
4,banana,0.004719,10
5,orange,0.121569,0
6,apple,0.670749,11
7,banana,0.825853,9
8,orange,0.136707,13


0.27836938509379616

69. 如何计算daraframe中分组的均值，并且获得新的dataframe？

In [201]:
# Input
np.random.seed(100)
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})
display(df)

# Solution
out = df.groupby('fruit', as_index=False)['price'].mean()
display(out)

Unnamed: 0,fruit,rating,price
0,apple,0.543405,8
1,banana,0.278369,4
2,orange,0.424518,11
3,apple,0.844776,12
4,banana,0.004719,10
5,orange,0.121569,0
6,apple,0.670749,11
7,banana,0.825853,9
8,orange,0.136707,13


Unnamed: 0,fruit,price
0,apple,10.333333
1,banana,7.666667
2,orange,8.0


70. 如何合并两个具有相同数据行，但列名不同的dataframe？

In [55]:
# Input
np.random.seed(100)
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})
display(df1)
df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})
display(df2)
# Solution
pd.merge(df1, df2, how='inner', left_on=['fruit', 'weight'], right_on=['pazham', 'kilo'], suffixes=['_left', '_right'])

Unnamed: 0,fruit,weight,price
0,apple,high,8
1,banana,medium,8
2,orange,low,3
3,apple,high,7
4,banana,medium,7
5,orange,low,0
6,apple,high,10
7,banana,medium,14
8,orange,low,4


Unnamed: 0,pazham,kilo,price
0,apple,high,2
1,orange,low,5
2,pine,high,2
3,apple,low,2
4,orange,high,14
5,pine,low,2


Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,8,apple,high,2
1,apple,high,7,apple,high,2
2,apple,high,10,apple,high,2
3,orange,low,3,orange,low,5
4,orange,low,0,orange,low,5
5,orange,low,4,orange,low,5


71. 如何从dataframe移除存在于另一个dataframe的行？

In [56]:
# Input
df1 = pd.DataFrame({'fruit': ['apple', 'orange', 'banana'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.arange(9)})
display(df1)

df2 = pd.DataFrame({'fruit': ['apple', 'orange', 'pine'] * 2,
                    'weight': ['high', 'medium'] * 3,
                    'price': np.arange(6)})
display(df2)

# Solution
# 加入.all(1)的目的是去除空行
display(df1[~df1.isin(df2).all(1)])

Unnamed: 0,fruit,weight,price
0,apple,high,0
1,orange,medium,1
2,banana,low,2
3,apple,high,3
4,orange,medium,4
5,banana,low,5
6,apple,high,6
7,orange,medium,7
8,banana,low,8


Unnamed: 0,fruit,weight,price
0,apple,high,0
1,orange,medium,1
2,pine,high,2
3,apple,medium,3
4,orange,high,4
5,pine,medium,5


Unnamed: 0,fruit,weight,price
2,banana,low,2
3,apple,high,3
4,orange,medium,4
5,banana,low,5
6,apple,high,6
7,orange,medium,7
8,banana,low,8


72. 如何获得两列相同值的索引？

In [57]:
# Input
np.random.seed(90)
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})
display(df)
# Solution
np.where(df.fruit1 == df.fruit2)

Unnamed: 0,fruit1,fruit2
0,orange,banana
1,banana,orange
2,apple,banana
3,banana,orange
4,orange,apple
5,banana,apple
6,banana,banana
7,apple,apple
8,banana,apple
9,apple,apple


(array([6, 7, 9], dtype=int64),)

73. 如何基于某列创建lags(舍弃最后一行的值)与leads（舍弃第一行的值）列？

In [58]:
# Input
np.random.seed(90)
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd'))
display(df)
# Solution
df['a_lag1'] = df['a'].shift(1)
df['b_lead1'] = df['b'].shift(-1)
display(df)

Unnamed: 0,a,b,c,d
0,92,30,32,68
1,40,69,59,38
2,19,75,97,52
3,31,81,19,78
4,83,10,1,1


Unnamed: 0,a,b,c,d,a_lag1,b_lead1
0,92,30,32,68,,69.0
1,40,69,59,38,92.0,75.0
2,19,75,97,52,40.0,81.0
3,31,81,19,78,19.0,10.0
4,83,10,1,1,31.0,


74. 如何将一个文本列分割为两个单独列，并且将第一行作为header？(对于导入数据非常有用)

In [59]:
# Input
df = pd.DataFrame(["STD, City    State",
"33, Kolkata    West Bengal",
"44, Chennai    Tamil Nadu",
"40, Hyderabad    Telengana",
"80, Bangalore    Karnataka"], columns=['row'])

display(df)
# Solution
df_out = df.row.str.split(',|\t', expand=True)
# display(df_out)
# Make first row as header
new_header = df_out.iloc[0]
df_out = df_out[1:]
df_out.columns = new_header
display(df_out)

Unnamed: 0,row
0,"STD, City State"
1,"33, Kolkata West Bengal"
2,"44, Chennai Tamil Nadu"
3,"40, Hyderabad Telengana"
4,"80, Bangalore Karnataka"


Unnamed: 0,STD,City State
1,33,Kolkata West Bengal
2,44,Chennai Tamil Nadu
3,40,Hyderabad Telengana
4,80,Bangalore Karnataka


75. 如何统计DataFrame中的唯一值出现的频率？

In [61]:
# Input
np.random.seed(98)
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns = list('abcd'))
display(df)
# Solution
print(df.values.ravel())
pd.value_counts(df.values.ravel())

Unnamed: 0,a,b,c,d
0,5,2,5,8
1,7,3,8,3
2,6,3,4,8
3,5,7,5,3
4,7,6,6,7


[5 2 5 8 7 3 8 3 6 3 4 8 5 7 5 3 7 6 6 7]


7    4
5    4
3    4
8    3
6    3
4    1
2    1
dtype: int64