# Pandas实用示例

1. 如何导入Pandas以及确认版本？

In [3]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
print(pd.__version__)
print(pd.show_versions(as_json=True))

0.23.4
{'system': {'commit': None, 'python': '3.6.2.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 94 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.23.4', 'pytest': '3.8.0', 'pip': '18.1', 'setuptools': '40.3.0', 'Cython': '0.28.5', 'numpy': '1.14.0rc1', 'scipy': '1.0.0', 'pyarrow': None, 'xarray': None, 'IPython': '6.5.0', 'sphinx': None, 'patsy': None, 'dateutil': '2.7.3', 'pytz': '2018.5', 'blosc': None, 'bottleneck': None, 'tables': None, 'numexpr': None, 'feather': None, 'matplotlib': '2.2.3', 'openpyxl': '2.5.9', 'xlrd': '1.1.0', 'xlwt': None, 'xlsxwriter': None, 'lxml': '4.2.5', 'bs4': '4.6.3', 'html5lib': '1.0.1', 'sqlalchemy': None, 'pymysql': None, 'psycopg2': None, 'jinja2': '2.10', 's3fs': None, 'fastparquet': None, 'pandas_gbq': None, 'pandas_datareader': None}}
None


2. 如何从list，Numpy Array以及字典创建Series?

In [7]:
# Inputs
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# Solution
ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)
print(ser1.head())
print(ser2.head())
print(ser3.head())

0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64


3. 如何将Series的索引转换为Dataframe的一列数据?

In [10]:
# Input
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
# 通过reset_index方法，可以将原有的索引备份为dataframe的一列
df = ser.to_frame().reset_index()
display(df.head())

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


4. 如果将很多Series合并为一个dataframe？

In [15]:
# Input
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Solution 1
df = pd.concat([ser1, ser2], axis=1)
display(df.head())
# Solution 2
df = pd.DataFrame({'col1': ser1, 'col2': ser2})
display(df.head())

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


5. 如何设置Series索引的名字？

In [16]:
# Input
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# Solution
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

6. 如何从series A获取不存在于series B的items？

In [21]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution 1
display(ser1[~ser1.isin(ser2)])

# Solution 2
display(np.setdiff1d(ser1, ser2))

0    1
1    2
2    3
dtype: int64

array([1, 2, 3], dtype=int64)

7. 如何从series A与series B获取差异项？

In [23]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser_u = pd.Series(np.union1d(ser1, ser2))  # union
display(ser_u)
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect
display(ser_i)
display(ser_u[~ser_u.isin(ser_i)])

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

0    4
1    5
dtype: int64

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

8. 如何从一个数字Series获取minium, 25%, median (50%，即中位数), 75%以及max的项？

高斯分布的概率密度函数如下：

<img src='./image/gause.png' /> <br>

Numpy中可以通过如下方式获取符合正态分布的随机数：```numpy.random.normal(loc=0.0, scale=1.0, size=None)```, 参数的意义为：<br>
loc:float
概率分布的均值，对应着整个分布的中心center

scale:float
概率分布的标准差，对应于分布的宽度，scale越大越矮胖，scale越小，越瘦高

size:int or tuple of ints
输出的shape，默认为None，只输出一个值

我们更经常会用到np.random.randn(size)所谓标准正太分布（μ=0, σ=1），对应于np.random.normal(loc=0, scale=1, size)

In [25]:
# Input
np.random.RandomState(100)
ser = pd.Series(np.random.normal(10, 5, 25))
display(ser)
# Solution
np.percentile(ser, q=[0, 25, 50, 75, 100])

0     16.670629
1      6.485484
2      4.650735
3     13.298420
4     11.935593
5     17.411325
6     12.089921
7     13.916963
8     12.992851
9     10.277640
10    -1.612885
11    16.166393
12    18.333139
13    13.917828
14     8.584499
15    12.029617
16    11.882349
17    16.721891
18     5.433105
19    -0.847964
20     2.543123
21    11.824412
22    16.975809
23     7.725225
24     7.876613
dtype: float64

array([-1.61288539,  7.72522459, 11.93559339, 13.91782759, 18.33313866])

9. 如何获得series中唯一项的出现次数？

In [32]:
# Input
np.random.seed(100)
randint = np.random.randint(8, size=30)
print(randint)
ser = pd.Series(np.take(list('abcdefgh'), randint))
print(ser.head())
# Solution
ser.value_counts()

[0 0 3 7 7 7 0 2 6 4 2 5 2 2 6 2 1 0 0 7 4 3 4 2 0 3 1 5 6 2]
0    a
1    a
2    d
3    h
4    h
dtype: object


c    7
a    6
h    4
e    3
g    3
d    3
b    2
f    2
dtype: int64

10. 如何仅仅保留前两位出现频率最高的项，并且替代其他的值为'Other'?

In [44]:
# Input
np.random.seed(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
print(ser)
# Solution
print("Top 2 Freq:")
top2 = ser.value_counts()[:2]
print(top2)
print(~ser.isin(ser.value_counts().index[:2]))
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

0     1
1     1
2     4
3     4
4     4
5     4
6     1
7     3
8     3
9     1
10    3
11    2
dtype: int32
Top 2 Freq:
4    4
1    4
dtype: int64
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7      True
8      True
9     False
10     True
11     True
dtype: bool


0         1
1         1
2         4
3         4
4         4
5         4
6         1
7     Other
8     Other
9         1
10    Other
11    Other
dtype: object

11. 如何将数字series从小到大分为10个相等数量的分组，并且将每一项替换为分组名称？

In [48]:
# Input
np.random.seed(100)
ser = pd.Series(np.random.random(20))
print(ser)

# Solution
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], 
        labels=['1st', '2nd', '3rd', '4th', '5th', 
                '6th', '7th', '8th', '9th', '10th'])

0     0.543405
1     0.278369
2     0.424518
3     0.844776
4     0.004719
5     0.121569
6     0.670749
7     0.825853
8     0.136707
9     0.575093
10    0.891322
11    0.209202
12    0.185328
13    0.108377
14    0.219697
15    0.978624
16    0.811683
17    0.171941
18    0.816225
19    0.274074
dtype: float64


0      6th
1      5th
2      6th
3      9th
4      1st
5      2nd
6      7th
7      9th
8      2nd
9      7th
10    10th
11     4th
12     3rd
13     1st
14     4th
15    10th
16     8th
17     3rd
18     8th
19     5th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

12. 如何将numpy array转换为给定shape的dataframe？

In [50]:
np.random.seed(100)
# Input
ser = pd.Series(np.random.randint(1, 10, 35))

# Solution
df = pd.DataFrame(ser.values.reshape(7,5))
display(df)

Unnamed: 0,0,1,2,3,4
0,9,9,4,8,8
1,1,5,3,6,3
2,3,3,2,1,9
3,5,1,7,3,5
4,2,6,4,5,5
5,4,8,2,2,8
6,8,1,3,4,3


13. 如何从series中找到公约数为3的项目的索引？

In [52]:
np.random.seed(99)
# Input
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# Solution
np.argwhere(ser % 3==0)

0    2
1    4
2    9
3    9
4    3
5    5
6    6
dtype: int32


array([[2],
       [3],
       [4],
       [6]], dtype=int64)

14. 如何从series获得给定位置的items?

In [55]:
# Input
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Solution 1
print(ser[pos])

# Solution 2
print(ser.take(pos))

0     a
4     e
8     i
14    o
20    u
dtype: object
0     a
4     e
8     i
14    o
20    u
dtype: object


15. 如何将两个series垂直与水平叠加?

In [58]:
# Input
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Output
# Vertical
# Solution 1
display(ser1.append(ser2))
# Solution 2
vt = pd.concat([ser1, ser2], axis=0)
display(vt)

# Horizontal
df = pd.concat([ser1, ser2], axis=1)
display(df)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


16. 如何获得series A与series B的共同项的索引（相对A来说）？

In [70]:
# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
print([np.where(i == ser1)[0].tolist()[0] for i in ser2])

print(pd.Index(ser1))
# Solution 2
print([pd.Index(ser1).get_loc(i) for i in ser2])

[5, 4, 0, 8]
Int64Index([10, 9, 6, 5, 3, 1, 12, 8, 13], dtype='int64')
[5, 4, 0, 8]


17. 如何对series计算均方差 (mean squared error)?

In [75]:
# Input
np.random.seed(100)
truth = pd.Series(range(10))
print(truth)
pred = pd.Series(range(10)) + np.random.random(10)
print(pred)
# Solution
print('均方差为: {0}'.format(np.mean((truth-pred)**2)))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
0    0.543405
1    1.278369
2    2.424518
3    3.844776
4    4.004719
5    5.121569
6    6.670749
7    7.825853
8    8.136707
9    9.575093
dtype: float64
均方差为: 0.27627997996363146


18. 如何将字符串series的每一个单词的第一个字母转换为大写字母？

In [77]:
# Input
ser = pd.Series(['how', 'to', 'convert', 'to', 'uppercase?'])

# Solution 1
print(ser.map(lambda x: x.title()))

# Solution 2
print(ser.map(lambda x: x[0].upper() + x[1:]))

# Solution 3
print(pd.Series([i.title() for i in ser]))

0           How
1            To
2       Convert
3            To
4    Uppercase?
dtype: object
0           How
1            To
2       Convert
3            To
4    Uppercase?
dtype: object
0           How
1            To
2       Convert
3            To
4    Uppercase?
dtype: object


19. 如何获得series中每个单词中的字符长度？

In [82]:
# Input
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution 1
print(ser.map(lambda x: len(x)))

# Solution 2
print(ser.apply(len))

0    3
1    2
2    4
3    4
dtype: int64
0    3
1    2
2    4
3    4
dtype: int64


20. 如何计算series中序列数的差异的差异？

In [83]:
# Input
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Solution
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


21. 如何将series中的日期字符串，替换为timeseries类型？

In [87]:
# Input
ser = pd.Series(['01 Jan 2010', 
                 'January 1, 2018', 
                 '2018-11-05', 
                 '02-02-2011', 
                 '20120303', 
                 '2013/04/04', 
                 '2014-05-05', 
                 '2015-06-06T12:20'])

# Solution 1
from dateutil.parser import parse
print(ser.map(lambda x: parse(x)))

# Solution 2
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2018-01-01 00:00:00
2   2018-11-05 00:00:00
3   2011-02-02 00:00:00
4   2012-03-03 00:00:00
5   2013-04-04 00:00:00
6   2014-05-05 00:00:00
7   2015-06-06 12:20:00
dtype: datetime64[ns]


0   2010-01-01 00:00:00
1   2018-01-01 00:00:00
2   2018-11-05 00:00:00
3   2011-02-02 00:00:00
4   2012-03-03 00:00:00
5   2013-04-04 00:00:00
6   2014-05-05 00:00:00
7   2015-06-06 12:20:00
dtype: datetime64[ns]

22. 如何从日期字符串series获得day of month, week number, day of year以及day of week?

In [93]:
# Input
ser = pd.Series(['1 Jan 2010', 
                 '02-02-2011', 
                 '20120303', 
                 '2013/04/04', 
                 '2014-05-05', 
                 '2015-06-06T12:20'])

# Solution
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number
# 注意：因为2010-01-01是周五，所以计算为上一年的第53周
print("Week number: ", ser_ts.dt.weekofyear.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week
print("Day of week: ", ser_ts.dt.weekday_name.tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


23. 如何将year-month字符串，转换为对应月的第四天的日期？

In [99]:
import pandas as pd
# Input
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Solution 1
from dateutil.parser import parse
# Parse the date
ser_ts = ser.map(lambda x: parse(x))

# Construct date string with date as 4
ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '04'
print(ser_datestr)
# Format it.
print([parse(i).strftime('%Y-%m-%d') for i in ser_datestr])

# Solution 2
print(ser.map(lambda x: parse('04 ' + x)))

0    2010-1-04
1    2011-2-04
2    2012-3-04
dtype: object
['2010-01-04', '2011-02-04', '2012-03-04']
0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]


24. 如何从一个字符串series获得至少含有两个元音字母的单词？

In [100]:
# Input
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) 
                              for i in list('aeiou')]) >= 2)
print(mask)
print(ser[mask])

0     True
1     True
2    False
3    False
4     True
dtype: bool
0     Apple
1    Orange
4     Money
dtype: object


25. 如何从邮件地址series获得合法的邮件地址？

In [115]:
# Input
emails = pd.Series(['buying books at amazom.com', 
                    'rameses@egypt.com', 
                    'matt@t.co', 
                    'narendra@modi.com'])

import re
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Solution 1 (as series of strings)
mask = emails.map(lambda x: bool(re.match(pattern, x)))
print(emails[mask])

# Solution 2 (as series of list)
print(emails.str.findall(pattern, flags=re.IGNORECASE))

# Solution 3 (as list)
print([x[0] for x in [re.findall(pattern, email) 
                for email in emails] if len(x) > 0])

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object
0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object
['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']


26. 如何根据另一个series进行分组，并获得均值？

In [119]:
# Input
np.random.seed(100)
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
print(fruit)
weights = pd.Series(np.linspace(1, 10, 10))
print(weights)
# Solution
# 这里分组的概念为：apple的items为0, 1, 2, 5; 
# banana的items为7, 
# carrot的items为3, 4, 6, 8, 9
# 求均值，即将对应items的weights的items求均值
print(weights.groupby(fruit).mean())

0     apple
1     apple
2     apple
3    carrot
4    carrot
5     apple
6    carrot
7    banana
8    carrot
9    carrot
dtype: object
0     1.0
1     2.0
2     3.0
3     4.0
4     5.0
5     6.0
6     7.0
7     8.0
8     9.0
9    10.0
dtype: float64
apple     3.0
banana    8.0
carrot    7.0
dtype: float64


27. 如何基于2个series获得[欧式距离](https://en.wikipedia.org/wiki/Euclidean_distance)？

<img src='./image/euclidean.svg' />

In [131]:
# Input
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
# Solution 
print(sum((p - q)**2)**.5)
# Solution (using func)
print(np.linalg.norm(p-q))

18.16590212458495
18.16590212458495


28. 如何在series中获得局部极值(local maxima)的索引?

In [134]:
# Input
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
print(np.diff(ser))
# 负数标注为-1，正数标注为1
print(np.sign(np.diff(ser)))
dd = np.diff(np.sign(np.diff(ser)))
print(dd)
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

[ 8 -7  1  5  1 -8  5 -4]
[ 1 -1  1  1  1 -1  1 -1]
[-2  2  0  0 -2  2 -2]


array([1, 5, 7], dtype=int64)

29. 如何将字符串的空格，用出现频率最小的字符进行替换？

In [138]:
# Input
my_str = 'dbc deb abed gade'

# Solution
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
print(freq)
# 因为频率次数是降序排列，所以取最后一个item的方法，直接用index[-1]即可。
least_freq = freq.dropna().index[-1]

"".join(ser.replace(' ', least_freq))

d    4
     3
b    3
e    3
a    2
c    1
g    1
dtype: int64
Index(['d', ' ', 'b', 'e', 'a', 'c', 'g'], dtype='object')


'dbcgdebgabedggade'

30. 生成开始于2000-01-01之后的10个周末(周六)的TimeSeries对象作为index，并且附带10个随机整数，作为values？

In [140]:
# Solution
np.random.seed(88)
ser = pd.Series(np.random.randint(1,10,10), 
                pd.date_range('2000-01-01', 
                              periods=10, 
                              freq='W-SAT'))
ser

2000-01-01    9
2000-01-08    1
2000-01-15    2
2000-01-22    5
2000-01-29    6
2000-02-05    2
2000-02-12    1
2000-02-19    6
2000-02-26    1
2000-03-04    3
Freq: W-SAT, dtype: int32

31. 如何将缺失的值，通过上一个非nan数值进行填充？

In [147]:
# Input
ser = pd.Series([1,10,3, np.nan], 
                index=pd.to_datetime(['2000-01-01', 
                                      '2000-01-03', 
                                      '2000-01-06', 
                                      '2000-01-08']))

print(ser)
print(ser.resample('D'))
# Solution
print(ser.resample('D').ffill())  # fill with previous value

# Alternatives
print(ser.resample('D').bfill())  # fill with next value
print(ser.resample('D').bfill().ffill())  # fill next else prev value

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64
DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64
2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     3.0
2000-01-05     3.0
2000-01-06     3.0
2000-01-07     NaN
2000-01-08     NaN
Freq: D, dtype: float64
2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     3.0
2000-01-05     3.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     3.0
Freq: D, dtype: float64


仅仅对value进行填充：

In [153]:
ser = pd.Series([1,10,np.nan, 3, np.nan, 
                 33, 25, 83, np.nan, 
                 22, 45, np.nan,np.nan])
# print(ser)
# 仅仅根据缺失值的上一个非nan值进行填充
# print(ser.ffill())
# 根据缺失值的下一个非nan值进行填充，如果找不到，
# 则用上一个非nan值进行填充
print(ser.bfill().ffill())

0      1.0
1     10.0
2      3.0
3      3.0
4     33.0
5     33.0
6     25.0
7     83.0
8     22.0
9     22.0
10    45.0
11    45.0
12    45.0
dtype: float64


32. 计算自相关的前10个滞后的series，找出哪一个滞后有最大的相关性。

自相关性是指随机误差项的各期望值之间存在着相关关系，称随机误差项之间存在自相关性（auto-correlation）或序列相关，于1972年提出。

更详细的名词解释：

自相关（Auto-correlation）
是对信号相关程度的一种度量，也就是说自相关可以看作是信号与自身的延迟信号相乘后的乘积进行积分运算。
在某些领域，自相关函数等同于自协方差。
随机信号的自相关函数与其功率谱是傅氏变换对（随机信号无法得到具体的函数表达式，只有其统计信息），通过对接受信号的自相关运算可以进行频谱分析。同时，自相关在信号检测中也有很重要的作用，是在误码最小原则下的最佳接收准则。

同一时间函数在瞬时t和t+a的两个值相乘积的平均值作为延迟时间t的函数，是信号与延迟后信号之间相似性的度量。延迟时间为零时，即为信号的均方值，此时它的值最大。

更详细的解释： [自相关性](https://www.zybuluo.com/evilking/note/753058)

In [161]:
np.random.seed(80)
# Input
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))
print(ser)
# Solution
autocorrelations = [ser.autocorr(i).round(2) for i in range(11)]
print(autocorrelations[1:])
print('Lag having highest correlation: ', 
      np.argmax(np.abs(autocorrelations[1:]))+1)

0     19.998529
1      4.088030
2     11.952110
3     -7.807292
4      7.735325
5     12.028672
6      0.386326
7     -2.436107
8      3.188510
9      6.436470
10    13.940738
11    -5.969706
12     9.105098
13    18.491409
14    10.261007
15    27.638082
16     4.701879
17    27.867192
18    10.838846
19     2.418162
dtype: float64
[-0.08, 0.22, 0.06, 0.02, 0.35, -0.52, -0.26, 0.19, -0.36, -0.06]
Lag having highest correlation:  6


33. 导入csv时，如何每次仅仅读取50行数据？

read_csv的函数中有个参数：chunksize

通过指定一个chunksize分块大小来读取文件，返回的是一个可迭代的对象TextFileReader

In [170]:
# Solution 1: Use chunks and for-loop
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 chunksize=50)
df2 = pd.DataFrame()
for chunk in df:
    df2 = df2.append(chunk.iloc[0,:])
display(df2)

Unnamed: 0,age,b,chas,crim,dis,indus,lstat,medv,nox,ptratio,rad,rm,tax,zn
0,65.2,396.9,0.0,0.00632,4.09,2.31,4.98,24.0,0.538,15.3,1.0,6.575,296.0,18.0
50,45.7,395.56,0.0,0.08873,6.8147,5.64,13.45,19.7,0.439,16.8,4.0,5.963,243.0,21.0
100,79.9,394.76,0.0,0.14866,2.7778,8.56,9.42,27.5,0.52,20.9,5.0,6.727,384.0,0.0
150,97.3,372.8,0.0,1.6566,1.618,19.58,14.1,21.5,0.871,14.7,5.0,6.122,403.0,0.0
200,13.9,384.3,0.0,0.01778,7.6534,1.47,4.45,32.9,0.403,17.0,3.0,7.135,402.0,95.0
250,13.0,396.28,0.0,0.1403,7.3967,5.86,5.9,24.4,0.431,19.1,7.0,6.487,330.0,22.0
300,47.4,390.86,0.0,0.04417,7.8278,2.24,6.07,24.8,0.4,14.8,5.0,6.871,358.0,70.0
350,44.4,396.9,0.0,0.06211,8.7921,1.25,5.98,22.9,0.429,19.7,1.0,6.49,335.0,40.0
400,100.0,396.9,0.0,25.0461,1.5888,18.1,26.77,5.6,0.693,20.2,24.0,5.987,666.0,0.0
450,92.6,0.32,0.0,6.71772,2.3236,18.1,17.44,13.4,0.713,20.2,24.0,6.749,666.0,0.0


In [173]:
# Solution 2: Use chunks and list comprehension
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 chunksize=50)
df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
display(df2.head())
df2 = df2.transpose()
display(df2.head())

14

Unnamed: 0,0,50,100,150,200,250,300,350,400,450,500
crim,0.00632,0.08873,0.14866,1.6566,0.01778,0.1403,0.04417,0.06211,25.0461,6.71772,0.22438
zn,18.0,21.0,0.0,0.0,95.0,22.0,70.0,40.0,0.0,0.0,0.0
indus,2.31,5.64,8.56,19.58,1.47,5.86,2.24,1.25,18.1,18.1,9.69
chas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nox,0.538,0.439,0.52,0.871,0.403,0.431,0.4,0.429,0.693,0.713,0.585


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
50,0.08873,21.0,5.64,0.0,0.439,5.963,45.7,6.8147,4.0,243.0,16.8,395.56,13.45,19.7
100,0.14866,0.0,8.56,0.0,0.52,6.727,79.9,2.7778,5.0,384.0,20.9,394.76,9.42,27.5
150,1.6566,0.0,19.58,0.0,0.871,6.122,97.3,1.618,5.0,403.0,14.7,372.8,14.1,21.5
200,0.01778,95.0,1.47,0.0,0.403,7.135,13.9,7.6534,3.0,402.0,17.0,384.3,4.45,32.9
