In [4]:
import numpy as np
import pandas as pd  # version == 1.1.4, 使用 conda install pandas 升级

# 一、文件的读取和写入

### 1. 文件读取（csv, excel, txt）

读取文件常用(通用)参数：
> header = None : 第一行不作为列名
>
> index_col : 指定列作为索引
>
> usecols : 读取列的集合，默认读取 all
>
> parse_dates : 需要转化为时间的列
>
> nrows : 读取的数据行数
>
> sep ： 分割参数 (正则参数)【针对 read_table: txt 文件】

In [28]:
df_csv = pd.read_csv('./data/my_csv.csv')
print('------------- 1. csv ---------------\n{}\n'.format(df_csv))
df_csv = pd.read_csv('./data/my_csv.csv', header = None)
print('------ csv - None header --------\n{}\n'.format(df_csv))

df_txt = pd.read_table('./data/my_table.txt')  # 相对而言，这个最特殊
print('------------- 2. txt ----------------\n{}\n'.format(df_txt))
df_txt = pd.read_table('./data/my_table.txt', index_col = ['col1', 'col2'])  # 相对而言，这个最特殊
print('------- txt - index_col ---------\n{}\n'.format(df_txt))

df_excel = pd.read_excel('./data/my_excel.xlsx')
print('-------------- 3. excel -------------\n{}\n'.format(df_excel))
df_excel = pd.read_excel('./data/my_excel.xlsx', usecols = ['col1', 'col2'])
print('------- excel - usercols --------\n{}\n'.format(df_excel))
df_excel = pd.read_excel('./data/my_excel.xlsx', parse_dates = ['col5'])
print('------- excel - parse_dates --------\n{}\n'.format(df_excel))
df_excel = pd.read_excel('./data/my_excel.xlsx', nrows = 3)
print('------- excel - nrows --------\n{}\n'.format(df_excel))

df_txt = pd.read_table('./data/my_table_special_sep.txt')
print('----------- txt --------------\n{}\n'.format(df_txt))
df_txt = pd.read_table('./data/my_table_special_sep.txt', sep = ' \|\|\|\| ', engine = 'python')
print('------- txt - sep --------\n{}\n'.format(df_txt))

------------- 1. csv ---------------
   col1 col2  col3    col4      col5
0     2    a   1.4   apple  2020/1/1
1     3    b   3.4  banana  2020/1/2
2     6    c   2.5  orange  2020/1/5
3     5    d   3.2   lemon  2020/1/7

------ csv - None header --------
      0     1     2       3         4
0  col1  col2  col3    col4      col5
1     2     a   1.4   apple  2020/1/1
2     3     b   3.4  banana  2020/1/2
3     6     c   2.5  orange  2020/1/5
4     5     d   3.2   lemon  2020/1/7

------------- 2. txt ----------------
   col1 col2  col3             col4
0     2    a   1.4   apple 2020/1/1
1     3    b   3.4  banana 2020/1/2
2     6    c   2.5  orange 2020/1/5
3     5    d   3.2   lemon 2020/1/7

------- txt - index_col ---------
           col3             col4
col1 col2                       
2    a      1.4   apple 2020/1/1
3    b      3.4  banana 2020/1/2
6    c      2.5  orange 2020/1/5
5    d      3.2   lemon 2020/1/7

-------------- 3. excel -------------
   col1 col2  col3    co

### 2. 数据写入

数据写入 csv 、 txt 文件中
> index = false : 当索引没有特殊意义，保存时去除
>
> to_csv ： 也可保存为 csv 文件，且允许自定义分隔符
> 
> 安装 tabulate , 将表格快速转化为 markdown 、 latex
>

In [33]:
df_csv.to_csv('./data/my_csv_saved.csv', index = False)
df_excel.to_excel('./data/my_excel_saved.xlsx', index = False)
df_txt.to_csv('./data/my_txt_saved.txt', sep = '\t', index = False)

print('------ markdown --------\n{}\n'.format(df_csv.to_markdown()))
print('------ markdown --------\n{}\n'.format(df_csv.to_latex()))

------ markdown --------
|    | 0    | 1    | 2    | 3      | 4        |
|---:|:-----|:-----|:-----|:-------|:---------|
|  0 | col1 | col2 | col3 | col4   | col5     |
|  1 | 2    | a    | 1.4  | apple  | 2020/1/1 |
|  2 | 3    | b    | 3.4  | banana | 2020/1/2 |
|  3 | 6    | c    | 2.5  | orange | 2020/1/5 |
|  4 | 5    | d    | 3.2  | lemon  | 2020/1/7 |

------ markdown --------
\begin{tabular}{llllll}
\toprule
{} &     0 &     1 &     2 &       3 &         4 \\
\midrule
0 &  col1 &  col2 &  col3 &    col4 &      col5 \\
1 &     2 &     a &   1.4 &   apple &  2020/1/1 \\
2 &     3 &     b &   3.4 &  banana &  2020/1/2 \\
3 &     6 &     c &   2.5 &  orange &  2020/1/5 \\
4 &     5 &     d &   3.2 &   lemon &  2020/1/7 \\
\bottomrule
\end{tabular}




# 二、基本数据结构

### 1. Seriel

四部分组成
> data: 序列值
>
> index: 索引
>
> dtype: 存储类型
>
> name: 序列名字
>
> object: 纯字符串序列类型可以认为是 object 类型
>
> \[index_item\]: 取出单个索引的值

In [34]:
s = pd.Series(data = [100, 'a', {'dic1':5}],
             index = pd.Index(['id1', 20, 'third'], name = 'my_idx'),
             dtype = 'object',
             name = 'my_name')
s

my_idx
id1              100
20                 a
third    {'dic1': 5}
Name: my_name, dtype: object

In [35]:
print('---- s.values -----\n{}\n'.format(s.values))
print('---- s.index -----\n{}\n'.format(s.index))
print('---- s.dtype -----\n{}\n'.format(s.dtype))
print('---- s.name -----\n{}\n'.format(s.name))
print('---- s.shape -----\n{}\n'.format(s.shape))

---- s.values -----
[100 'a' {'dic1': 5}]

---- s.index -----
Index(['id1', 20, 'third'], dtype='object', name='my_idx')

---- s.dtype -----
object

---- s.name -----
my_name

---- s.shape -----
(3,)



### 2. DataFrame

> a. 在 Series 基础上增加了列索引，数据框可以由二维的 data 和行列索引来构造
> 
> b. (更多时候)从列索引名到数据映射来构造数据框，再加上行索引
>
> c. \[col_name\], \[col_list\] 来取出相应的列和由多个列组成的表， 结果分别为 Series 和 DataFrame
>


In [48]:
# a. 在 Series 基础上增加了列索引，数据框可以由二维的 data 和行列索引来构造
data = [[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.2]]
df = pd.DataFrame(data = data, 
                  index = ['row_%d'%i for i in range(3)],
                  columns = ['col_0', 'col_1', 'col_2'])
print(df)
print()

# b. (更多时候)从列索引名到数据映射来构造数据框，再加上行索引
df = pd.DataFrame(data = {'col_0': [1, 2, 3],
                          'col_1': list('abc'),
                          'col_2': [1.2, 2.2, 3.2]},
                  index = ['row_%d'%i for i in range(3)])
print(df)
print()

# c. \[col_name\], \[col_list\] 来取出相应的列和由多个列组成的表， 结果分别为 Series 和 DataFrame
print('------- df[\'col_0\'] -------\n{}\n'.format(df['col_0']))
print()

print('------- df[[\'col_0\', \'col_1\'] -------\n{}\n'.format(df[['col_0', 'col_1']]))
print()

# 取出相应属性【values】【index】【columns】【dtypes】【shape】【T】
print('------- df.T -------\n{}\n'.format(df.T))


       col_0 col_1  col_2
row_0      1     a    1.2
row_1      2     b    2.2
row_2      3     c    3.2

       col_0 col_1  col_2
row_0      1     a    1.2
row_1      2     b    2.2
row_2      3     c    3.2

------- df['col_0'] -------
row_0    1
row_1    2
row_2    3
Name: col_0, dtype: int64


------- df[['col_0', 'col_1'] -------
       col_0 col_1
row_0      1     a
row_1      2     b
row_2      3     c


------- df.T -------
      row_0 row_1 row_2
col_0     1     2     3
col_1     a     b     c
col_2   1.2   2.2   3.2



# 常用基本函数


### 0. 测试数据 / 展示数据

In [62]:
file = './data/learn_pandas.csv'  # learn_pandas.csv:四所学校学生体测个人信息
df = pd.read_csv(file)
print(df.columns)
print()
df = df[df.columns[:7]]  # 暂时只考虑前 7 列数据
print('------- head -------')
print(df.head(5))
print()
print('------- tail -------')
print(df.tail(3))
print()
print('------- info -------')
print(df.info)
print()
print('------- describe --------')
print(df.describe)

Index(['School', 'Grade', 'Name', 'Gender', 'Height', 'Weight', 'Transfer',
       'Test_Number', 'Test_Date', 'Time_Record'],
      dtype='object')

------- head -------
                          School      Grade            Name  Gender  Height  \
0  Shanghai Jiao Tong University   Freshman    Gaopeng Yang  Female   158.9   
1              Peking University   Freshman  Changqiang You    Male   166.5   
2  Shanghai Jiao Tong University     Senior         Mei Sun    Male   188.9   
3               Fudan University  Sophomore    Xiaojuan Sun  Female     NaN   
4               Fudan University  Sophomore     Gaojuan You    Male   174.0   

   Weight Transfer  
0    46.0        N  
1    70.0        N  
2    89.0        N  
3    41.0        N  
4    74.0        N  

------- tail -------
                            School      Grade            Name  Gender  Height  \
197  Shanghai Jiao Tong University     Senior  Chengqiang Chu  Female   153.9   
198  Shanghai Jiao Tong University     Senio

### 2. 特征统计函数
sum, mean, median, var, std, max, min
> quantile 分位数
>
> count 非缺失值个数
>
> idxmax 最大值对应的索引
>
> axis 0--列，1--行

In [65]:
de_demo = df[['Height', 'Weight']]
print('----- mean -------')
print(de_demo.mean())
print()
print('----- max -------')
print(de_demo.max())
print()
print('----- quantile -------')
print(de_demo.quantile(0.75))
print()
print('----- count -------')
print(de_demo.count())
print()
print('----- idxmax -------')
print(de_demo.idxmax())
print()
print('----- mean1_head -------')
print(de_demo.mean(axis = 1).head())
print()

----- mean -------
Height    163.218033
Weight     55.015873
dtype: float64

----- max -------
Height    193.9
Weight     89.0
dtype: float64

----- quantile -------
Height    167.5
Weight     65.0
Name: 0.75, dtype: float64

----- count -------
Height    183
Weight    189
dtype: int64

----- idxmax -------
Height    193
Weight      2
dtype: int64

----- mean1_head -------
0    102.45
1    118.25
2    138.95
3     41.00
4    124.00
dtype: float64



### 3. 唯一值函数

> unique 唯一值组成的列表
>
> nunique 唯一值的个数
>
> value_counts 得到唯一值及其出现次数
>
> drop_duplicates 多个列组合唯一值，keep：
>
> --- 1. 默认 first = 组合保留第一次出现所在行
>
> --- 2. last - 最后一次出现所在行
>
> --- 3. False - 所有重复组合所在行删除
>
> duplicates 返回唯一值布尔列表，不删除重复，keep参数一致

In [67]:
print('------- unique ----------')
print(df['School'].unique())
print()
print('------- nunique ---------')
print(df['School'].nunique())
print()
print('------- value_count --------')
print(df['School'].value_counts())
print()

------- unique ----------
['Shanghai Jiao Tong University' 'Peking University' 'Fudan University'
 'Tsinghua University']

------- nunique ---------
4

------- value_count --------
Tsinghua University              69
Shanghai Jiao Tong University    57
Fudan University                 40
Peking University                34
Name: School, dtype: int64



In [77]:
# drop_duplicates 多个列组合唯一值，keep：
# 默认 first = 组合保留第一次出现所在行
# last - 最后一次出现所在行
# False - 所有重复组合所在行删除
df_demo = df[['Gender', 'Transfer', 'Name']]
print(df_demo)
print('-------------------------------')
print(df_demo.drop_duplicates(['Gender', 'Transfer']))
print('-------------------------------')
print(df_demo.drop_duplicates(['Gender', 'Transfer'], keep = 'last'))
print('-------------------------------')
print(df_demo.drop_duplicates(['Name', 'Gender'], keep = False).head())
print('-------------------------------')
print(df['School'].drop_duplicates())  # 在 Series 上也可以用
print('-------------------------------')
print(df_demo.drop_duplicates(['Gender', 'Transfer']).head())  # 在 Series 上也可以使用

     Gender Transfer            Name
0    Female        N    Gaopeng Yang
1      Male        N  Changqiang You
2      Male        N         Mei Sun
3    Female        N    Xiaojuan Sun
4      Male        N     Gaojuan You
..      ...      ...             ...
195  Female        N    Xiaojuan Sun
196  Female        N         Li Zhao
197  Female        N  Chengqiang Chu
198    Male        N   Chengmei Shen
199    Male        N     Chunpeng Lv

[200 rows x 3 columns]
-------------------------------
    Gender Transfer            Name
0   Female        N    Gaopeng Yang
1     Male        N  Changqiang You
12  Female      NaN        Peng You
21    Male      NaN   Xiaopeng Shen
36    Male        Y    Xiaojuan Qin
43  Female        Y      Gaoli Feng
-------------------------------
     Gender Transfer            Name
147    Male      NaN        Juan You
150    Male        Y   Chengpeng You
169  Female        Y   Chengquan Qin
194  Female      NaN     Yanmei Qian
197  Female        N  Chengqian

# 4. 替换函数
针对列的操作：
> 映射替换
> 
> --- 1. replace: 通过字典构造、两个列表来替换, 
>
> --- 1. replace: method = 'ffill', 'bfill'前面的违背替换的值，后面的未被替换的值，传入list即要被替换成其他元素的元素
>
> --- 2. str.replace: 对正则替换
>
> --- 3. cat.codes
>
> 逻辑替换
>
> --- 1. where: 替换传入条件为 False 的行 指定替换值 不制定则为NaN
>
> --- 2. mask: 替换传入条件为 True 的行 指定替换值 不制定则为NaN
>
> 数值替换
>
> --- 1. round: 取整, round(a), 保留 a 位小数
>
> --- 2. abs: 取绝对值
>
> --- 3. clip: 截断, clip(a, b), a 代表上界， b 代表下界

In [81]:
# > 映射替换
# > 
# > --- 1. replace: 通过字典构造、两个列表来替换, 
# >
# > --- 1. replace: method = 'ffill', 'bfill'前面的违背替换的值，后面的未被替换的值，传入list即要被替换成其他元素的元素
# >
# > --- 2. str.replace: 对正则替换
# >
# > --- 3. cat.codes
print(df['Gender'].replace({'Female': 0, 'Male':1}).head())
print('----------------------------')
print(df['Gender'].replace(['Female', 'Male'], [0, 1]).head())
print('----------------------------')
s = pd.Series(['a', 1, 'b', 2, 1, 1, 'a'])
print(s.replace([1, 2], method = 'ffill'))
print('----------------------------')
print(s.replace([1, 2], method = 'bfill'))

0    0
1    1
2    1
3    0
4    1
Name: Gender, dtype: int64
----------------------------
0    0
1    1
2    1
3    0
4    1
Name: Gender, dtype: int64
----------------------------
0    a
1    a
2    b
3    b
4    b
5    b
6    a
dtype: object
----------------------------
0    a
1    b
2    b
3    a
4    a
5    a
6    a
dtype: object


In [87]:
# > 逻辑替换
# >
# > --- 1. where: 替换传入条件为 False 的行, 指定替换值 不制定则为NaN
# >
# > --- 2. mask: 替换传入条件为 True 的行, 指定替换值 不制定则为NaN(attention: 传入只需要是 - Series索引一致的布尔序列)
s = pd.Series([-1, 1.2345, 100, -50])
print(s.where(s < 0))
print('----------------------------')
print(s.where(s < 0, 100))
print('----------------------------')
print(s.mask(s < 0))
print('----------------------------')
print(s.mask(s < 0, -100))
print('----------------------------')
print(s.mask(pd.Series([True, False, False, True], index = s.index)))

0    -1.0
1     NaN
2     NaN
3   -50.0
dtype: float64
----------------------------
0     -1.0
1    100.0
2    100.0
3    -50.0
dtype: float64
----------------------------
0         NaN
1      1.2345
2    100.0000
3         NaN
dtype: float64
----------------------------
0   -100.0000
1      1.2345
2    100.0000
3   -100.0000
dtype: float64
----------------------------
0         NaN
1      1.2345
2    100.0000
3         NaN
dtype: float64


In [114]:
# > 数值替换
# >
# > --- 1. round: 取整, round(a), 保留 a 位小数
# >
# > --- 2. abs: 取绝对值
# >
# > --- 3. clip: 截断, clip(a, b), a 代表上界， b 代表下界
s = pd.Series([-1, 1.2345, 100, -50])
print(s.round(2))
print('----------------------------')
print(s.abs())
print('----------------------------')
print(s.clip(0, 2))
print('----------------------------')
# 在 clip 中，超过边界的只能截断为边界值，如果要把超出边界的替换为自定义的值，应当如何做？
print(s.mask(s.clip(0,2) != s, 100))

0     -1.00
1      1.23
2    100.00
3    -50.00
dtype: float64
----------------------------
0      1.0000
1      1.2345
2    100.0000
3     50.0000
dtype: float64
----------------------------
0    0.0000
1    1.2345
2    2.0000
3    0.0000
dtype: float64
----------------------------
0    100.0000
1      1.2345
2    100.0000
3    100.0000
dtype: float64


### 5. 排序函数
> 值排序 sort_values, （略：多级索引，索引设置，ch3，默认 ascending == True 升序
>
> 索引排序 sort_index, level：指定索引层的名字或层数, 对索引排序（大多按字母
>

In [122]:
# > 值排序 sort_values, （略：多级索引，索引设置，ch3
df_demo = df[['Grade', 'Name', 'Height', 'Weight']].set_index(['Grade', 'Name'])
print(df_demo.head(5))
print('----------------------------')
print(df_demo.sort_values('Height').head())
print('----------------------------')
print(df_demo.sort_values('Height', ascending=False).head())
print('----------------------------')
print(df_demo.sort_values(['Weight', 'Height'], ascending = [True, False]).head())  # 对体重升序排序，相同体重，保持身高降序列
print('----------------------------')
print()

                          Height  Weight
Grade     Name                          
Freshman  Gaopeng Yang     158.9    46.0
          Changqiang You   166.5    70.0
Senior    Mei Sun          188.9    89.0
Sophomore Xiaojuan Sun       NaN    41.0
          Gaojuan You      174.0    74.0
----------------------------
                         Height  Weight
Grade     Name                         
Junior    Xiaoli Chu      145.4    34.0
Senior    Gaomei Lv       147.3    34.0
Sophomore Peng Han        147.8    34.0
Senior    Changli Lv      148.7    41.0
Sophomore Changjuan You   150.5    40.0
----------------------------
                        Height  Weight
Grade    Name                         
Senior   Xiaoqiang Qin   193.9    79.0
         Mei Sun         188.9    89.0
         Gaoli Zhao      186.5    83.0
Freshman Qiang Han       185.3    87.0
Senior   Qiang Zheng     183.9    87.0
----------------------------
                       Height  Weight
Grade     Name                     

In [123]:
# > 索引排序 sort_index, level：指定索引层的名字或层数
print(df_demo.sort_index(level = ['Grade', 'Name'], ascending = [True, False]).head())

                        Height  Weight
Grade    Name                         
Freshman Yanquan Wang    163.5    55.0
         Yanqiang Xu     152.4    38.0
         Yanqiang Feng   162.3    51.0
         Yanpeng Lv        NaN    65.0
         Yanli Zhang     165.1    52.0


### 6. apply 方法(attention: 速度比 pandas 内置函数差很多)
apply 常用于DataFrame的行迭代或列迭代
> axis: 
>
> 参数，以一个序列为输入的函数

In [129]:
# apply 常用于DataFrame的行迭代或列迭代
# > axis: 
# >
# > 参数，以一个序列为输入的函数
df_demo = df[['Height', 'Weight']]
def my_mean(x):
    res = x.mean()
    return res
print(df_demo.apply(my_mean))
print('----------------------------')
print(df_demo.apply(lambda x: x.mean()))
print('----------------------------')
print(df_demo.apply(lambda x: x.mean(), axis = 1).head())
print('----------------------------')
# mad 偏离该序列均值绝对值大小的均值
print(df_demo.apply(lambda x: (x - x.mean()).abs().mean()))
print('----------------------------')
print(df_demo.mad())

Height    163.218033
Weight     55.015873
dtype: float64
----------------------------
Height    163.218033
Weight     55.015873
dtype: float64
----------------------------
0    102.45
1    118.25
2    138.95
3     41.00
4    124.00
dtype: float64
----------------------------
Height     6.707229
Weight    10.391870
dtype: float64
----------------------------
Height     6.707229
Weight    10.391870
dtype: float64


# 四、窗口对象
> 滑动窗口 rolling
>
> 扩张窗口 expanding
>
> 指数加权窗口 ewm
>
>

### 1. 滑动窗口
> 滑窗对象
>
> .rolling 得到滑窗对象，参数为窗口大小window，窗口包含当前行
> 
> 滑窗函数, 
>
> --- 1. shift：表示取向前第n个元素的值
>
> --- 2. diff：与向前第n个元素做差（numpy中常表示n阶差分）
>
> --- 3. pct_change：与向前第n个元素相比计算增长率
>
> --- 4. 公共参数 periods = n（default = 1）：
>


In [136]:
# > 滑窗对象
# >
# > .rolling 得到滑窗对象，参数为窗口大小window，窗口包含当前行
s = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([1, 2, 6, 16, 30])
roller = s.rolling(window = 3)
print(roller.mean())
print('----------------------------')
print(roller.sum())
print('----------------------------')
print(roller.cov(s2))
print('----------------------------')
print(roller.corr(s2))
print('----------------------------')
print(roller.apply(lambda x:x.mean()))

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64
----------------------------
0     NaN
1     NaN
2     6.0
3     9.0
4    12.0
dtype: float64
----------------------------
0     NaN
1     NaN
2     2.5
3     7.0
4    12.0
dtype: float64
----------------------------
0         NaN
1         NaN
2    0.944911
3    0.970725
4    0.995402
dtype: float64
----------------------------
0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64


In [150]:
# > 滑窗函数, 
# >
# > --- 1. shift：表示取向前第n个元素的值
# >
# > --- 2. diff：与向前第n个元素做差（numpy中常表示n阶差分）
# >
# > --- 3. pct_change：与向前第n个元素相比计算增长率
# >
# > --- 4. 公共参数 periods = n（default = 1）：
s = pd.Series([1, 3, 6, 10, 15])
print(s.shift(2))
print('----------------------------')
print(s.diff(3))
print('----------------------------')
print(s.pct_change())
print('----------------------------')
print(s.shift(-1))
print('----------------------------')
print(s.diff(-2))
print('----------------------------')
print(s.shift(2))
print('----------------------------')
print(s.rolling(3).apply(lambda x: list(x)[0]))
print('----------------------------')
print(s.diff(3))
print('----------------------------')
print(s.rolling(4).apply(lambda x: list(x)[len(x) - 1] - list(x)[0]))
# 练一练
# rolling对象的默认窗口方向都是向前的，某些情况下用户需要向后的窗口，
# 例如对1,2,3设定向后窗口为2的sum操作，结果为3,5,NaN，
# 此时应该如何实现向后的滑窗操作？（提示：使用shift）
print(pd.Series([1,2,3]).rolling(2).sum().shift(-1))

0    NaN
1    NaN
2    1.0
3    3.0
4    6.0
dtype: float64
----------------------------
0     NaN
1     NaN
2     NaN
3     9.0
4    12.0
dtype: float64
----------------------------
0         NaN
1    2.000000
2    1.000000
3    0.666667
4    0.500000
dtype: float64
----------------------------
0     3.0
1     6.0
2    10.0
3    15.0
4     NaN
dtype: float64
----------------------------
0   -5.0
1   -7.0
2   -9.0
3    NaN
4    NaN
dtype: float64
----------------------------
0    NaN
1    NaN
2    1.0
3    3.0
4    6.0
dtype: float64
----------------------------
0    NaN
1    NaN
2    1.0
3    3.0
4    6.0
dtype: float64
----------------------------
0     NaN
1     NaN
2     NaN
3     9.0
4    12.0
dtype: float64
----------------------------
0     NaN
1     NaN
2     NaN
3     9.0
4    12.0
dtype: float64
0    3.0
1    5.0
2    NaN
dtype: float64


### 2. 扩张窗口
逐步扩张、累计窗口，从序列开始扩张到操作位置

\[a1\]\[a1, a2\]\[a1, a2, a3\]\[a1, a2, a3, a4\]

In [157]:
s = pd.Series([1, 3, 6, 10])
print(s.expanding().mean())
# cummax, cumsum, cumprod函数是典型的类扩张窗口函数，请使用expanding对象依次实现它们。
print('----------------------------')
print(s.cummax())
print('----------------------------')
print(s.expanding().max())
print('----------------------------')
print(s.cumsum())
print('----------------------------')
print(s.expanding().sum())
print('----------------------------')
print(s.cumprod())
print('----------------------------')
def exprod(x):
    res = 1
    for i in x:
        res *= i
    return res
print(s.expanding().apply(exprod))

0    1.000000
1    2.000000
2    3.333333
3    5.000000
dtype: float64
----------------------------
0     1
1     3
2     6
3    10
dtype: int64
----------------------------
0     1.0
1     3.0
2     6.0
3    10.0
dtype: float64
----------------------------
0     1
1     4
2    10
3    20
dtype: int64
----------------------------
0     1.0
1     4.0
2    10.0
3    20.0
dtype: float64
----------------------------
0      1
1      3
2     18
3    180
dtype: int64
----------------------------
0      1.0
1      3.0
2     18.0
3    180.0
dtype: float64


## 五、练习
### Ex1：口袋妖怪数据集
现有一份口袋妖怪的数据集，下面进行一些背景说明：

* `#`代表全国图鉴编号，不同行存在相同数字则表示为该妖怪的不同状态

* 妖怪具有单属性和双属性两种，对于单属性的妖怪，`Type 2`为缺失值
* `Total, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed`分别代表种族值、体力、物攻、防御、特攻、特防、速度，其中种族值为后6项之和

In [214]:
df = pd.read_csv('./data/pokemon.csv')
df.head(7)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,,309,39,52,43,60,50,65
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100


1. 对`HP, Attack, Defense, Sp. Atk, Sp. Def, Speed`进行加总，验证是否为`Total`值。

2. 对于`#`重复的妖怪只保留第一条记录，解决以下问题：

* 求第一属性的种类数量和前三多数量对应的种类
* 求第一属性和第二属性的组合种类
* 求尚未出现过的属性组合

3. 按照下述要求，构造`Series`：

* 取出物攻，超过120的替换为`high`，不足50的替换为`low`，否则设为`mid`
* 取出第一属性，分别用`replace`和`apply`替换所有字母为大写
* 求每个妖怪六项能力的离差，即所有能力中偏离中位数最大的值，添加到`df`并从大到小排序

In [165]:
# 1. 对`HP, Attack, Defense, Sp. Atk, Sp. Def, Speed`进行加总，验证是否为`Total`值。
df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].sum(axis=1) == df['Total']

0      True
1      True
2      True
3      True
4      True
       ... 
795    True
796    True
797    True
798    True
799    True
Length: 800, dtype: bool

In [290]:
# 2. 对于`#`重复的妖怪只保留第一条记录，解决以下问题：
df_demo = df.drop_duplicates(['#'])
# * 求第一属性的种类数量和前三多数量对应的种类
print(df_demo['Type 1'].value_counts().head(3))
print('----------------------------')
# * 求第一属性和第二属性的组合种类
df_type = df_demo[['Type 1', 'Type 2']].drop_duplicates()
print(df_type)  # 未删除 Type 2 为 NaN 的情况 
print('----------------------------')
# * 求尚未出现过的属性组合
df_tmp = [[i, j] for i, j in df_type.values]
df_types = set([i for i, j in df_tmp] + [j for i, j in df_tmp])
df_ans = [[i, j] for i in df_types for j in df_types if i == i and [i, j] not in df_tmp and i != j]
print(len(df_ans)) # 太多不打印

Water     105
Normal     93
Grass      66
Name: Type 1, dtype: int64
----------------------------
      Type 1  Type 2
0      Grass  Poison
4       Fire     NaN
6       Fire  Flying
9      Water     NaN
13       Bug     NaN
..       ...     ...
773     Rock   Fairy
778    Ghost   Grass
790   Flying  Dragon
797  Psychic   Ghost
799     Fire   Water

[143 rows x 2 columns]
----------------------------
[['Poison', 'Rock'], ['Poison', 'Steel'], ['Poison', 'Grass'], ['Poison', 'Fire'], ['Poison', 'Psychic'], ['Poison', 'Normal'], ['Poison', 'Ghost'], ['Poison', 'Ice'], ['Poison', 'Electric'], ['Poison', 'Fairy'], ['Rock', 'Poison'], ['Rock', 'Fire'], ['Rock', 'Normal'], ['Rock', 'Ghost'], ['Rock', 'Electric'], ['Flying', 'Poison'], ['Flying', 'Rock'], ['Flying', 'Fighting'], ['Flying', 'Steel'], ['Flying', 'Water'], ['Flying', 'Grass'], ['Flying', 'Fire'], ['Flying', 'Dark'], ['Flying', 'Ground'], ['Flying', 'Bug'], ['Flying', 'Psychic'], ['Flying', 'Normal'], ['Flying', 'Ghost'], ['Flying'

In [405]:
# 3. 按照下述要求，构造`Series`：

# * 取出物攻，超过120的替换为`high`，不足50的替换为`low`，否则设为`mid`
df_attack = df['Attack']
df_attack1 = df_attack.clip(50, 120)
df_attack2 = df_attack1.where(pd.Series([i or j for i, j in zip(df_attack1 == 50, df_attack1 == 120)]), 1)
df_attack3 = df_attack2.replace([1, 50, 120], ['mid', 'low', 'high'])
print(df_attack3.head(10))
print(' ------------------------------------ ')
# * 取出第一属性，分别用`replace`和`apply`替换所有字母为大写
# ----------------- replace ------------------------
df_name = df['Name']
df_name.replace(list(df_name), [i.upper() for i in df_name])
print(df_name)
print(' ------------------------------------ ')
# ----------------- apply --------------------------
df_name = df['Name']
def app_replace(x):
    return x.upper()
df_name.apply(app_replace)
print(df_name)
print(' ------------------------------------ ')
# * 求每个妖怪六项能力的离差，即所有能力中偏离中位数最大的值，添加到`df`并从大到小排序
def app_diff(x):
    tmp = np.array(x)
    quan_05 = np.quantile(tmp, 0.5)
    return tmp - quan_05
df_diff = df[['Attack', 'HP','Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].apply(app_diff)
df['diff'] = df_diff.max(axis = 1)
df.sort_values(['diff'], ascending = False)

0     low
1     mid
2     mid
3     mid
4     mid
5     mid
6     mid
7    high
8     mid
9     low
Name: Attack, dtype: object
 ------------------------------------ 
0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
795                  Diancie
796      DiancieMega Diancie
797      HoopaHoopa Confined
798       HoopaHoopa Unbound
799                Volcanion
Name: Name, Length: 800, dtype: object
 ------------------------------------ 
0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
795                  Diancie
796      DiancieMega Diancie
797      HoopaHoopa Confined
798       HoopaHoopa Unbound
799                Volcanion
Name: Name, Length: 800, dtype: object
 ------------------------------------ 


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,diff
261,242,Blissey,Normal,,540,255,10,10,75,135,55,190.0
121,113,Chansey,Normal,,450,250,5,5,35,105,50,185.0
230,213,Shuckle,Bug,Rock,505,20,10,230,10,230,5,160.0
224,208,SteelixMega Steelix,Steel,Ground,610,75,125,230,55,95,30,160.0
333,306,AggronMega Aggron,Steel,,630,70,140,230,60,80,50,160.0
...,...,...,...,...,...,...,...,...,...,...,...,...
175,161,Sentret,Normal,,215,35,46,34,35,45,20,-25.0
732,664,Scatterbug,Bug,,200,38,35,40,27,25,35,-27.0
446,401,Kricketot,Bug,,194,37,25,41,25,41,25,-28.0
255,236,Tyrogue,Fighting,,210,35,35,35,35,35,35,-30.0


### Ex2：指数加权窗口
1. 作为扩张窗口的`ewm`窗口

在扩张窗口中，用户可以使用各类函数进行历史的累计指标统计，但这些内置的统计函数往往把窗口中的所有元素赋予了同样的权重。事实上，可以给出不同的权重来赋给窗口中的元素，指数加权窗口就是这样一种特殊的扩张窗口。

其中，最重要的参数是`alpha`，它决定了默认情况下的窗口权重为$w_i=(1−\alpha)^i,i\in\{0,1,...,t\}$，其中$i=t$表示当前元素，$i=0$表示序列的第一个元素。

从权重公式可以看出，离开当前值越远则权重越小，若记原序列为$x$，更新后的当前元素为$y_t$，此时通过加权公式归一化后可知：

$$
\begin{split}y_t &=\frac{\sum_{i=0}^{t} w_i x_{t-i}}{\sum_{i=0}^{t} w_i} \\
&=\frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...
+ (1 - \alpha)^{t} x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
+ (1 - \alpha)^{t}}\\\end{split}
$$

对于`Series`而言，可以用`ewm`对象如下计算指数平滑后的序列：

1. 使用 expanding, apply 实现 ewm 滑动加权平滑

2. 从第1问中可以看到，ewm作为一种扩张窗口的特例，只能从序列的第一个元素开始加权。现在希望给定一个限制窗口n，

<u>只对包含自身的最近的n个元素作为窗口进行滑动加权平滑</u>。请根据滑窗函数，给出新的wi与yn的更新公式，并通过rolling窗口实现这一功能。


**解答，更新公式：**
$$
\begin{split}y_n &=\frac{\sum_{i=0}^{n-1} w_i x_{n-i}}{\sum_{i=0}^{n-1} w_i} \\
&=\frac{x_{n -1} + (1 - \alpha)x_{n-2} + (1 - \alpha)^2 x_{n-3} + ...
+ (1 - \alpha)^{n-1} x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
+ (1 - \alpha)^{n-1}}\\\end{split}
$$

In [423]:
# 可以用ewm对象如下计算指数平滑后的序列：
np.random.seed(0)
s = pd.Series(np.random.randint(-1, 2, 30).cumsum())
print(s.head(10))
print(' ------------------------ ')
print(s.ewm(alpha=0.2).mean().head())
print(' ------------------------ ')
# ------------------------------------
# 1. 使用 expanding, apply 实现 ewm 滑动加权平滑
def app_ewm(x, alpha = 0.2):
    x = list(x)
    tmp1 = sum([x[i] * (1 - alpha)**(len(x) - i - 1) for i in range(len(x))])
    tmp2 = sum([(1 - alpha)**i for i in range(len(x))])
    return tmp1 / tmp2
print(s.expanding().apply(app_ewm).head())
# ------------------------------------
# 从第1问中可以看到，ewm作为一种扩张窗口的特例，只能从序列的第一个元素开始加权。现在希望给定一个限制窗口n，
# 只对包含自身的最近的n个元素作为窗口进行滑动加权平滑。请根据滑窗函数，给出新的wi与yn的更新公式，并通过rolling窗口实现这一功能。
s.rolling(window = 4).apply(app_ewm)


0   -1
1   -1
2   -2
3   -2
4   -2
5   -1
6   -2
7   -1
8   -2
9   -3
dtype: int64
 ------------------------ 
0   -1.000000
1   -1.000000
2   -1.409836
3   -1.609756
4   -1.725845
dtype: float64
 ------------------------ 
0   -1.000000
1   -1.000000
2   -1.409836
3   -1.609756
4   -1.725845
dtype: float64


0          NaN
1          NaN
2          NaN
3    -1.609756
4    -1.826558
5    -1.661247
6    -1.728997
7    -1.444444
8    -1.555556
9    -2.121951
10   -2.775068
11   -3.097561
12   -3.216802
13   -2.834688
14   -2.051491
15   -1.902439
16   -1.783198
17   -1.826558
18   -2.000000
19   -2.000000
20   -2.338753
21   -2.609756
22   -3.165312
23   -3.948509
24   -4.436314
25   -4.487805
26   -4.728997
27   -4.444444
28   -4.555556
29   -4.783198
dtype: float64

In [426]:
# 重难点（Polish my answer from RA）
# ---------------- 1.1 -------------------
# False 的均值为 0， 更加一目了然
print((df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].sum(1)!=df['Total']).mean())  
# ---------------- 1.2.1 -------------------
# keep 的默认值 first， 加上 index[: 3] ，输出更直观
dp_dup = df.drop_duplicates('#', keep='first')
# dp_dup['Type 1'].nunique()
dp_dup['Type 1'].value_counts().index[:3]
# ---------------- 1.2.2 -------------------
attr_dup = dp_dup.drop_duplicates(['Type 1', 'Type 2'])
attr_dup.shape[0]
# ---------------- 1.2.3 -------------------
# 差别在于，我用 in 判断是否出现过，而这里使用set 的 different 函数
# 这里的不足之处有三点：1. 没考虑 Type2 有而 Type1 没有的属性【虽然不存在这样的属性】
#                   2. 如果考虑Type2中加入，那么要考虑 nan 不能 出现在Type1 中
#                   3. 有没有别的单属性出现的可能
# 在这题里面完全正确
L_full = [' '.join([i, j]) if i!=j else i for j in dp_dup['Type 1'].unique() for i in dp_dup['Type 1'].unique()]
L_part = [' '.join([i, j]) if type(j)!=float else i for i, j in zip(attr_dup['Type 1'], attr_dup['Type 2'])]
res = set(L_full).difference(set(L_part))
len(res) 
# ---------------- 1.3.1 -------------------
# 使用了三个 mask 分别来判断， 其中的 & 如果改成 and 就不成功了，当时自己卡在这个地方
df['Attack'].mask(df['Attack']>120, 'high').mask(df['Attack']<50, 'low').mask((50<=df['Attack'])&(df['Attack']<=120), 'mid').head()
# ---------------- 1.3.2 -------------------
# 自己的不足有两点：
#                1. 在字典中全部字符串都遍历，事实上，加上 unique 就可以了，重复的不计算
#                2. 活用 lambda 精简代码
df['Type 1'].replace({i:str.upper(i) for i in df['Type 1'].unique()})
df['Type 1'].apply(lambda x:str.upper(x)).head()
# ---------------- 1.3.3 -------------------
# 活用 lambda 精简
df['Deviation'] = df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].apply(lambda x:np.max((x-x.median()).abs()), 1)
df.sort_values('Deviation', ascending=False).head()
# ---------------- 1.4.1 -------------------
# 对公式迭代的时候，使用 win × x 可以简化计算， 我其实把win算了两边
np.random.seed(0)
s = pd.Series(np.random.randint(-1,2,30).cumsum())
s.ewm(alpha=0.2).mean().head()
def ewm_func(x, alpha=0.2):
    win = (1-alpha)**np.arange(x.shape[0])[::-1]
    res = (win*x).sum()/win.sum()
    return res
s.expanding().apply(ewm_func).head()
# ---------------- 1.4.2 -------------------
# 比较简单
s.rolling(window=4).apply(ewm_func).head() # 无需对原函数改动

0.0


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [409]:
np.random.randint(-1, 2, 30).cumsum()

array([ 0,  1,  0,  0,  1,  0,  1,  0,  0,  1,  2,  2,  1,  1,  1,  0,  1,
        2,  3,  4,  4,  5,  6,  7,  8,  9,  8,  8,  9, 10])