In [7]:
import numpy as np
import pandas as pd

df = pd.DataFrame({'key1':[4,5,3,np.nan,2],
                  'key2':[2,9,np.nan,np.nan,5],
                  'key3':[1,2,3,'a','b']},
                 index=list('abcde'))
print(df)

   key1  key2 key3
a   4.0   2.0    1
b   5.0   9.0    2
c   3.0   NaN    3
d   NaN   NaN    a
e   2.0   5.0    b


In [11]:
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)

float64 float64 object


In [14]:
m1 = df.mean() #对列求平均值
print(m1, type(m1))
print('-------')
print('单独对key2列求平均值:', df['key2'].mean())

key1    3.500000
key2    5.333333
dtype: float64 <class 'pandas.core.series.Series'>
-------
单独对key2列求平均值: 5.333333333333333


In [16]:
m2 = df.mean(axis=1) #axis=1对行求平均值
print(m2, type(m2))

a    3.0
b    7.0
c    3.0
d    NaN
e    3.5
dtype: float64 <class 'pandas.core.series.Series'>


In [19]:
#skipna参数：是否忽略NaN，默认是True，如果是False，有NaN的列统计结果仍为Nan
m3 = df.mean(skipna=False) 
print(m3)
m4 = df.mean(axis=1,skipna=False)
print(m4)

key1   NaN
key2   NaN
dtype: float64
a    3.0
b    7.0
c    NaN
d    NaN
e    3.5
dtype: float64


In [45]:
#主要数学计算方法
df = pd.DataFrame({'key1':np.arange(11),
                  'key2':np.random.rand(11)*10})

print(df)

    key1      key2
0      0  3.645607
1      1  4.034001
2      2  1.504822
3      3  8.348921
4      4  5.002392
5      5  0.991633
6      6  0.910818
7      7  6.583501
8      8  7.259424
9      9  5.288712
10    10  4.736378


In [39]:
print(df.count())

key1    12
key2    12
dtype: int64


In [46]:
df['key2'][2] = np.nan
print(df)

    key1      key2
0      0  3.645607
1      1  4.034001
2      2       NaN
3      3  8.348921
4      4  5.002392
5      5  0.991633
6      6  0.910818
7      7  6.583501
8      8  7.259424
9      9  5.288712
10    10  4.736378


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [47]:
print(df.count()) #统计非Nan值的数量

key1    11
key2    10
dtype: int64


In [48]:
print(df.min()) #最小值
print('-----')
print(df.max()) #最大值

key1    0.000000
key2    0.910818
dtype: float64
-----
key1    10.000000
key2     8.348921
dtype: float64


In [55]:
print(df.quantile(q=0.3))

key1    3.000000
key2    3.917483
Name: 0.3, dtype: float64


In [57]:
#cumsum样本的累计和
df['key1_s'] = df['key1'].cumsum()
df['key2_s'] = df['key2'].cumsum()
print(df)

    key1      key2  key1_s     key2_s
0      0  3.645607       0   3.645607
1      1  4.034001       1   7.679608
2      2       NaN       3        NaN
3      3  8.348921       6  16.028529
4      4  5.002392      10  21.030922
5      5  0.991633      15  22.022555
6      6  0.910818      21  22.933373
7      7  6.583501      28  29.516874
8      8  7.259424      36  36.776299
9      9  5.288712      45  42.065011
10    10  4.736378      55  46.801389


In [59]:
print(df.cummax())

    key1      key2  key1_s     key2_s
0    0.0  3.645607     0.0   3.645607
1    1.0  4.034001     1.0   7.679608
2    2.0       NaN     3.0        NaN
3    3.0  8.348921     6.0  16.028529
4    4.0  8.348921    10.0  21.030922
5    5.0  8.348921    15.0  22.022555
6    6.0  8.348921    21.0  22.933373
7    7.0  8.348921    28.0  29.516874
8    8.0  8.348921    36.0  36.776299
9    9.0  8.348921    45.0  42.065011
10  10.0  8.348921    55.0  46.801389


In [61]:
print(df.cummin())

    key1      key2  key1_s    key2_s
0    0.0  3.645607     0.0  3.645607
1    0.0  3.645607     0.0  3.645607
2    0.0       NaN     0.0       NaN
3    0.0  3.645607     0.0  3.645607
4    0.0  3.645607     0.0  3.645607
5    0.0  0.991633     0.0  3.645607
6    0.0  0.910818     0.0  3.645607
7    0.0  0.910818     0.0  3.645607
8    0.0  0.910818     0.0  3.645607
9    0.0  0.910818     0.0  3.645607
10   0.0  0.910818     0.0  3.645607


In [63]:
s = pd.Series(list('adbddekoidceeg'))
print(s)

0     a
1     d
2     b
3     d
4     d
5     e
6     k
7     o
8     i
9     d
10    c
11    e
12    e
13    g
dtype: object


In [68]:
sq = s.unique()
print(sq, type(sq))
print('------')
print(pd.Series(sq))
print('------')
sq.sort()
print(sq)

['a' 'd' 'b' 'e' 'k' 'o' 'i' 'c' 'g'] <class 'numpy.ndarray'>
------
0    a
1    d
2    b
3    e
4    k
5    o
6    i
7    c
8    g
dtype: object
------
['a' 'b' 'c' 'd' 'e' 'g' 'i' 'k' 'o']


In [71]:
print(s)
print('------')
sc = s.value_counts(sort=False)
print(sc)

0     a
1     d
2     b
3     d
4     d
5     e
6     k
7     o
8     i
9     d
10    c
11    e
12    e
13    g
dtype: object
------
g    1
e    3
b    1
k    1
o    1
d    4
a    1
c    1
i    1
dtype: int64


In [97]:
df = pd.DataFrame([['M','Jack','90-65-92'],
                   ['M','Tom','89-88-90'],
                   ['F','Marry','84-50-92'],
                  ['M','Zack','78-72-79'],
                  ['F','Heheda','61-62-63']], columns=['gender','name','score'])
print(df)

  gender    name     score
0      M    Jack  90-65-92
1      M     Tom  89-88-90
2      F   Marry  84-50-92
3      M    Zack  78-72-79
4      F  Heheda  61-62-63


In [98]:
df['gender'] = df['gender'].str.lower()
print(df)

  gender    name     score
0      m    Jack  90-65-92
1      m     Tom  89-88-90
2      f   Marry  84-50-92
3      m    Zack  78-72-79
4      f  Heheda  61-62-63


In [99]:
d = df['score'].str.split('-',expand=True)
df['math'] = d[0]
df['english'] = d[1]
df['art'] = d[2]
del df['score']
print(df)
# print(df, type(d))

  gender    name math english art
0      m    Jack   90      65  92
1      m     Tom   89      88  90
2      f   Marry   84      50  92
3      m    Zack   78      72  79
4      f  Heheda   61      62  63


In [106]:
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(df1)
print('----')
print(df2)
print('----')
print(pd.merge(df1,df2, on='key'))


    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
----
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3
----
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3


In [107]:
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
print(df3)
print('----')
print(df4)
print('-----')
print(pd.merge(df3,df4, on=['key1','key2']))
#key1=k0 key2=k0的df3有1行，df4有1行
#key1=k1 key2=k0的df3有1行，df4有2行
#所以合并之后结果有3条

    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
----
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
-----
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2


In [111]:
#merge 默认取交集
print(pd.merge(df3,df4,on=['key1','key2'],how='inner'))
print('----')
print(pd.merge(df3, df4, on=['key1','key2'], how='outer')) #并集，缺失的自动补NaN
print('----')
print(pd.merge(df3,df4, on=['key1','key2'], how='left')) #按照df3为参考合并，数据缺失范围NaN
print('----')
print(pd.merge(df3,df4, on=['key1','key2'], how='right'))#按照df4为参考合并，数据缺失范围NaN

    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
----
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
----
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
----
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3
