# <center>pandas的21个基本操作</center>

In [1]:
import os
import pandas as pd
import numpy as np

## 1. 怎样读取csv文件或者文本文件？

In [2]:
# Expected contents of ../data/data.csv (three data rows, matching the
# frame printed below):
#   A,B,C,D
#   1,2,3,4
#   5,6,7,8
#   9,10,11,12
csv_path = '../data/data.csv'
# header=0: the first line holds the column names.
# index_col=False: do not use the first column as the index.
# names=None: keep the column names read from the header line.
df = pd.read_csv(csv_path, sep=',', header=0, index_col=False, names=None)
print(df)

   A   B   C   D
0  1   2   3   4
1  5   6   7   8
2  9  10  11  12


In [3]:
# Expected contents of ../data/data.txt (space separated, three data rows,
# matching the frame printed below):
#   A B C D
#   1 2 3 4
#   5 6 7 8
#   9 10 11 12
txt_path = '../data/data.txt'
# read_csv also handles plain text files; only the separator changes.
df = pd.read_csv(txt_path, sep=' ', header=0, index_col=False, names=None)
print(df)

   A   B   C   D
0  1   2   3   4
1  5   6   7   8
2  9  10  11  12


## 2. 怎样使用已有数据创建data frame？

In [4]:
# Build a data frame from a dict: each key becomes a column name and each
# list supplies that column's values.
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)
print(df)

   A   B   C   D
0  1   2   3   4
1  5   6   7   8
2  9  10  11  12


In [5]:
# Build a data frame from a 2-D numpy array plus an explicit list of
# column names.
col_names = list('ABCD')
np_data = np.arange(1, 13).reshape(3, 4)
df = pd.DataFrame(np_data, columns=col_names)
print(df)

   A   B   C   D
0  1   2   3   4
1  5   6   7   8
2  9  10  11  12


## 3. 怎样查看data frame的顶部和底部数据？

In [9]:
# One column holding 1..12, long enough that head/tail return a subset.
d_dict = {'A': list(range(1, 13))}
df = pd.DataFrame(d_dict)

# First five rows; as the last expression of a notebook cell it is displayed.
df.head(n=5)

   A
0  1
1  2
2  3
3  4
4  5


In [11]:
# Last three rows of the frame built in the previous cell.
df.tail(n=3)

     A
9   10
10  11
11  12


## 4. 怎样重命名若干列？

In [12]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# Map old column names to new ones; 'D' is not in the mapping and keeps
# its name. The result is bound to new_df, df itself is not reassigned.
renames = {name: 'new_' + name for name in ('A', 'B', 'C')}
new_df = df.rename(columns=renames)
print(new_df)

   new_A  new_B  new_C   D
0      1      2      3   4
1      5      6      7   8
2      9     10     11  12


## 5. 怎样获取列的名字形成一个列表？

In [13]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# df.columns is an Index object; turn it into a plain Python list.
name_list = list(df.columns)
print(name_list)

['A', 'B', 'C', 'D']


## 6. 怎样获取一列数据的值出现频率？

In [15]:
d_dict = {
    'A': [1, 5, 1],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# Count how often each distinct value occurs in column 'A'.
column_a = df['A']
A_cnt = column_a.value_counts()
print(A_cnt)
print(type(A_cnt))

1    2
5    1
Name: A, dtype: int64
<class 'pandas.core.series.Series'>


## 7. 怎样删除行以及重置索引？

In [17]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)
df = df.drop(index=[1])  # remove the row whose index label is 1
print(df)

# Renumber the rows from 0; drop=True discards the old labels instead of
# keeping them as a new column.
new_df = df.reset_index(drop=True)
print(new_df)

   A   B   C   D
0  1   2   3   4
2  9  10  11  12
   A   B   C   D
0  1   2   3   4
1  9  10  11  12


## 8. 怎样移除一列？

In [18]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)
# Remove column 'B'; inplace=True modifies df itself.
df.drop('B', axis=1, inplace=True)
print(df)

   A   C   D
0  1   3   4
1  5   7   8
2  9  11  12


## 9. 怎样在data frame中改变索引？

In [19]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)
print('before:\r\n', df)
# Use column 'B' as the row index; it is removed from the regular columns.
df.set_index('B', inplace=True)
print('after:\r\n', df)

before:
    A   B   C   D
0  1   2   3   4
1  5   6   7   8
2  9  10  11  12
after:
     A   C   D
B            
2   1   3   4
6   5   7   8
10  9  11  12


## 10. 怎样移除包含nan值的行或者列？

In [21]:
d_dict = {
    'A': [1, np.nan, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)
print('before:\r\n', df)
# axis='columns' (same as axis=1) removes every column containing a NaN.
df.dropna(axis='columns', inplace=True)
print('after:\r\n', df)

before:
      A   B   C   D
0  1.0   2   3   4
1  NaN   6   7   8
2  9.0  10  11  12
after:
     B   C   D
0   2   3   4
1   6   7   8
2  10  11  12


## 11. 怎样根据指定条件对data frame进行切片？

In [23]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# Single condition: keep the rows where column 'A' is greater than 4.
mask = df['A'].gt(4)
slide_df = df.loc[mask]
print('single condition:\r\n', slide_df)

# Multiple conditions: combine the boolean Series element-wise with &.
mask = df['A'].gt(4) & df['B'].gt(7)
print('multiple condition:\r\n', df.loc[mask])

single condition:
    A   B   C   D
1  5   6   7   8
2  9  10  11  12
multiple condition:
    A   B   C   D
2  9  10  11  12


## 12. 怎样根据给定的索引或者列名进行切片？

In [27]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# Single value by integer position (row 1, column 2).
print(df.iat[1, 2])
# Single value by row label and column name.
print(df.at[1, 'C'])
# Sub-frame by row labels and column names.
print(df.loc[[0, 2], ['A', 'D']])
# Sub-frame by integer row and column positions.
print(df.iloc[[0, 2], [0, 3]])

7
7
   A   D
0  1   4
2  9  12
   A   D
0  1   4
2  9  12


## 13. 怎样逐行迭代？

In [29]:
d_dict = {'A':[1,5,9], 'B':[2,6,10], 'C':[3,7,11], 'D':[4,8,12]}
df = pd.DataFrame(data=d_dict)

# Row-by-row iteration: iterrows yields (index label, row) pairs and the
# row is indexed by column name.
# The accumulator is called `total` so the builtin `sum` is not shadowed
# for the rest of the session.
total = 0
for _, row in df.iterrows():
    total += row['A']
print(total)

# Tuple iterator: itertuples yields one record per row whose columns are
# read as attributes.
total = 0
for row in df.itertuples():
    total += row.A
print(total)

15
15


## 14. 怎样根据列进行排序？

In [31]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# Sort the rows by column 'A', largest first, modifying df itself.
df.sort_values(by='A', ascending=False, inplace=True)
print(df)

   A   B   C   D
2  9  10  11  12
1  5   6   7   8
0  1   2   3   4


## 15. 怎样将一个函数应用到一系列元素？

In [33]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)


def func_square(x):
    """Return x multiplied by itself."""
    return x * x


def func_mul(x, **kwargs):
    """Return x multiplied by the keyword argument ``a``."""
    factor = kwargs['a']
    return x * factor


# apply calls the function once per element of column 'A'.
new = df['A'].apply(func_square)
print(new)

# Extra keyword arguments given to apply are forwarded to the function.
new = df['A'].apply(func_mul, a=3)
print(new)

0     1
1    25
2    81
Name: A, dtype: int64
0     3
1    15
2    27
Name: A, dtype: int64


## 16. 怎样对所有元素应用函数？

In [34]:
d_dict = {'A':[1,5,9], 'B':[2,6,10], 'C':[3,7,11], 'D':[4,8,12]}
df = pd.DataFrame(data=d_dict)

def func(x):
    """Return the square of x."""
    return x ** 2

# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map. Use the new name when this pandas version has it and
# fall back to applymap on older versions.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
new = elementwise(func)
print(new)

    A    B    C    D
0   1    4    9   16
1  25   36   49   64
2  81  100  121  144


## 17. 怎样根据列元素是否在指定list中进行切片？

In [36]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

value_list = list(range(9))
# Boolean Series: True where the value in column 'D' is in value_list.
in_list = df['D'].isin(value_list)
print(df[in_list])
# ~ inverts the mask, selecting the rows whose 'D' value is not in the list.
print(df[~in_list])

   A  B  C  D
0  1  2  3  4
1  5  6  7  8
   A   B   C   D
2  9  10  11  12


## 18. 怎样按列分组并应用函数？

In [39]:
d_dict = {
    'A': [1, 5, 1],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)

# Group the rows that share the same value in column 'A'.
grouped = df.groupby('A')
# Mean of column 'B' within each group.
new = grouped.agg({'B': 'mean'})
print(new)
# Collect every remaining column's values of each group into a list.
print(grouped.agg(list))

   B
A   
1  6
5  6
         B        C        D
A                           
1  [2, 10]  [3, 11]  [4, 12]
5      [6]      [7]      [8]


## 19. 怎样将一列元素为list的列拆为多行？

In [41]:
d_dict = {
    'A': [1, 5, 1],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df = pd.DataFrame(d_dict)
# Each cell of the grouped frame holds the list of values of its group.
new = df.groupby('A').agg(list)
print(new)
# explode turns each list element into its own row, repeating the index label.
exploded_b = new['B'].explode()
print(exploded_b)

         B        C        D
A                           
1  [2, 10]  [3, 11]  [4, 12]
5      [6]      [7]      [8]
A
1     2
1    10
5     6
Name: B, dtype: object


## 20. 怎样拼接两个data frame？

In [43]:
# Two frames with identical contents, both built from the same dict.
d_dict = {
    'A': [1, 5, 1],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df1 = pd.DataFrame(d_dict)
df2 = pd.DataFrame(d_dict)

# axis='index' (same as axis=0): stack the rows; original labels are kept.
print(pd.concat([df1, df2], axis='index'))
# axis='columns' (same as axis=1): place the frames side by side.
print(pd.concat([df1, df2], axis='columns'))

   A   B   C   D
0  1   2   3   4
1  5   6   7   8
2  1  10  11  12
0  1   2   3   4
1  5   6   7   8
2  1  10  11  12
   A   B   C   D  A   B   C   D
0  1   2   3   4  1   2   3   4
1  5   6   7   8  5   6   7   8
2  1  10  11  12  1  10  11  12


## 21. 怎样合并两个data frame？

In [44]:
d_dict = {
    'A': [1, 5, 9],
    'B': [2, 6, 10],
    'C': [3, 7, 11],
    'D': [4, 8, 12],
}
df1 = pd.DataFrame(d_dict)
d_dict = {
    'A': [1, 5, 9],
    'e': [2, 6, 10],
    'f': [3, 7, 11],
    'g': [4, 8, 12],
}
df2 = pd.DataFrame(d_dict)

# Inner join on the shared key column 'A': only keys present in both
# frames appear in the result.
new = pd.merge(df1, df2, on='A', how='inner')
print(new)

   A   B   C   D   e   f   g
0  1   2   3   4   2   3   4
1  5   6   7   8   6   7   8
2  9  10  11  12  10  11  12
