# Pandas

### Pandas 是 Python 的資料分析套件，功能類似 Excel，但使用上比 Excel 更容易，也更適合處理大量資料

1. 安裝Pandas
2. 讀取檔案
3. 觀察資料
4. 資料描述
5. 查詢欄、列名稱
6. 取出subset
7. 遺失值處理
8. 重複值處理
9. 索引、選取資料
10. 資料排序
11. groupby
12. 刪除欄位
13. 資料合併
14. 樞紐分析
15. 匯出資料

## 安裝Pandas

In [1]:
!pip install pandas



In [2]:
import pandas as pd #載入套件

## 讀取檔案

In [3]:
df = pd.read_csv("customer.csv")  # load the customer table from a CSV file in the working directory

In [4]:
df1 = pd.read_excel("amountdata.xlsx")  # load the amount table from an Excel workbook in the working directory

## 觀察資料

In [5]:
df.head()  # preview the first rows of the customer table (five are shown)

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
0,2019/1/3,F,90.0,FEPNDpfzky33,2,JJC
1,2019/1/3,F,40.0,OBMUHsapme35,2,CLP
2,2019/1/6,F,49.0,IPLXWuxetd57,1,XOI
3,2019/1/9,F,48.0,ILJAEfznol59,1,AKG
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK


In [6]:
df1.head()  # preview the first rows of the amount table

Unnamed: 0,UniqueKey,Amount
0,HJJRZdyfld79,759888
1,ETVOBckqmb41,831898
2,FEPNDpfzky33,957556
3,DZZJVvcgol66,647964
4,TYSDKzjdom34,684438


In [7]:
df.tail()  # preview the last rows of the customer table

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
295,2021/11/14,F,17.0,EBINKcuiyn43,5,TNC
296,2021/11/28,M,47.0,XMNWClfudo21,1,NTS
297,2021/12/17,F,2.0,BCSNLtrrjb12,4,EXY
298,2021/12/18,F,26.0,SPBTSpoepd15,5,QWC
299,2021/12/19,F,49.0,RDPWJcijyy31,1,SAM


In [8]:
df1.tail()  # preview the last rows of the amount table

Unnamed: 0,UniqueKey,Amount
295,PPJUOmldeu21,25550
296,SZFMTkplxn55,764807
297,LFZIVcbfnt45,325621
298,KSKELiwvae81,423369
299,JAIULbwpht67,440777


## 資料描述

In [9]:
df.shape  # (number of rows, number of columns)

(300, 6)

In [10]:
df.describe()  # summary statistics; only the numeric columns (Age, Rank) appear in the result

Unnamed: 0,Age,Rank
count,296.0,300.0
mean,49.530405,3.05
std,28.684829,1.412143
min,1.0,1.0
25%,25.0,2.0
50%,49.0,3.0
75%,73.0,4.0
max,99.0,5.0


In [11]:
df.dtypes  # data type of each column

Date          object
Binary        object
Age          float64
UniqueKey     object
Rank           int64
Tag           object
dtype: object

## 查詢欄、列名稱

In [12]:
df.index  # row labels (here the default 0..299 RangeIndex)

RangeIndex(start=0, stop=300, step=1)

In [13]:
df.columns  # column names

Index(['Date', 'Binary', 'Age', 'UniqueKey', 'Rank', 'Tag'], dtype='object')

## 取出subset

In [14]:
df['Date']  # selecting a single column of a DataFrame returns a Series

0        2019/1/3
1        2019/1/3
2        2019/1/6
3        2019/1/9
4       2019/1/13
          ...    
295    2021/11/14
296    2021/11/28
297    2021/12/17
298    2021/12/18
299    2021/12/19
Name: Date, Length: 300, dtype: object

In [15]:
df.loc[0, 'Date']  # value of the Date column in the row labelled 0 (the first row)

'2019/1/3'

In [16]:
df[['UniqueKey', 'Rank']]  # pass a list of names to select several columns at once

Unnamed: 0,UniqueKey,Rank
0,FEPNDpfzky33,2
1,OBMUHsapme35,2
2,IPLXWuxetd57,1
3,ILJAEfznol59,1
4,AJRDLgpbta11,5
...,...,...
295,EBINKcuiyn43,5
296,XMNWClfudo21,1
297,BCSNLtrrjb12,4
298,SPBTSpoepd15,5


In [17]:
df[df['Rank']>3]  # boolean mask: keep only the rows whose Rank is greater than 3

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK
6,2019/1/30,F,3.0,RMKVPosvij11,4,IQC
11,2019/2/9,F,22.0,NJTXGkjlnw85,4,JTH
12,2019/2/9,F,31.0,EKOGXshjaf38,5,PPZ
13,2019/2/14,M,11.0,VDOXGpbfgd40,4,HNS
...,...,...,...,...,...,...
291,2021/11/7,F,1.0,JMPWCeezvi24,4,LWK
292,2021/11/7,M,57.0,YJTFQgqbxa76,5,USC
295,2021/11/14,F,17.0,EBINKcuiyn43,5,TNC
297,2021/12/17,F,2.0,BCSNLtrrjb12,4,EXY


In [18]:
df[(df['Rank']>3) & (df['Age']>90)]  # combine conditions with &; each condition needs its own parentheses

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
31,2019/4/17,M,99.0,UXAMDucdha58,5,ILX
43,2019/5/16,M,98.0,GFRVLobraf35,4,YEC
53,2019/6/10,F,93.0,KTINEwjkhn50,5,VFI
57,2019/6/26,M,95.0,WNJAUyqvxi20,4,MPA
62,2019/7/3,M,98.0,CGSFHydifc22,5,KHM
97,2019/10/21,M,97.0,WSWTDluzif22,4,XXO
99,2019/11/2,F,93.0,UPGPIjddeu27,4,YGI
113,2019/12/28,F,94.0,JHEYZqbrii75,5,KJQ
124,2020/2/7,F,99.0,ATRXKfwxrl99,4,DFN
159,2020/6/26,F,91.0,LWIINoneyq50,4,SZI


## 遺失值處理

In [19]:
df.isnull().sum()  # number of missing values in each column

Date         0
Binary       0
Age          4
UniqueKey    0
Rank         0
Tag          0
dtype: int64

In [20]:
df['Age'].mean()  # mean of Age; the result matches describe(), which counts only the 296 non-missing values

49.5304054054054

In [21]:
# Fill the missing Age values with the column mean and keep the result in df2
# (df itself is left unchanged). Passing a {column: value} mapping restricts
# the fill to Age, so a gap in any other column is never overwritten with an
# age figure.
df2 = df.fillna({'Age': df['Age'].mean()})

In [22]:
df2.isnull().sum()  # confirm that no missing values remain after the fill

Date         0
Binary       0
Age          0
UniqueKey    0
Rank         0
Tag          0
dtype: int64

## 重複值處理

In [23]:
df.shape  # size before removing duplicates

(300, 6)

In [24]:
df.duplicated().sum()  # number of rows flagged as duplicates of another row

3

In [25]:
df3 = df.drop_duplicates()  # remove the duplicated rows; the result is stored in df3

In [26]:
df3.shape  # size after removing duplicates (3 rows fewer than df)

(297, 6)

## 索引、選取資料

### 1. loc

In [27]:
df.loc[2:10]  # label-based slice: rows labelled 2 through 10, both ends included

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
2,2019/1/6,F,49.0,IPLXWuxetd57,1,XOI
3,2019/1/9,F,48.0,ILJAEfznol59,1,AKG
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK
5,2019/1/15,M,75.0,VTSNEcjenw28,3,PVQ
6,2019/1/30,F,3.0,RMKVPosvij11,4,IQC
7,2019/1/31,M,26.0,RIBABpejsc95,3,ZKR
8,2019/2/4,M,65.0,JOIWTbynha94,3,TCL
9,2019/2/5,M,5.0,QDPLXqijdq38,2,LYW
10,2019/2/7,M,59.0,SWTUZnftai45,3,DBC


In [28]:
df.loc[2:10, 'Age']  # rows labelled 2 through 10 of the Age column (returns a Series)

2     49.0
3     48.0
4     52.0
5     75.0
6      3.0
7     26.0
8     65.0
9      5.0
10    59.0
Name: Age, dtype: float64

In [29]:
df.loc[2:10, ['Age','Rank']]  # rows labelled 2 through 10 of the Age and Rank columns

Unnamed: 0,Age,Rank
2,49.0,1
3,48.0,1
4,52.0,5
5,75.0,3
6,3.0,4
7,26.0,3
8,65.0,3
9,5.0,2
10,59.0,3


### 2. iloc

In [30]:
df.head()  # show the first rows again as a reference for the iloc examples

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
0,2019/1/3,F,90.0,FEPNDpfzky33,2,JJC
1,2019/1/3,F,40.0,OBMUHsapme35,2,CLP
2,2019/1/6,F,49.0,IPLXWuxetd57,1,XOI
3,2019/1/9,F,48.0,ILJAEfznol59,1,AKG
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK


In [31]:
df.iloc[0]  # position-based: the first row, returned as a Series

Date             2019/1/3
Binary                  F
Age                  90.0
UniqueKey    FEPNDpfzky33
Rank                    2
Tag                   JJC
Name: 0, dtype: object

In [32]:
df.iloc[0,0]  # value at the first row and first column

'2019/1/3'

In [33]:
df.iloc[0:3, 1:4]  # positional slice with the end excluded: rows 0-2 and columns 1-3

Unnamed: 0,Binary,Age,UniqueKey
0,F,90.0,FEPNDpfzky33
1,F,40.0,OBMUHsapme35
2,F,49.0,IPLXWuxetd57


## 資料排序

In [34]:
df.sort_values('Age')  # order rows by Age, ascending by default; rows with a missing Age come last

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
291,2021/11/7,F,1.0,JMPWCeezvi24,4,LWK
144,2020/5/11,F,1.0,FBWCTeikqy95,5,EWT
268,2021/8/13,F,1.0,KHNMTjdwbc33,5,TKC
108,2019/12/10,F,1.0,PYTZGnedko80,2,HXX
263,2021/7/21,M,1.0,YCEZJcyyav82,5,LKA
...,...,...,...,...,...,...
169,2020/8/2,F,99.0,LSNLOnsopw29,5,BRH
34,2019/4/24,F,,CEXYUbxkqd36,2,XJF
35,2019/4/26,F,,JRZIZrnoxv77,2,YZQ
36,2019/4/28,F,,AHXJDysvll32,2,PXI


In [35]:
df.sort_values('Age', ascending=False)  # ascending=False gives descending order; missing Age still comes last

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
169,2020/8/2,F,99.0,LSNLOnsopw29,5,BRH
124,2020/2/7,F,99.0,ATRXKfwxrl99,4,DFN
31,2019/4/17,M,99.0,UXAMDucdha58,5,ILX
43,2019/5/16,M,98.0,GFRVLobraf35,4,YEC
62,2019/7/3,M,98.0,CGSFHydifc22,5,KHM
...,...,...,...,...,...,...
268,2021/8/13,F,1.0,KHNMTjdwbc33,5,TKC
34,2019/4/24,F,,CEXYUbxkqd36,2,XJF
35,2019/4/26,F,,JRZIZrnoxv77,2,YZQ
36,2019/4/28,F,,AHXJDysvll32,2,PXI


In [36]:
df.sort_values(by=['Age', 'Rank'] )  # sort by Age first, then by Rank among rows with the same Age

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
108,2019/12/10,F,1.0,PYTZGnedko80,2,HXX
291,2021/11/7,F,1.0,JMPWCeezvi24,4,LWK
144,2020/5/11,F,1.0,FBWCTeikqy95,5,EWT
263,2021/7/21,M,1.0,YCEZJcyyav82,5,LKA
268,2021/8/13,F,1.0,KHNMTjdwbc33,5,TKC
...,...,...,...,...,...,...
169,2020/8/2,F,99.0,LSNLOnsopw29,5,BRH
34,2019/4/24,F,,CEXYUbxkqd36,2,XJF
35,2019/4/26,F,,JRZIZrnoxv77,2,YZQ
36,2019/4/28,F,,AHXJDysvll32,2,PXI


## groupby

In [37]:
# Sum the numeric columns (Age, Rank) within each Binary group.
# numeric_only=True keeps the text columns (Date, UniqueKey, Tag) out of the
# aggregation; recent pandas releases no longer drop them automatically.
df.groupby('Binary').sum(numeric_only=True)

Unnamed: 0_level_0,Age,Rank
Binary,Unnamed: 1_level_1,Unnamed: 2_level_1
F,7445.0,453
M,7216.0,462


In [38]:
# Same per-group sum, but as_index=False keeps Binary as an ordinary column
# instead of using it as the row index.
# numeric_only=True keeps the text columns (Date, UniqueKey, Tag) out of the
# aggregation; recent pandas releases no longer drop them automatically.
df.groupby('Binary', as_index=False).sum(numeric_only=True)

Unnamed: 0,Binary,Age,Rank
0,F,7445.0,453
1,M,7216.0,462


In [39]:
df.groupby('Binary')['Age'].sum()  # total Age per Binary group, returned as a Series

Binary
F    7445.0
M    7216.0
Name: Age, dtype: float64

## 刪除欄位

### 1. 刪除欄

In [40]:
df.head()  # show the first rows again as a reference for the drop examples

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
0,2019/1/3,F,90.0,FEPNDpfzky33,2,JJC
1,2019/1/3,F,40.0,OBMUHsapme35,2,CLP
2,2019/1/6,F,49.0,IPLXWuxetd57,1,XOI
3,2019/1/9,F,48.0,ILJAEfznol59,1,AKG
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK


In [41]:
df.drop(columns='Age')  # returns a copy without the Age column; df itself is not modified

Unnamed: 0,Date,Binary,UniqueKey,Rank,Tag
0,2019/1/3,F,FEPNDpfzky33,2,JJC
1,2019/1/3,F,OBMUHsapme35,2,CLP
2,2019/1/6,F,IPLXWuxetd57,1,XOI
3,2019/1/9,F,ILJAEfznol59,1,AKG
4,2019/1/13,M,AJRDLgpbta11,5,WHK
...,...,...,...,...,...
295,2021/11/14,F,EBINKcuiyn43,5,TNC
296,2021/11/28,M,XMNWClfudo21,1,NTS
297,2021/12/17,F,BCSNLtrrjb12,4,EXY
298,2021/12/18,F,SPBTSpoepd15,5,QWC


In [42]:
df.drop(columns=['Age', 'UniqueKey', 'Rank'])  # pass a list of names to remove several columns at once

Unnamed: 0,Date,Binary,Tag
0,2019/1/3,F,JJC
1,2019/1/3,F,CLP
2,2019/1/6,F,XOI
3,2019/1/9,F,AKG
4,2019/1/13,M,WHK
...,...,...,...
295,2021/11/14,F,TNC
296,2021/11/28,M,NTS
297,2021/12/17,F,EXY
298,2021/12/18,F,QWC


### 2. 刪除列

In [43]:
df.drop(index=[3])  # remove the row whose index label is 3 (returns a copy; df is not modified)

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
0,2019/1/3,F,90.0,FEPNDpfzky33,2,JJC
1,2019/1/3,F,40.0,OBMUHsapme35,2,CLP
2,2019/1/6,F,49.0,IPLXWuxetd57,1,XOI
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK
5,2019/1/15,M,75.0,VTSNEcjenw28,3,PVQ
...,...,...,...,...,...,...
295,2021/11/14,F,17.0,EBINKcuiyn43,5,TNC
296,2021/11/28,M,47.0,XMNWClfudo21,1,NTS
297,2021/12/17,F,2.0,BCSNLtrrjb12,4,EXY
298,2021/12/18,F,26.0,SPBTSpoepd15,5,QWC


In [44]:
df.drop(index=[1, 2, 3])  # remove the rows whose index labels are 1, 2 and 3

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag
0,2019/1/3,F,90.0,FEPNDpfzky33,2,JJC
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK
5,2019/1/15,M,75.0,VTSNEcjenw28,3,PVQ
6,2019/1/30,F,3.0,RMKVPosvij11,4,IQC
7,2019/1/31,M,26.0,RIBABpejsc95,3,ZKR
...,...,...,...,...,...,...
295,2021/11/14,F,17.0,EBINKcuiyn43,5,TNC
296,2021/11/28,M,47.0,XMNWClfudo21,1,NTS
297,2021/12/17,F,2.0,BCSNLtrrjb12,4,EXY
298,2021/12/18,F,26.0,SPBTSpoepd15,5,QWC


## 資料合併

In [45]:
# Join the customer table with the amount table on their shared UniqueKey
# column; the merged result carries the Amount column next to the customer data.
df4=df.merge(df1, on='UniqueKey')
df4

Unnamed: 0,Date,Binary,Age,UniqueKey,Rank,Tag,Amount
0,2019/1/3,F,90.0,FEPNDpfzky33,2,JJC,957556
1,2019/1/3,F,40.0,OBMUHsapme35,2,CLP,624250
2,2019/1/6,F,49.0,IPLXWuxetd57,1,XOI,567672
3,2019/1/9,F,48.0,ILJAEfznol59,1,AKG,817285
4,2019/1/13,M,52.0,AJRDLgpbta11,5,WHK,798313
...,...,...,...,...,...,...,...
295,2021/11/14,F,17.0,EBINKcuiyn43,5,TNC,772347
296,2021/11/28,M,47.0,XMNWClfudo21,1,NTS,863901
297,2021/12/17,F,2.0,BCSNLtrrjb12,4,EXY,886657
298,2021/12/18,F,26.0,SPBTSpoepd15,5,QWC,88644


## 樞紐分析

In [46]:
# Pivot table with one row per (Binary, Rank) pair, aggregating the Age column.
# No aggfunc is given, so the default aggregation is used
# (presumably the mean - confirm against the pivot_table documentation).
df5 = df.pivot_table(index=['Binary', 'Rank'], values='Age')
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Binary,Rank,Unnamed: 2_level_1
F,1,54.962963
F,2,47.870968
F,3,56.305556
F,4,44.72
F,5,44.4
M,1,49.642857
M,2,38.96
M,3,51.233333
M,4,55.0
M,5,49.305556


In [47]:
# Same aggregation of Age, but Rank is spread across the columns:
# one row per Binary value, one column per Rank value.
df6 = df.pivot_table(index=['Binary'], values='Age', columns='Rank')
df6

Rank,1,2,3,4,5
Binary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,54.962963,47.870968,56.305556,44.72,44.4
M,49.642857,38.96,51.233333,55.0,49.305556


In [48]:
# aggfunc accepts a list, giving one result column per statistic of Age for each Binary group.
# margins=True (not used here) is the pivot_table option that controls whether totals are shown.
df7 = df.pivot_table(index=['Binary'], values='Age', aggfunc=['mean', 'count','max','min'])
df7

Unnamed: 0_level_0,mean,count,max,min
Unnamed: 0_level_1,Age,Age,Age,Age
Binary,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,49.966443,149,99.0,1.0
M,49.088435,147,99.0,1.0


## 匯出資料

In [49]:
df7.to_excel('output.xlsx')  # write the df7 pivot table to an Excel file in the working directory