# Pandas - Data Frame



In [1]:
import pandas as pd

In [2]:
import numpy as np

## Create a dataframe from evenly spaced values

In [3]:
# 18 evenly spaced integers: 10, 15, ..., 95 (the stop value 99 is exclusive).
x = np.arange(start=10, stop=99, step=5)

In [4]:
x

array([10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90,
       95])

In [5]:
# Lay the 18 values out as 6 rows x 3 columns (row-major order).
x = x.reshape((6, 3))

In [7]:
# Build the frame: one row per person, one column per month.
df = pd.DataFrame(
    data=x,
    index=["Hung", "Hoang", "Hai", "Tung", "Toan", "Dat"],
    columns=["Jan", "Feb", "Mar"],
)

In [8]:
df

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


## Selecting columns and rows


In [9]:
#Selecting columns

In [10]:
df["Jan"]

Hung     10
Hoang    25
Hai      40
Tung     55
Toan     70
Dat      85
Name: Jan, dtype: int32

In [11]:
type(df["Jan"])

pandas.core.series.Series

In [12]:
type(df)

pandas.core.frame.DataFrame

In [13]:
# Selecting two or more columns requires a list of names, i.e. double square brackets

In [14]:
df[["Jan","Mar"]]

Unnamed: 0,Jan,Mar
Hung,10,20
Hoang,25,35
Hai,40,50
Tung,55,65
Toan,70,80
Dat,85,95


In [15]:
#Select rows

In [27]:
df.loc["Hung"] # .loc selects by label (here: the row's index name)

Jan    10
Feb    15
Mar    20
Name: Hung, dtype: int32

In [17]:
# Select 2 rows

In [18]:
df.loc[["Hung","Hoang"]]

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35


In [19]:
# Select a row by its integer position

In [20]:
df.iloc[3] # .iloc selects by integer position (0-based), so 3 is the fourth row

Jan    55
Feb    60
Mar    65
Name: Tung, dtype: int32

## Select a single value by row and column (lấy giá trị)



In [21]:
df.iloc[3,1]

60

In [22]:
df.loc["Tung","Feb"]

60

## Conditional selection

In [23]:
df

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [24]:
df>10 

Unnamed: 0,Jan,Feb,Mar
Hung,False,True,True
Hoang,True,True,True
Hai,True,True,True
Tung,True,True,True
Toan,True,True,True
Dat,True,True,True


In [25]:
df[df>10]

Unnamed: 0,Jan,Feb,Mar
Hung,,15,20
Hoang,25.0,30,35
Hai,40.0,45,50
Tung,55.0,60,65
Toan,70.0,75,80
Dat,85.0,90,95


In [26]:
df[df["Jan"]>10]

Unnamed: 0,Jan,Feb,Mar
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [28]:
df[df["Mar"]>60]

Unnamed: 0,Jan,Feb,Mar
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [30]:
# Rows where "Jan" > 10, keeping only the "Jan" column.
# A single .loc[row_mask, column] call replaces the chained df[mask]["Jan"],
# which first builds an intermediate frame and then indexes it again.
df.loc[df["Jan"] > 10, "Jan"]

Hoang    25
Hai      40
Tung     55
Toan     70
Dat      85
Name: Jan, dtype: int32

In [31]:
# Rows where "Jan" > 10, keeping only the "Jan" and "Feb" columns.
# A single .loc[row_mask, columns] call replaces the chained df[mask][[...]].
df.loc[df["Jan"] > 10, ["Jan", "Feb"]]

Unnamed: 0,Jan,Feb
Hoang,25,30
Hai,40,45
Tung,55,60
Toan,70,75
Dat,85,90


## Combine multiple conditions (kết hợp nhiều điều kiện với nhau)

In [32]:
df[(df["Jan"]>10) & (df["Feb"]>20)] # & is element-wise "and": keeps rows where both conditions hold; each condition needs its own parentheses

Unnamed: 0,Jan,Feb,Mar
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [33]:
df[(df["Jan"]>10) | (df["Feb"]>20)] # | is element-wise "or": keeps rows where either condition holds

Unnamed: 0,Jan,Feb,Mar
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


## Create and drop columns, rows

In [41]:
#Create new column (tạo cột)

In [36]:
df

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [59]:
# Re-create the frame from x: one row per person, one column per month.
df = pd.DataFrame(
    data=x,
    index=["Hung", "Hoang", "Hai", "Tung", "Toan", "Dat"],
    columns=["Jan", "Feb", "Mar"],
)

In [60]:
df["Apr"]=df["Jan"]*2 # create a new column whose values are double the "Jan" column

In [61]:
#Drop columns (xóa cột)

In [62]:
df.drop("Apr",axis=1,inplace=True) # axis=1 targets columns; inplace=True removes the column from df itself instead of returning a new frame

In [53]:
df

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [46]:
df.drop("Hung") # dropping a row: axis defaults to 0, so it can be omitted; without inplace=True this returns a new frame and df is unchanged

Unnamed: 0,Jan,Feb,Mar
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [47]:
df

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


## Reset, set and name the index

In [63]:
df

Unnamed: 0,Jan,Feb,Mar
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95


In [64]:
df.reset_index()

Unnamed: 0,index,Jan,Feb,Mar
0,Hung,10,15,20
1,Hoang,25,30,35
2,Hai,40,45,50
3,Tung,55,60,65
4,Toan,70,75,80
5,Dat,85,90,95


In [65]:
df.reset_index(inplace=True) # moves the current row labels into a column named "index" and numbers the rows 0..5; modifies df itself

In [66]:
df

Unnamed: 0,index,Jan,Feb,Mar
0,Hung,10,15,20
1,Hoang,25,30,35
2,Hai,40,45,50
3,Tung,55,60,65
4,Toan,70,75,80
5,Dat,85,90,95


In [72]:
df.set_index("index",inplace=True) # turn the "index" column back into the row index; modifies df itself

In [68]:
#Name index

In [74]:
# Label the (single-level) row index so it is displayed as "Name".
df.index.name = "Name"

In [75]:
df

Unnamed: 0_level_0,Jan,Feb,Mar
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hung,10,15,20
Hoang,25,30,35
Hai,40,45,50
Tung,55,60,65
Toan,70,75,80
Dat,85,90,95
