<a href="https://colab.research.google.com/github/hatttruong/machine-learning-from-scratch/blob/master/pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is part of the GitHub repository https://github.com/hatttruong/machine-learning-from-scratch

In [1]:
import pandas as pd

# Init

In [2]:
# Build a DataFrame from a mapping: each key becomes a column name and each
# value is the list of entries for that column.
df = pd.DataFrame(data=dict(
    sqfeet=[750, 800, 850, 900, 950],
    rent=[1160, 1200, 1280, 1450, 2000],
))
df.head()

Unnamed: 0,sqfeet,rent
0,750,1160
1,800,1200
2,850,1280
3,900,1450
4,950,2000


In [3]:
# Build the same frame row by row: every tuple is one row, and `columns`
# names the tuple positions in order.
df = pd.DataFrame(
    data=[
        (750, 1160),
        (800, 1200),
        (850, 1280),
        (900, 1450),
        (950, 2000),
    ],
    columns=['sqfeet', 'rent'],
)
df.head()

Unnamed: 0,sqfeet,rent
0,750,1160
1,800,1200
2,850,1280
3,900,1450
4,950,2000


# Filter / Selection

In [4]:
# Sample frame for the selection examples; `title` and `addr` contain
# missing values (None) on purpose.
df = pd.DataFrame(data=dict(
    name=['A', 'B', 'C', 'D', 'E', 'F'],
    salary=[100, 77, 80, 95, 30, 45],
    title=['Mr', None, 'Ms', 'Mrs', 'Mr', 'Mr'],
    addr=['123 LTK', '83 PVB', '4 CX', '45/6 LDH', None, None],
))
df.head()

Unnamed: 0,name,salary,title,addr
0,A,100,Mr,123 LTK
1,B,77,,83 PVB
2,C,80,Ms,4 CX
3,D,95,Mrs,45/6 LDH
4,E,30,Mr,


## Using `iloc`



In [5]:
# Select a single row by integer position. The form of the indexer decides
# the return type. Only the last expression of a cell is rendered, so the
# Series produced by the first line is not shown in the output.
df.iloc[1] # scalar position -> Series
df.iloc[[1]] # list of positions -> one-row DataFrame

Unnamed: 0,name,salary,title,addr
1,B,77,,83 PVB


In [6]:
# Select a contiguous slice of rows (positions 1 and 2; the stop position 3
# is excluded) and keep every column.
df.iloc[1:3, :]

Unnamed: 0,name,salary,title,addr
1,B,77,,83 PVB
2,C,80,Ms,4 CX


In [7]:
# Select a contiguous slice of rows (positions 1 and 2) restricted to a
# contiguous slice of columns (position 2 through the last column).
df.iloc[1:3, 2:]

Unnamed: 0,title,addr
1,,83 PVB
2,Ms,4 CX


In [8]:
# Select a contiguous slice of rows from a single column. The form of the
# column indexer decides the return type. Only the last expression of a cell
# is rendered, so the DataFrame from the first line is not shown.
df.iloc[1:3, [2]]   # list of positions -> DataFrame
df.iloc[1:3, 2]     # scalar position -> Series

1    None
2      Ms
Name: title, dtype: object

In [9]:
# Select non-contiguous rows by passing a list of integer positions.
df.iloc[[0, 2, 3]]

Unnamed: 0,name,salary,title,addr
0,A,100,Mr,123 LTK
2,C,80,Ms,4 CX
3,D,95,Mrs,45/6 LDH


In [10]:
# Select non-contiguous rows and non-contiguous columns: one list of integer
# positions per axis (rows first, then columns).
df.iloc[[0, 2, 3], [0, 2]]

Unnamed: 0,name,title
0,A,Mr
2,C,Ms
3,D,Mrs


## Using `loc`

In [11]:
# Sample frame for the `loc` examples. Unlike the earlier one, `salary` also
# has a missing entry, which makes the column floating point.
df = pd.DataFrame(data=dict(
    name=['A', 'B', 'C', 'D', 'E', 'F'],
    salary=[100, 77, None, 95, 30, 45],
    title=['Mr', None, 'Ms', 'Mrs', 'Mr', 'Mr'],
    addr=['123 LTK', '83 PVB', '4 CX', '45/6 LDH', None, None],
))
df.head()

Unnamed: 0,name,salary,title,addr
0,A,100.0,Mr,123 LTK
1,B,77.0,,83 PVB
2,C,,Ms,4 CX
3,D,95.0,Mrs,45/6 LDH
4,E,30.0,Mr,


In [12]:
# Count the missing values in each column.
df.isna().sum()

name      0
salary    1
title     1
addr      2
dtype: int64

In [15]:
# `loc` selects by index label, not by position. The frame still has its
# default integer index here, so the label 1 picks the second row; passing a
# list of labels returns a DataFrame.
df.loc[[1]]

Unnamed: 0,name,salary,title,addr
1,B,77.0,,83 PVB


In [16]:
# Index the frame by `name` so that `.loc` can select rows by that label.
# The guard keeps the cell idempotent: once `name` has become the index it is
# no longer a column, so running `set_index('name')` a second time would
# raise a KeyError. Rebinding `df` (instead of `inplace=True`) gives the same
# resulting frame for the cells that follow.
if 'name' in df.columns:
    df = df.set_index('name')

# A list of labels returns a DataFrame, here the single row labelled 'A'.
df.loc[['A']]

Unnamed: 0_level_0,salary,title,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,100.0,Mr,123 LTK


In [17]:
# Select several rows by index label, keeping all columns.
df.loc[['A', 'C']]

Unnamed: 0_level_0,salary,title,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,100.0,Mr,123 LTK
C,,Ms,4 CX


In [18]:
# Select rows by index label and columns by name: one list of labels per
# axis (rows first, then columns).
df.loc[['A', 'C'], ['salary', 'addr']]

Unnamed: 0_level_0,salary,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1
A,100.0,123 LTK
C,,4 CX


In [19]:
# Boolean-mask selection: keep every row whose title equals 'Mr'.
# Rows with a missing title compare as False and are therefore left out.
df.loc[df['title'] == 'Mr']

Unnamed: 0_level_0,salary,title,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,100.0,Mr,123 LTK
E,30.0,Mr,
F,45.0,Mr,


In [20]:
# Combine two conditions with `&`. Each comparison needs its own parentheses
# because `&` binds more tightly than `>=` and `==`.
df.loc[(df['salary'] >= 40) & (df['title'] == 'Mr')]

Unnamed: 0_level_0,salary,title,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,100.0,Mr,123 LTK
F,45.0,Mr,


In [21]:
# Keep only the rows whose salary is missing.
df.loc[df['salary'].isna()]

Unnamed: 0_level_0,salary,title,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,,Ms,4 CX


In [22]:
# Keep only the rows where both salary and addr are present.
df.loc[df['salary'].notna() & df['addr'].notna()]

Unnamed: 0_level_0,salary,title,addr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,100.0,Mr,123 LTK
B,77.0,,83 PVB
D,95.0,Mrs,45/6 LDH
