# Getting to Know Pandas
- Ref: *Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython*, 2nd edition, by Wes McKinney

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## Series

In [3]:
# Build a Series from a plain list; with no index given, the values are
# labelled 0..n-1.
s = pd.Series(data=[4, 7, -5, 3])
s

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# The underlying data, returned as a NumPy array.
s.values

array([ 4,  7, -5,  3])

In [5]:
s.index # equivalent to range(4)

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Same values as before, but labelled with the letters 'a'..'d' instead of
# the default integer positions.
s2 = pd.Series(data=[4, 7, -5, 3], index=list('abcd'))
s2

a    4
b    7
c   -5
d    3
dtype: int64

In [7]:
# String labels are stored in an Index of dtype object.
s2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
# Select a single value by its index label.
s2['a']

4

In [9]:
# Label lookup again, this time for the negative entry.
s2['c']

-5

In [10]:
# A list of labels returns a Series, in the order the labels were requested.
s2[['d', 'c', 'a']]

d    3
c   -5
a    4
dtype: int64

In [11]:
# Boolean mask: keep only the entries whose value is positive.
s2[s2 > 0]

a    4
b    7
d    3
dtype: int64

In [12]:
# Element-wise division; the labels are kept and the result is float64.
s2 / 2

a    2.0
b    3.5
c   -2.5
d    1.5
dtype: float64

In [13]:
# NumPy ufuncs apply element-wise and return a Series with the same index.
np.sin(s2)

a   -0.756802
b    0.656987
c    0.958924
d    0.141120
dtype: float64

In [14]:
# `in` tests membership among the index labels, like a dict key lookup.
'a' in s2

True

In [15]:
# False: 2 is not one of the index labels ('a'..'d').
2 in s2

False

In [16]:
# Plain dict mapping a country name to a number.
di = {'Taiwan': 23400, 'Japan': 34034, 'Korea': 20302}

In [17]:
# Building a Series from a dict: keys become the index, values the data.
s3 = Series(di)
s3

Taiwan    23400
Japan     34034
Korea     20302
dtype: int64

In [18]:
idx_perm = ['Japan', 'Korea', 'Taiwan'] # permuted index (same keys as di, different order)

In [19]:
# An explicit index controls the order of the resulting Series.
Series(di, index=idx_perm)

Japan     34034
Korea     20302
Taiwan    23400
dtype: int64

In [20]:
idx_perm2 = ['Japan', 'Korea', 'Taiwan', 'Mongolia'] # permuted index plus one extra label

In [21]:
# 'Mongolia' has no entry in `di`, so its value is NaN and the dtype
# becomes float64.
s4 = Series(di, index=idx_perm2)
s4

Japan       34034.0
Korea       20302.0
Taiwan      23400.0
Mongolia        NaN
dtype: float64

In [22]:
# Top-level function: True where a value is missing.
pd.isnull(s4)

Japan       False
Korea       False
Taiwan      False
Mongolia     True
dtype: bool

In [23]:
# Top-level function: True where a value is present.
pd.notnull(s4)

Japan        True
Korea        True
Taiwan       True
Mongolia    False
dtype: bool

In [24]:
# Method form; gives the same result as pd.isnull(s4).
s4.isnull()

Japan       False
Korea       False
Taiwan      False
Mongolia     True
dtype: bool

In [25]:
# Method form; gives the same result as pd.notnull(s4).
s4.notnull()

Japan        True
Korea        True
Taiwan       True
Mongolia    False
dtype: bool

In [26]:
# Addition aligns by label, not by position; the result index is the sorted
# union of both indexes, and 'Mongolia' stays NaN.
s3 + s4

Japan       68068.0
Korea       40604.0
Mongolia        NaN
Taiwan      46800.0
dtype: float64

In [33]:
s4.name = 'gdp' # `name` and `index.name` are built-in attributes; both appear in the repr
s4.index.name = 'country' 

In [34]:
# The repr now shows the index name above the labels and `Name: gdp` in the footer.
s4

country
Japan       34034.0
Korea       20302.0
Taiwan      23400.0
Mongolia        NaN
Name: gdp, dtype: float64

## DataFrame

In [69]:
# Column-oriented input: each dict key is a column name and each list holds
# that column's values (all lists have the same length).
data = {
    'city': ['Tokyo', 'Hong Kong', 'Manila', 'Taipei', 'Singapore'],
    'country': ['Japan', 'China', 'Philippines', 'Taiwan', 'Singapore'],
    'population': [30.0, 6.5, 64.0, 25.2, 8.9],
}
frame = pd.DataFrame(data=data)

In [70]:
# Show the whole frame; the rows carry a default integer index 0..4.
frame

Unnamed: 0,city,country,population
0,Tokyo,Japan,30.0
1,Hong Kong,China,6.5
2,Manila,Philippines,64.0
3,Taipei,Taiwan,25.2
4,Singapore,Singapore,8.9


In [71]:
# Only the first two rows.
frame.head(2)

Unnamed: 0,city,country,population
0,Tokyo,Japan,30.0
1,Hong Kong,China,6.5


In [72]:
# `columns` fixes the order in which the columns appear.
pd.DataFrame(data, columns=['country', 'population', 'city'])

Unnamed: 0,country,population,city
0,Japan,30.0,Tokyo
1,China,6.5,Hong Kong
2,Philippines,64.0,Manila
3,Taiwan,25.2,Taipei
4,Singapore,8.9,Singapore


In [85]:
# 'debt' is not a key of `data`, so that column is created with missing values.
pd.DataFrame(data, columns=['country', 'city', 'population', 'debt'])

Unnamed: 0,country,city,population,debt
0,Japan,Tokyo,30.0,
1,China,Hong Kong,6.5,
2,Philippines,Manila,64.0,
3,Taiwan,Taipei,25.2,
4,Singapore,Singapore,8.9,


In [86]:
# Same data, now with explicit row labels 'a'..'e' and an initially empty
# 'debt' column.
frame2 = pd.DataFrame(
    data,
    columns=['country', 'population', 'city', 'debt'],
    index=['a', 'b', 'c', 'd', 'e'],
)
frame2

Unnamed: 0,country,population,city,debt
a,Japan,30.0,Tokyo,
b,China,6.5,Hong Kong,
c,Philippines,64.0,Manila,
d,Taiwan,25.2,Taipei,
e,Singapore,8.9,Singapore,


In [87]:
# The column labels are held in an Index object.
frame2.columns

Index(['country', 'population', 'city', 'debt'], dtype='object')

In [88]:
# Attribute access returns the column as a Series, named after the column.
frame2.country

a          Japan
b          China
c    Philippines
d         Taiwan
e      Singapore
Name: country, dtype: object

In [89]:
# .loc selects a row by label; the row comes back as a Series indexed by column name.
frame2.loc['c']

country       Philippines
population           64.0
city               Manila
debt                  NaN
Name: c, dtype: object

In [90]:
# Assigning a scalar fills every row of the column with that value.
frame2['debt'] = 101
frame2

Unnamed: 0,country,population,city,debt
a,Japan,30.0,Tokyo,101
b,China,6.5,Hong Kong,101
c,Philippines,64.0,Manila,101
d,Taiwan,25.2,Taipei,101
e,Singapore,8.9,Singapore,101


In [91]:
# Assigning an array supplies one value per row (5 rows: 0, 101, ..., 404).
frame2['debt'] = np.arange(5) * 101
frame2

Unnamed: 0,country,population,city,debt
a,Japan,30.0,Tokyo,0
b,China,6.5,Hong Kong,101
c,Philippines,64.0,Manila,202
d,Taiwan,25.2,Taipei,303
e,Singapore,8.9,Singapore,404


In [93]:
# Assigning a Series aligns on index labels: rows without a matching label
# ('b' and 'd') get NaN, and the column becomes float.
val = pd.Series([2103, 3100, 1089], index=['a', 'c', 'e'])
frame2['debt'] = val
frame2

Unnamed: 0,country,population,city,debt
a,Japan,30.0,Tokyo,2103.0
b,China,6.5,Hong Kong,
c,Philippines,64.0,Manila,3100.0
d,Taiwan,25.2,Taipei,
e,Singapore,8.9,Singapore,1089.0


In [100]:
# Add two boolean flag columns computed from a comparison on 'country'.
# 'north-east-asia' has to be created here too: the outputs of this cell and
# the following ones display it, yet no cell defined it (it was left over from
# an earlier kernel session), so Restart & Run All could not reproduce them.
frame2['north-east-asia'] = frame2['country'] == 'Japan'
frame2['north-east'] = frame2['country'] == 'Japan'
frame2

Unnamed: 0,country,population,city,debt,north-east-asia,north-east
a,Japan,30.0,Tokyo,2103.0,True,True
b,China,6.5,Hong Kong,,False,False
c,Philippines,64.0,Manila,3100.0,False,False
d,Taiwan,25.2,Taipei,,False,False
e,Singapore,8.9,Singapore,1089.0,False,False


In [101]:
# `del` removes a column in place, the same way a key is deleted from a dict.
del frame2['north-east']
frame2.columns

Index(['country', 'population', 'city', 'debt', 'north-east-asia'], dtype='object')

In [102]:
frame2.T # transpose: rows and columns swap places

Unnamed: 0,a,b,c,d,e
country,Japan,China,Philippines,Taiwan,Singapore
population,30.0,6.5,64.0,25.2,8.9
city,Tokyo,Hong Kong,Manila,Taipei,Singapore
debt,2103.0,,3100.0,,1089.0
north-east-asia,True,False,False,False,False


In [107]:
# Name the row index and the column index; both names show up in the
# header of the displayed frame.
frame2.index.name = 'alphabet'
frame2.columns.name = 'info'
frame2

info,country,population,city,debt,north-east-asia
alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,Japan,30.0,Tokyo,2103.0,True
b,China,6.5,Hong Kong,,False
c,Philippines,64.0,Manila,3100.0,False
d,Taiwan,25.2,Taipei,,False
e,Singapore,8.9,Singapore,1089.0,False


In [110]:
# The data as a 2-D NumPy array; because the columns mix strings, floats and
# booleans, the array has dtype=object.
frame2.values

array([['Japan', 30.0, 'Tokyo', 2103.0, True],
       ['China', 6.5, 'Hong Kong', nan, False],
       ['Philippines', 64.0, 'Manila', 3100.0, False],
       ['Taiwan', 25.2, 'Taipei', nan, False],
       ['Singapore', 8.9, 'Singapore', 1089.0, False]], dtype=object)

## Index

In [126]:
# Grab the Index object of a small Series and slice it.
obj = pd.Series(range(3), index=list('ABC'))
idx = obj.index
idx[1:]  # slicing an Index yields another Index

Index(['B', 'C'], dtype='object')

In [127]:
# The index of a Series is a pandas Index object.
type(idx)

pandas.core.indexes.base.Index

In [123]:
# An Index can also be built directly from an array.
labels = pd.Index(np.arange(3))
labels

Index([0, 1, 2], dtype='int64')

In [125]:
# Membership test against the labels stored in the Index.
1 in labels

True

In [129]:
# The table as a whole is a DataFrame.
type(frame2)

pandas.core.frame.DataFrame

In [133]:
# A single column selected from it is a Series.
type(frame2['debt'])

pandas.core.series.Series

In [132]:
# The column labels are an Index, the same type used for row labels.
type(frame2.columns)

pandas.core.indexes.base.Index

### `reindex` — Reindex a dataframe
[`pandas.DataFrame.reindex`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html)

In [139]:
# Series whose labels are deliberately not in alphabetical order.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [140]:
# Rearranges the data to the new label order; 'e' has no existing value, so
# it appears as NaN.
obj.reindex(['c', 'd', 'b', 'e', 'a'])

c    3.6
d    4.5
b    7.2
e    NaN
a   -5.3
dtype: float64

In [147]:
# Integer labels with gaps (0, 2, 4) to set up the fill example.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [149]:
# Forward fill: each new label (1, 3, 5) takes the value of the label before it.
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

`method` — how to fill the holes in a reindexed Series/DataFrame. *Only applicable to DataFrames/Series with a monotonically increasing/decreasing index.*
- `None` (default): don’t fill gaps
- `pad` / `ffill`: Propagate last valid observation forward to next valid.
- `backfill` / `bfill`: Use next valid observation to fill gap.
- `nearest`: Use nearest valid observations to fill gap.

In [160]:
# 3x3 frame of the integers 0..8, with row labels that skip 'b' and one
# column per city.
cities = ['Taipak', 'Kohiong', 'Taitiong']
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=cities,
)
frame

Unnamed: 0,Taipak,Kohiong,Taitiong
a,0,1,2
c,3,4,5
d,6,7,8


In [161]:
# Reindex the rows: the new label 'b' is filled with NaN, which turns the
# integer columns into floats.
frame.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,Taipak,Kohiong,Taitiong
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [162]:
# Reindex the columns: 'Kelang' is new and therefore all missing, while
# 'Taitiong' is left out because it is not in the new list.
cities_new = ['Kohiong', 'Kelang', 'Taipak']
frame.reindex(columns=cities_new)

Unnamed: 0,Kohiong,Kelang,Taipak
a,1,,0
c,4,,3
d,7,,6


In [163]:
# Label-based selection of rows and columns, returned in the order given.
frame.loc[['a', 'd', 'c'], cities]

Unnamed: 0,Taipak,Kohiong,Taitiong
a,0,1,2
d,6,7,8
c,3,4,5


### `drop` — drop specified labels from rows or columns
[`pandas.DataFrame.drop`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)

In [165]:
# Five floats 0.0..4.0 labelled 'a'..'e'.
obj = pd.Series(np.arange(5, dtype=float), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [167]:
# drop returns a new Series without the label 'c'; `obj` itself is not modified.
obj2 = obj.drop('c')
obj2

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [168]:
# Several labels can be dropped at once by passing a list.
obj2 = obj.drop(['c', 'a'])
obj2

b    1.0
d    3.0
e    4.0
dtype: float64

In [None]:
df = pd.DataFrame(np.arange(16).reshape((4,4))