# 10 Minutes to pandas

In [68]:
import numpy as np 
import matplotlib.pyplot as plt 

import pandas as pd 

## Object Creation

In [69]:
#== Series by passing list: Index is auto-created ==
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [70]:
#== DataFrame by passing numpy array
#   index column is specified (as dates). 
#   column name is also given
dates = pd.date_range('20180101', periods=6)
print(dates, '\n')

# random data 6(rows)x4(columns)
data = np.random.randn(6,4)

df = pd.DataFrame(data, index=dates, columns=['A','B','C','D'])
print(df)

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D') 

                   A         B         C         D
2018-01-01 -0.555704  0.033537  0.547586  1.592913
2018-01-02 -1.230058 -0.125547  0.124874 -0.644727
2018-01-03  0.955791  0.778720  0.377020  0.986278
2018-01-04 -0.860820  0.287548  2.773876 -2.570816
2018-01-05 -0.090868  0.420237 -1.353417 -2.420090
2018-01-06 -0.797660 -0.082788  2.509065 -1.084843


In [71]:
#== DataFame by passing dict 
df2 = pd.DataFrame({ 
    'A': 1., 
    'B': pd.Timestamp('20180101'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3]*3+[4], dtype='int32'),
    'E': pd.Categorical(["test","train","train","test"]),
    'F': 'foo'})
print(df2,'\n')
print(df2.dtypes)

     A          B    C  D      E    F
0  1.0 2018-01-01  1.0  3   test  foo
1  1.0 2018-01-01  1.0  3  train  foo
2  1.0 2018-01-01  1.0  3  train  foo
3  1.0 2018-01-01  1.0  4   test  foo 

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [72]:
#== interactive browsing 
df.head(3) # only 3 data rows

Unnamed: 0,A,B,C,D
2018-01-01,-0.555704,0.033537,0.547586,1.592913
2018-01-02,-1.230058,-0.125547,0.124874,-0.644727
2018-01-03,0.955791,0.77872,0.37702,0.986278


In [73]:
df.tail(3)

Unnamed: 0,A,B,C,D
2018-01-04,-0.86082,0.287548,2.773876,-2.570816
2018-01-05,-0.090868,0.420237,-1.353417,-2.42009
2018-01-06,-0.79766,-0.082788,2.509065,-1.084843


In [74]:
#== Access to internal of DataFrame 

print(df.index, '\n')
print(df.columns, '\n')
print(df.values,'\n')  #as numpy array
print(df.values.shape)

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D') 

Index(['A', 'B', 'C', 'D'], dtype='object') 

[[-0.5557044   0.03353732  0.54758616  1.59291253]
 [-1.2300578  -0.12554719  0.12487448 -0.64472679]
 [ 0.95579115  0.77872029  0.3770196   0.98627773]
 [-0.86082003  0.28754822  2.77387626 -2.57081626]
 [-0.09086792  0.42023689 -1.3534175  -2.42008974]
 [-0.79766026 -0.08278798  2.50906455 -1.08484293]] 

(6, 4)


In [75]:
#== Describe : Show quick status summary (interactive tool)
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.429887,0.218618,0.829834,-0.690214
std,0.776157,0.347179,1.558677,1.715239
min,-1.230058,-0.125547,-1.353417,-2.570816
25%,-0.84503,-0.053707,0.187911,-2.086278
50%,-0.676682,0.160543,0.462303,-0.864785
75%,-0.207077,0.387065,2.018695,0.578527
max,0.955791,0.77872,2.773876,1.592913


In [76]:
#== Transpose (like numpy)
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,-0.555704,-1.230058,0.955791,-0.86082,-0.090868,-0.79766
B,0.033537,-0.125547,0.77872,0.287548,0.420237,-0.082788
C,0.547586,0.124874,0.37702,2.773876,-1.353417,2.509065
D,1.592913,-0.644727,0.986278,-2.570816,-2.42009,-1.084843


In [77]:
#== Sorting axis by labels (axis=0:row, axis=1:column)
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,1.592913,0.547586,0.033537,-0.555704
2018-01-02,-0.644727,0.124874,-0.125547,-1.230058
2018-01-03,0.986278,0.37702,0.77872,0.955791
2018-01-04,-2.570816,2.773876,0.287548,-0.86082
2018-01-05,-2.42009,-1.353417,0.420237,-0.090868
2018-01-06,-1.084843,2.509065,-0.082788,-0.79766


In [78]:
#== Sorting by values of a column 
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2018-01-02,-1.230058,-0.125547,0.124874,-0.644727
2018-01-06,-0.79766,-0.082788,2.509065,-1.084843
2018-01-01,-0.555704,0.033537,0.547586,1.592913
2018-01-04,-0.86082,0.287548,2.773876,-2.570816
2018-01-05,-0.090868,0.420237,-1.353417,-2.42009
2018-01-03,0.955791,0.77872,0.37702,0.986278


## Selection

In [81]:
#== Select by column (like map, or structure)
df['A']

2018-01-01   -0.555704
2018-01-02   -1.230058
2018-01-03    0.955791
2018-01-04   -0.860820
2018-01-05   -0.090868
2018-01-06   -0.797660
Freq: D, Name: A, dtype: float64

In [93]:
#== Slicing rows, either by row index or data index
#   Note: Slicing by data index is **inclusive** range specification. 
#         Slicing by row index is same as numpy slice (last element is exclusive)
print(df[0:3],'\n')
print(df['20180102':'20180103'], '\n')

x=np.array([0,1,2,3,4])
print('Numpy slice x[0:3]=', x[0:3])

                   A         B         C         D
2018-01-01 -0.555704  0.033537  0.547586  1.592913
2018-01-02 -1.230058 -0.125547  0.124874 -0.644727
2018-01-03  0.955791  0.778720  0.377020  0.986278 

                   A         B         C         D
2018-01-02 -1.230058 -0.125547  0.124874 -0.644727
2018-01-03  0.955791  0.778720  0.377020  0.986278 

Numpy slice x[0:3]= [0 1 2]


In [96]:
#=== By Label: row selection by 'data index'
df.loc[dates[0]]

A   -0.555704
B    0.033537
C    0.547586
D    1.592913
Name: 2018-01-01 00:00:00, dtype: float64

In [98]:
#=== By Label: Selecting multi-axis by column labels 
df.loc[:,['A','B']]

Unnamed: 0,A,B
2018-01-01,-0.555704,0.033537
2018-01-02,-1.230058,-0.125547
2018-01-03,0.955791,0.77872
2018-01-04,-0.86082,0.287548
2018-01-05,-0.090868,0.420237
2018-01-06,-0.79766,-0.082788


In [101]:
#== By Label: row range by data index, column range by column names 
df.loc['20180101':'20180103', ['A','B']]

Unnamed: 0,A,B
2018-01-01,-0.555704,0.033537
2018-01-02,-1.230058,-0.125547
2018-01-03,0.955791,0.77872


In [103]:
#== By label: only one row -> dimension reduction 
df.loc['20180103', ['A','B']]

A    0.955791
B    0.778720
Name: 2018-01-03 00:00:00, dtype: float64

In [105]:
#== By Label: A parcular cell (scalar value)
df.loc['20180103','A']

0.95579114900553885

In [108]:
#== By Label: Same as above, but faster 
df.at[dates[2],'A']

0.95579114900553885

In [112]:
#== By position (row/column indexes)
print(df.iloc[3],'\n')

print(df.iloc[3:5,0:2],'\n')

print(df.iloc[[1,2,4],[0,2]],'\n')

print(df.iloc[1:3,:],'\n')

print(df.iloc[:,1:3],'\n')

print(df.iloc[1,1])

A   -0.860820
B    0.287548
C    2.773876
D   -2.570816
Name: 2018-01-04 00:00:00, dtype: float64 

                   A         B
2018-01-04 -0.860820  0.287548
2018-01-05 -0.090868  0.420237 

                   A         C
2018-01-02 -1.230058  0.124874
2018-01-03  0.955791  0.377020
2018-01-05 -0.090868 -1.353417 

                   A         B         C         D
2018-01-02 -1.230058 -0.125547  0.124874 -0.644727
2018-01-03  0.955791  0.778720  0.377020  0.986278 

                   B         C
2018-01-01  0.033537  0.547586
2018-01-02 -0.125547  0.124874
2018-01-03  0.778720  0.377020
2018-01-04  0.287548  2.773876
2018-01-05  0.420237 -1.353417
2018-01-06 -0.082788  2.509065 

-0.125547194261


In [117]:
#== Boolean Indexing 
print(df[df.B>0],'\n')

print(df[df>0],'\n')

                   A         B         C         D
2018-01-01 -0.555704  0.033537  0.547586  1.592913
2018-01-03  0.955791  0.778720  0.377020  0.986278
2018-01-04 -0.860820  0.287548  2.773876 -2.570816
2018-01-05 -0.090868  0.420237 -1.353417 -2.420090 

                   A         B         C         D
2018-01-01       NaN  0.033537  0.547586  1.592913
2018-01-02       NaN       NaN  0.124874       NaN
2018-01-03  0.955791  0.778720  0.377020  0.986278
2018-01-04       NaN  0.287548  2.773876       NaN
2018-01-05       NaN  0.420237       NaN       NaN
2018-01-06       NaN       NaN  2.509065       NaN 



In [122]:
#== Using isin() method 
print(df2,'\n')

print(df2[df2['E'].isin(['test'])])  # select E column=='test' only 

     A          B    C  D      E    F
0  1.0 2018-01-01  1.0  3   test  foo
1  1.0 2018-01-01  1.0  3  train  foo
2  1.0 2018-01-01  1.0  3  train  foo
3  1.0 2018-01-01  1.0  4   test  foo 

     A          B    C  D     E    F
0  1.0 2018-01-01  1.0  3  test  foo
3  1.0 2018-01-01  1.0  4  test  foo


## Setting (by Selection)

In [125]:
# prerequisite:  len(df) = # of data rows 
len(df)

6

In [135]:
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20180102',periods=6))
print(s1,'\n')

#== Set by Series (new column)
df['F'] = s1;   # Note that last element of s1 is not set to df[]. Not union, outer join. 
print(df,'\n')

#== Set by label 
df.at[dates[0], 'A'] = 0

#== Set by position index 
df.iat[0,1] = -1  # 'B' column first row

#== Set by numpy array 
df.loc[:,'D'] = np.array([5]*len(df))

print(df)

2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
2018-01-06    5
2018-01-07    6
Freq: D, dtype: int64 

                   A         B         C  D    F
2018-01-01  0.000000 -1.000000  0.547586  5  NaN
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0
2018-01-04 -0.860820  0.287548  2.773876  5  3.0
2018-01-05 -0.090868  0.420237 -1.353417  5  4.0
2018-01-06 -0.797660 -0.082788  2.509065  5  5.0 

                   A         B         C  D    F
2018-01-01  0.000000 -1.000000  0.547586  5  NaN
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0
2018-01-04 -0.860820  0.287548  2.773876  5  3.0
2018-01-05 -0.090868  0.420237 -1.353417  5  4.0
2018-01-06 -0.797660 -0.082788  2.509065  5  5.0


In [137]:
#== Selective overwite 
dfx = df.copy()
dfx[dfx < 0] = -dfx  # abs
print(dfx)

                   A         B         C  D    F
2018-01-01  0.000000  1.000000  0.547586  5  NaN
2018-01-02  1.230058  0.125547  0.124874  5  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0
2018-01-04  0.860820  0.287548  2.773876  5  3.0
2018-01-05  0.090868  0.420237  1.353417  5  4.0
2018-01-06  0.797660  0.082788  2.509065  5  5.0


## Missing Data

In [140]:
#== Reindex and get new DataFrame (subset) 
dfx = df.reindex( index=dates[0:4], columns=list(df.columns) + ['E'])
print(dfx, '\n')

dfx.loc[dates[0]:dates[1], 'E'] = 1
print(dfx, '\n')

                   A         B         C  D    F   E
2018-01-01  0.000000 -1.000000  0.547586  5  NaN NaN
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0 NaN
2018-01-03  0.955791  0.778720  0.377020  5  2.0 NaN
2018-01-04 -0.860820  0.287548  2.773876  5  3.0 NaN 

                   A         B         C  D    F    E
2018-01-01  0.000000 -1.000000  0.547586  5  NaN  1.0
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0  NaN
2018-01-04 -0.860820  0.287548  2.773876  5  3.0  NaN 



In [142]:
#== Drop any rows having missing data 
print(dfx.dropna(how='any'))

                   A         B         C  D    F    E
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0  1.0


In [145]:
#== Filling missing value 
print(dfx.fillna(value=-999.25),'\n')
print(dfx) # original data remains unchanged. 

                   A         B         C  D       F       E
2018-01-01  0.000000 -1.000000  0.547586  5 -999.25    1.00
2018-01-02 -1.230058 -0.125547  0.124874  5    1.00    1.00
2018-01-03  0.955791  0.778720  0.377020  5    2.00 -999.25
2018-01-04 -0.860820  0.287548  2.773876  5    3.00 -999.25 

                   A         B         C  D    F    E
2018-01-01  0.000000 -1.000000  0.547586  5  NaN  1.0
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0  NaN
2018-01-04 -0.860820  0.287548  2.773876  5  3.0  NaN


In [151]:
#== Check if NaN or not 
#print(pd.isna(dfx)) <== depreted?? 
print(pd.isnull(dfx))

                A      B      C      D      F      E
2018-01-01  False  False  False  False   True  False
2018-01-02  False  False  False  False  False  False
2018-01-03  False  False  False  False  False   True
2018-01-04  False  False  False  False  False   True


## Operations

### Stats

In [155]:
# mean() along each column 
df.mean() # same as df.mean(0)

A   -0.337269
B    0.046362
C    0.829834
D    5.000000
F    3.000000
dtype: float64

In [158]:
# mean() along each row (axis=1)
df.mean(1)

2018-01-01    1.136897
2018-01-02    0.953854
2018-01-03    1.822306
2018-01-04    2.040121
2018-01-05    1.595190
2018-01-06    2.325723
Freq: D, dtype: float64

In [162]:
# subtract 
print(df,'\n')

s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) # shift 2 rows down (2 NaN at top)
print(s,'\n')

# subtract 
print(df.sub(s,axis='index'))

                   A         B         C  D    F
2018-01-01  0.000000 -1.000000  0.547586  5  NaN
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0
2018-01-04 -0.860820  0.287548  2.773876  5  3.0
2018-01-05 -0.090868  0.420237 -1.353417  5  4.0
2018-01-06 -0.797660 -0.082788  2.509065  5  5.0 

2018-01-01    NaN
2018-01-02    NaN
2018-01-03    1.0
2018-01-04    3.0
2018-01-05    5.0
2018-01-06    NaN
Freq: D, dtype: float64 

                   A         B         C    D    F
2018-01-01       NaN       NaN       NaN  NaN  NaN
2018-01-02       NaN       NaN       NaN  NaN  NaN
2018-01-03 -0.044209 -0.221280 -0.622980  4.0  1.0
2018-01-04 -3.860820 -2.712452 -0.226124  2.0  0.0
2018-01-05 -5.090868 -4.579763 -6.353417  0.0 -1.0
2018-01-06       NaN       NaN       NaN  NaN  NaN


### Apply 

In [166]:
# Apply np.cumsum() to each column 
print(df.apply(np.cumsum))

                   A         B         C   D     F
2018-01-01  0.000000 -1.000000  0.547586   5   NaN
2018-01-02 -1.230058 -1.125547  0.672461  10   1.0
2018-01-03 -0.274267 -0.346827  1.049480  15   3.0
2018-01-04 -1.135087 -0.059279  3.823357  20   6.0
2018-01-05 -1.225955  0.360958  2.469939  25  10.0
2018-01-06 -2.023615  0.278170  4.979004  30  15.0


In [174]:
# Apply lambda function to each column.  This example yilelds scalar value from function.
# As seen below, NaN is ignored in calculation. 
print(df,'\n')
print(df.apply(lambda x: x.max()-x.min()))

                   A         B         C  D    F
2018-01-01  0.000000 -1.000000  0.547586  5  NaN
2018-01-02 -1.230058 -0.125547  0.124874  5  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0
2018-01-04 -0.860820  0.287548  2.773876  5  3.0
2018-01-05 -0.090868  0.420237 -1.353417  5  4.0
2018-01-06 -0.797660 -0.082788  2.509065  5  5.0 

A    2.185849
B    1.778720
C    4.127294
D    0.000000
F    4.000000
dtype: float64


In [179]:
print(df.apply(lambda x: abs(x)),'\n')  # example of element-by-element lambda fx
print(df.abs()) # same result (to confirm)

                   A         B         C  D    F
2018-01-01  0.000000  1.000000  0.547586  5  NaN
2018-01-02  1.230058  0.125547  0.124874  5  1.0
2018-01-03  0.955791  0.778720  0.377020  5  2.0
2018-01-04  0.860820  0.287548  2.773876  5  3.0
2018-01-05  0.090868  0.420237  1.353417  5  4.0
2018-01-06  0.797660  0.082788  2.509065  5  5.0 

                   A         B         C    D    F
2018-01-01  0.000000  1.000000  0.547586  5.0  NaN
2018-01-02  1.230058  0.125547  0.124874  5.0  1.0
2018-01-03  0.955791  0.778720  0.377020  5.0  2.0
2018-01-04  0.860820  0.287548  2.773876  5.0  3.0
2018-01-05  0.090868  0.420237  1.353417  5.0  4.0
2018-01-06  0.797660  0.082788  2.509065  5.0  5.0


### Histogram 

In [186]:
# sample data: note that Series auto-generate index 
s = pd.Series(np.random.randint(0,7,size=10))  #low,high,size 
print(s,'\n')

# histogram by value_counts
s.value_counts()

0    5
1    1
2    1
3    0
4    5
5    4
6    3
7    2
8    4
9    0
dtype: int64 



5    2
4    2
1    2
0    2
3    1
2    1
dtype: int64

### String method

In [189]:
s = pd.Series(['A','B','C',np.nan,'Dog'])
s.str.lower()

0      a
1      b
2      c
3    NaN
4    dog
dtype: object

## Merge