In [1]:
import numpy as np
import pandas as pd


# Pandas Series

A Series is a single column of a DataFrame. A Series does not have a column name 

In [2]:
pd.Series([1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')

2015 Sales    30
2016 Sales    35
2017 Sales    40
Name: Product A, dtype: int64

In [5]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data)
s

a    0.0
b    1.0
c    2.0
dtype: float64

In [9]:
s['a']

1

In [8]:
#retrieve multiple elements
s[['a','c','d']]

a    1
c    3
d    4
dtype: int64

In [10]:
s = pd.Series(5, index=[0, 1, 2, 3]) 
s

0    5
1    5
2    5
3    5
dtype: int64

In [31]:
s = pd.Series([1,2,3,4,-5],index = ['a','b','c','d','e'])
s


a    1
b    2
c    3
d    4
e   -5
dtype: int64

In [24]:
s.size

5

In [25]:
# number of non-null values
s.count()

5

In [26]:
# product of values
s.prod()

120

In [27]:
# Cumulative Sum
s.cumsum()

a     1
b     3
c     6
d    10
e    15
dtype: int64

In [28]:
#Cumulative Products
s.cumprod()

a      1
b      2
c      6
d     24
e    120
dtype: int64

In [35]:
print(s.min())
print(s.max())
print(s.median())
print(s.std())


-5
4
2.0
3.5355339059327378
abs
a    1
b    2
c    3
d    4
e    5
dtype: int64


In [36]:
print("abs")
print(s.abs())

abs
a    1
b    2
c    3
d    4
e    5
dtype: int64


In [37]:
s.mode()

0   -5
1    1
2    2
3    3
4    4
dtype: int64

In [38]:
s.sort_values()

e   -5
a    1
b    2
c    3
d    4
dtype: int64

In [41]:
s

a    1
b    2
c    3
d    4
e   -5
dtype: int64

In [39]:
s.argmax()

3

In [40]:
s.max()

4

In [42]:
sorted(s)

[-5, 1, 2, 3, 4]

# DataFrame

In [45]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [52]:
type(dates)

pandas.core.indexes.datetimes.DatetimeIndex

In [56]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.285164,-0.362255,-0.323429,-0.755384
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136
2013-01-04,-0.165609,0.409328,2.006479,-1.370305
2013-01-05,1.226009,0.728823,1.337384,0.104674
2013-01-06,-1.387106,-1.250046,0.047053,0.633616


In [57]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [50]:
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [51]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [53]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [54]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [58]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.294738,-0.488683,0.41845,-0.562127
std,0.989428,0.892343,1.466208,0.833459
min,-1.387106,-1.250368,-1.969999,-1.511136
25%,-0.052916,-1.23943,-0.230809,-1.216575
50%,0.42109,-0.784918,0.692218,-0.614806
75%,1.058761,0.216432,1.394255,-0.040051
max,1.252953,0.728823,2.006479,0.633616


In [61]:
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,0.285164,-0.362255,-0.323429,-0.755384
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136
2013-01-04,-0.165609,0.409328,2.006479,-1.370305
2013-01-05,1.226009,0.728823,1.337384,0.104674
2013-01-06,-1.387106,-1.250046,0.047053,0.633616


In [62]:
df.sort_values(by='C')

Unnamed: 0,A,B,C,D
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136
2013-01-01,0.285164,-0.362255,-0.323429,-0.755384
2013-01-06,-1.387106,-1.250046,0.047053,0.633616
2013-01-05,1.226009,0.728823,1.337384,0.104674
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-04,-0.165609,0.409328,2.006479,-1.370305


In [59]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.285164,1.252953,0.557016,-0.165609,1.226009,-1.387106
B,-0.362255,-1.250368,-1.207581,0.409328,0.728823,-1.250046
C,-0.323429,1.413212,-1.969999,2.006479,1.337384,0.047053
D,-0.755384,-0.474227,-1.511136,-1.370305,0.104674,0.633616


In [64]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.285164,-0.362255,-0.323429,-0.755384
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136
2013-01-04,-0.165609,0.409328,2.006479,-1.370305
2013-01-05,1.226009,0.728823,1.337384,0.104674
2013-01-06,-1.387106,-1.250046,0.047053,0.633616


In [111]:
len(df)

6

In [112]:
df.size

30

In [114]:
df.shape

(6, 5)

## Selection

In [63]:
df['A']

2013-01-01    0.285164
2013-01-02    1.252953
2013-01-03    0.557016
2013-01-04   -0.165609
2013-01-05    1.226009
2013-01-06   -1.387106
Freq: D, Name: A, dtype: float64

In [65]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.285164,-0.362255,-0.323429,-0.755384
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136


In [66]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136
2013-01-04,-0.165609,0.409328,2.006479,-1.370305


### Selection by label


In [73]:
df.loc[:,'A']

2013-01-01    0.285164
2013-01-02    1.252953
2013-01-03    0.557016
2013-01-04   -0.165609
2013-01-05    1.226009
2013-01-06   -1.387106
Freq: D, Name: A, dtype: float64

In [74]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.285164,-0.362255
2013-01-02,1.252953,-1.250368
2013-01-03,0.557016,-1.207581
2013-01-04,-0.165609,0.409328
2013-01-05,1.226009,0.728823
2013-01-06,-1.387106,-1.250046


In [77]:
df.loc['20130103']

A    0.557016
B   -1.207581
C   -1.969999
D   -1.511136
Name: 2013-01-03 00:00:00, dtype: float64

In [78]:
df.loc['20130103','A']

0.5570156393225211

### Selection by position

In [79]:
df.iloc[3]

A   -0.165609
B    0.409328
C    2.006479
D   -1.370305
Name: 2013-01-04 00:00:00, dtype: float64

In [83]:
df.iloc[3,3]

-1.3703051064898302

In [80]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.165609,0.409328
2013-01-05,1.226009,0.728823


In [81]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,1.252953,1.413212
2013-01-03,0.557016,-1.969999
2013-01-05,1.226009,1.337384


In [84]:
df.iloc[:, 2]

2013-01-01   -0.323429
2013-01-02    1.413212
2013-01-03   -1.969999
2013-01-04    2.006479
2013-01-05    1.337384
2013-01-06    0.047053
Freq: D, Name: C, dtype: float64

In [85]:
df.iloc[-1,-1]

0.6336158679808335

In [91]:
df.iloc[-1,3]

0.6336158679808335

## Boolean indexing

In [92]:
df['A']>0

2013-01-01     True
2013-01-02     True
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: A, dtype: bool

In [93]:
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.285164,-0.362255,-0.323429,-0.755384
2013-01-02,1.252953,-1.250368,1.413212,-0.474227
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136
2013-01-05,1.226009,0.728823,1.337384,0.104674


In [94]:
df>0

Unnamed: 0,A,B,C,D
2013-01-01,True,False,False,False
2013-01-02,True,False,True,False
2013-01-03,True,False,False,False
2013-01-04,False,True,True,False
2013-01-05,True,True,True,True
2013-01-06,False,False,True,True


In [95]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.285164,,,
2013-01-02,1.252953,,1.413212,
2013-01-03,0.557016,,,
2013-01-04,,0.409328,2.006479,
2013-01-05,1.226009,0.728823,1.337384,0.104674
2013-01-06,,,0.047053,0.633616


In [98]:
df['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df[df['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136,two
2013-01-05,1.226009,0.728823,1.337384,0.104674,four


## Setting

In [102]:
df.at[dates[0], 'A'] = 999
df

Unnamed: 0,A,B,C,D,E
2013-01-01,999.0,-0.362255,-0.323429,-0.755384,one
2013-01-02,1.252953,-1.250368,1.413212,-0.474227,one
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136,two
2013-01-04,-0.165609,0.409328,2.006479,-1.370305,three
2013-01-05,1.226009,0.728823,1.337384,0.104674,four
2013-01-06,-1.387106,-1.250046,0.047053,0.633616,three


In [104]:
df.iloc[0,1]=888
df

Unnamed: 0,A,B,C,D,E
2013-01-01,999.0,888.0,-0.323429,-0.755384,one
2013-01-02,1.252953,-1.250368,1.413212,-0.474227,one
2013-01-03,0.557016,-1.207581,-1.969999,-1.511136,two
2013-01-04,-0.165609,0.409328,2.006479,-1.370305,three
2013-01-05,1.226009,0.728823,1.337384,0.104674,four
2013-01-06,-1.387106,-1.250046,0.047053,0.633616,three


In [106]:
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,E
2013-01-01,999.0,888.0,-0.323429,5,one
2013-01-02,1.252953,-1.250368,1.413212,5,one
2013-01-03,0.557016,-1.207581,-1.969999,5,two
2013-01-04,-0.165609,0.409328,2.006479,5,three
2013-01-05,1.226009,0.728823,1.337384,5,four
2013-01-06,-1.387106,-1.250046,0.047053,5,three


## Missing Data

In [121]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['G'])
df1.loc[0:2,'G' ]=1
df1

Unnamed: 0,A,B,C,D,E,G
2013-01-01,999.0,888.0,-0.323429,5,one,1.0
2013-01-02,1.252953,-1.250368,1.413212,5,one,1.0
2013-01-03,0.557016,-1.207581,-1.969999,5,two,
2013-01-04,-0.165609,0.409328,2.006479,5,three,


In [123]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E,G
2013-01-01,999.0,888.0,-0.323429,5,one,1.0
2013-01-02,1.252953,-1.250368,1.413212,5,one,1.0


In [124]:
df1

Unnamed: 0,A,B,C,D,E,G
2013-01-01,999.0,888.0,-0.323429,5,one,1.0
2013-01-02,1.252953,-1.250368,1.413212,5,one,1.0
2013-01-03,0.557016,-1.207581,-1.969999,5,two,
2013-01-04,-0.165609,0.409328,2.006479,5,three,


In [125]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E,G
2013-01-01,999.0,888.0,-0.323429,5,one,1.0
2013-01-02,1.252953,-1.250368,1.413212,5,one,1.0
2013-01-03,0.557016,-1.207581,-1.969999,5,two,5.0
2013-01-04,-0.165609,0.409328,2.006479,5,three,5.0


In [127]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,E,G
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


In [128]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,E,G
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# Sources
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html