# Pandas Series

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [4]:
s = pd.Series(np.random.randn(5),index=['a','b','c','d','e'])

In [5]:
s

a   -0.664848
b    1.762448
c    0.561403
d   -0.568947
e    1.827330
dtype: float64

In [6]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [7]:
pd.Series(np.random.randn(5))

0    0.050095
1   -0.283929
2    1.089566
3   -1.399544
4    0.121024
dtype: float64

In [8]:
d = {'b':1,'a':0,'c':2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [9]:
pd.Series(5., index=['a','b','c','d','e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [10]:
s[0]

-0.6648482092423232

In [11]:
s[:3]

a   -0.664848
b    1.762448
c    0.561403
dtype: float64

In [12]:
s[s>s.median()]

b    1.762448
e    1.827330
dtype: float64

In [13]:
s[[4,3,1]]

e    1.827330
d   -0.568947
b    1.762448
dtype: float64

In [14]:
np.exp(s)

a    0.514352
b    5.826682
c    1.753130
d    0.566121
e    6.217261
dtype: float64

In [15]:
s.dtype

dtype('float64')

In [16]:
s.to_numpy()

array([-0.66484821,  1.7624478 ,  0.56140297, -0.56894741,  1.82732952])

In [17]:
s['a']

-0.6648482092423232

In [18]:
s[0]

-0.6648482092423232

In [19]:
s['e']

1.8273295224915707

In [20]:
s

a   -0.664848
b    1.762448
c    0.561403
d   -0.568947
e    1.827330
dtype: float64

In [21]:
'e' in s

True

In [22]:
'f' in s

False

In [23]:
s['f']

KeyError: 'f'

In [24]:
s+s

a   -1.329696
b    3.524896
c    1.122806
d   -1.137895
e    3.654659
dtype: float64

In [25]:
s*2

a   -1.329696
b    3.524896
c    1.122806
d   -1.137895
e    3.654659
dtype: float64

In [26]:
np.exp(s)

a    0.514352
b    5.826682
c    1.753130
d    0.566121
e    6.217261
dtype: float64

In [28]:
x = pd.Series((1,2,2), index = ['a','b','c'])

In [29]:
x

a    1
b    2
c    2
dtype: int64

In [30]:
np.exp(x)

a    2.718282
b    7.389056
c    7.389056
dtype: float64

In [31]:
x+s

a    0.335152
b    3.762448
c    2.561403
d         NaN
e         NaN
dtype: float64

In [32]:
s = pd.Series(np.random.randn(5), name='something')
s

0    1.500436
1   -0.788273
2   -0.837529
3   -1.341891
4    0.111546
Name: something, dtype: float64

# Pandas Dataframe

In [33]:
d = {'one':pd.Series([1.,2.,3.], index=['a','b','c']),
     'two':pd.Series([1.,2.,3.,4.,], index=['a','b','c','d'])}

In [35]:
df = pd.DataFrame(d)

In [36]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [37]:
pd.DataFrame(d,index=['d','b','a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [38]:
pd.DataFrame(d, index=['d','b','a'], columns=['two','three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [39]:
d = {'one':[1.,2.,3.,4.],
     'two':[4.,3.,2.,1.]}

In [40]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [41]:
pd.DataFrame(d, index=['a','b','c','d'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [42]:
pd.DataFrame(pd.Series(np.random.rand(5), name='something'))

Unnamed: 0,something
0,0.289123
1,0.767706
2,0.656474
3,0.395025
4,0.046703


In [43]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [44]:
df['three'] = df['one']*df['two']

In [46]:
df['flag'] = df['one'] >2

In [47]:
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [48]:
del df['two']

In [49]:
df

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [50]:
df['foo'] = 'bar'

In [51]:
df

Unnamed: 0,one,three,flag,foo
a,1.0,1.0,False,bar
b,2.0,4.0,False,bar
c,3.0,9.0,True,bar
d,,,False,bar


In [52]:
df['one_trunc'] = df['one'][:2]

In [53]:
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,4.0,False,bar,2.0
c,3.0,9.0,True,bar,
d,,,False,bar,


In [59]:
df = pd.DataFrame(np.random.randn(8, 3), columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,0.159656,-0.784989,0.602134
1,-0.034103,-0.282426,-2.398916
2,-0.798162,-0.761982,0.694208
3,0.727182,1.446553,-0.735478
4,-0.074692,0.565921,-0.852552
5,-1.765525,-0.288771,0.363109
6,-1.071078,1.295028,2.103286
7,-1.274926,0.167766,1.186387


In [60]:
df*5+2

Unnamed: 0,A,B,C
0,2.798281,-1.924946,5.01067
1,1.829487,0.587871,-9.994578
2,-1.990809,-1.809909,5.471038
3,5.63591,9.232764,-1.67739
4,1.626539,4.829603,-2.262759
5,-6.827624,0.556143,3.815543
6,-3.355391,8.47514,12.516429
7,-4.374629,2.838829,7.931937


In [61]:
1/df

Unnamed: 0,A,B,C
0,6.263462,-1.273903,1.66076
1,-29.323341,-3.540754,-0.416855
2,-1.252879,-1.312367,1.440491
3,1.375172,0.691299,-1.35966
4,-13.388295,1.767033,-1.172949
5,-0.566404,-3.462948,2.753997
6,-0.933639,0.772184,0.475447
7,-0.784359,5.960687,0.842895


In [63]:
1/0.159656

6.263466452873678

In [64]:
df**4

Unnamed: 0,A,B,C
0,0.00065,0.379712,0.131454
1,1e-06,0.006362,33.117682
2,0.405848,0.337115,0.232251
3,0.279623,4.378619,0.292603
4,3.1e-05,0.10257,0.528303
5,9.716176,0.006954,0.017384
6,1.316088,2.812656,19.570108
7,2.642041,0.000792,1.981099


In [65]:
df1 = pd.DataFrame({'a': [1,0,1], 'b':[0,1,1]},dtype=bool)
df2 = pd.DataFrame({'a': [0,1,1], 'b': [1,1,0]}, dtype=bool)
df1&df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [66]:
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [67]:
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [68]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [69]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [70]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


# Pandas dtypes

In [73]:
dft = pd.DataFrame({
    'A':np.random.randn(3),
    'B':1,
    'C':'foo',
    'D':pd.Timestamp('20010102'),
    'E':pd.Series([1.0]*3).astype('float32'),
    'F':False,
    'G':pd.Series([1]*3,dtype='int8')})
dft

Unnamed: 0,A,B,C,D,E,F,G
0,-1.022194,1,foo,2001-01-02,1.0,False,1
1,0.947974,1,foo,2001-01-02,1.0,False,1
2,-0.285683,1,foo,2001-01-02,1.0,False,1


In [75]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [76]:
dft['A'].dtype

dtype('float64')

In [77]:
pd.Series([1,2,3,6,'food'])

0       1
1       2
2       3
3       6
4    food
dtype: object

In [78]:
df1 = pd.DataFrame(np.random.randn(8,1),columns=['A'],dtype='float32')

In [79]:
df1

Unnamed: 0,A
0,-1.939969
1,0.753133
2,0.811729
3,0.641279
4,0.885499
5,-0.575037
6,0.211648
7,0.661722


In [80]:
df1.dtypes

A    float32
dtype: object

In [81]:
df1 = df1.astype('float64')

In [83]:
df1.dtypes

A    float64
dtype: object

In [84]:
dft1 = pd.DataFrame({'a':[1,0,1], 'b':[4,5,6], 'c': [7,8,9]})

In [85]:
dft1

Unnamed: 0,a,b,c
0,1,4,7
1,0,5,8
2,1,6,9


In [86]:
dft1 = dft1.astype({'a':np.bool,'c':np.float64})

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dft1 = dft1.astype({'a':np.bool,'c':np.float64})


In [87]:
dft1

Unnamed: 0,a,b,c
0,True,4,7.0
1,False,5,8.0
2,True,6,9.0


In [88]:
dft1.dtypes

a       bool
b      int64
c    float64
dtype: object

# Attributes of Pandas Objects

In [90]:
df = pd.DataFrame(np.random.randn(8,3),columns=['A','B','C'])

In [91]:
df

Unnamed: 0,A,B,C
0,-0.959505,0.921555,1.801047
1,1.067066,0.322091,-1.315954
2,-0.702776,-0.952972,-0.14736
3,-0.684396,0.09748,-0.035101
4,-0.260579,0.69186,1.963504
5,-0.182114,-0.663018,-0.68502
6,0.620317,-1.292103,1.449877
7,0.766505,-1.928438,-0.024815


In [92]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,a,b,c
0,-0.959505,0.921555,1.801047
1,1.067066,0.322091,-1.315954
2,-0.702776,-0.952972,-0.14736
3,-0.684396,0.09748,-0.035101
4,-0.260579,0.69186,1.963504
5,-0.182114,-0.663018,-0.68502
6,0.620317,-1.292103,1.449877
7,0.766505,-1.928438,-0.024815


In [93]:
df.a.array

<PandasArray>
[ -0.9595050937748989,   1.0670660211764342,   -0.702775915777827,
  -0.6843960763389084, -0.26057941719445477, -0.18211393465619666,
   0.6203168223649278,   0.7665046573623215]
Length: 8, dtype: float64

In [102]:
data = np.random.randint(0,7,size=50)

In [103]:
data

array([1, 6, 1, 2, 2, 2, 2, 0, 5, 3, 6, 4, 5, 5, 5, 3, 6, 0, 6, 5, 3, 3,
       5, 0, 1, 3, 0, 6, 4, 6, 1, 3, 6, 3, 5, 5, 3, 3, 2, 4, 2, 5, 0, 2,
       6, 5, 5, 3, 4, 4])

In [104]:
s = pd.Series(data)

In [105]:
s.value_counts()

5    11
3    10
6     8
2     7
0     5
4     5
1     4
dtype: int64

In [106]:
s5 = pd.Series([1,1,3,3,3,5,5,7,7,7])

In [107]:
s5.mode()

0    3
1    7
dtype: int64

In [117]:
df5 = pd.DataFrame({"A": np.random.randint(0,7,size=50),
                    "B": np.random.randint(-10,15,size=50)})

In [109]:
df5

Unnamed: 0,A,B
0,4,12
1,2,12
2,6,-8
3,5,4
4,3,1
5,6,2
6,5,7
7,1,8
8,1,5
9,1,9


In [118]:
df5.mode()

Unnamed: 0,A,B
0,4,-2


## Altering Labels

### Reindexing

In [119]:
s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])

In [120]:
s

a   -1.187232
b   -0.330814
c   -0.367848
d    0.185387
e   -0.490026
dtype: float64

In [122]:
s.reindex(['e','b','f','d'])

e   -0.490026
b   -0.330814
f         NaN
d    0.185387
dtype: float64

In [123]:
df = pd.DataFrame({
    'one':pd.Series(np.random.randn(3),index=['a','b','c']),
    'two':pd.Series(np.random.randn(4),index=['a','b','c','d']),
    'three':pd.Series(np.random.randn(3), index=['b','c','d'])})

In [124]:
df

Unnamed: 0,one,two,three
a,-0.581711,0.813558,
b,0.674942,-0.926176,-0.518509
c,1.311002,-0.176288,-0.204686
d,,-0.840606,-0.145001


In [126]:
df.reindex(index=['c','f','b'],columns=['three','two','one'])

Unnamed: 0,three,two,one
c,-0.204686,-0.176288,1.311002
f,,,
b,-0.518509,-0.926176,0.674942


In [129]:
df.reindex(['c','f','b'], axis='index')

Unnamed: 0,one,two,three
c,1.311002,-0.176288,-0.204686
f,,,
b,0.674942,-0.926176,-0.518509


In [130]:
rs = s.reindex(df.index)

In [131]:
rs

a   -1.187232
b   -0.330814
c   -0.367848
d    0.185387
dtype: float64

In [132]:
df

Unnamed: 0,one,two,three
a,-0.581711,0.813558,
b,0.674942,-0.926176,-0.518509
c,1.311002,-0.176288,-0.204686
d,,-0.840606,-0.145001


In [133]:
df.drop(['a','d'],axis=0)

Unnamed: 0,one,two,three
b,0.674942,-0.926176,-0.518509
c,1.311002,-0.176288,-0.204686


In [134]:
df.drop(['one'],axis=1)

Unnamed: 0,two,three
a,0.813558,
b,-0.926176,-0.518509
c,-0.176288,-0.204686
d,-0.840606,-0.145001


### Renaming

In [135]:
s

a   -1.187232
b   -0.330814
c   -0.367848
d    0.185387
e   -0.490026
dtype: float64

In [136]:
s.rename(str.upper)

A   -1.187232
B   -0.330814
C   -0.367848
D    0.185387
E   -0.490026
dtype: float64

In [137]:
df.rename(columns={'one':'foo','two':'bar'},
          index={'a':'apple','b':'banana','d':'durian'})

Unnamed: 0,foo,bar,three
apple,-0.581711,0.813558,
banana,0.674942,-0.926176,-0.518509
c,1.311002,-0.176288,-0.204686
durian,,-0.840606,-0.145001


# .dt and .str accessors

In [144]:
s = pd.Series(pd.date_range('20130102 09:10:12', periods=4))

In [145]:
s

0   2013-01-02 09:10:12
1   2013-01-03 09:10:12
2   2013-01-04 09:10:12
3   2013-01-05 09:10:12
dtype: datetime64[ns]

In [146]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [147]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [148]:
s.dt.day

0    2
1    3
2    4
3    5
dtype: int64

In [149]:
s.dt.dayofweek

0    2
1    3
2    4
3    5
dtype: int64

In [150]:
stz = s.dt.tz_localize('US/Eastern')
stz

0   2013-01-02 09:10:12-05:00
1   2013-01-03 09:10:12-05:00
2   2013-01-04 09:10:12-05:00
3   2013-01-05 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [151]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [152]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-02 04:10:12-05:00
1   2013-01-03 04:10:12-05:00
2   2013-01-04 04:10:12-05:00
3   2013-01-05 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [153]:
s = pd.Series(['A','B','C','Aaba',np.nan,'CABA','dog','cat'], dtype="string")
s.str.lower()

0       a
1       b
2       c
3    aaba
4    <NA>
5    caba
6     dog
7     cat
dtype: string

In [155]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3),index=['a','b','c']),
    'two': pd.Series(np.random.randn(4),index=['a','b','c','d']),
    'three':pd.Series(np.random.randn(3),index=['b','c','d'])})

In [156]:
unsorted_df = df.reindex(index=['a','d','c','b'],columns=['three','two','one'])

In [157]:
unsorted_df

Unnamed: 0,three,two,one
a,,-0.465398,1.308751
d,-0.670301,-0.301405,
c,-0.495758,-0.646941,-0.401404
b,0.896187,0.794148,-1.548517


In [158]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,-0.465398,1.308751
b,0.896187,0.794148,-1.548517
c,-0.495758,-0.646941,-0.401404
d,-0.670301,-0.301405,


In [159]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,-0.670301,-0.301405,
c,-0.495758,-0.646941,-0.401404
b,0.896187,0.794148,-1.548517
a,,-0.465398,1.308751


In [160]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,1.308751,,-0.465398
d,,-0.670301,-0.301405
c,-0.401404,-0.495758,-0.646941
b,-1.548517,0.896187,0.794148


In [161]:
unsorted_df['three'].sort_index()

a         NaN
b    0.896187
c   -0.495758
d   -0.670301
Name: three, dtype: float64

In [163]:
df1 = pd.DataFrame({'one': [2,1,1,1], 'two':[1,3,2,4], 'three': [5,4,3,2]})
df1

Unnamed: 0,one,two,three
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


In [165]:
df1.sort_values(by='two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [167]:
df1[['one','two','three']].sort_values(by=['one','two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [168]:
s[2] = np.nan

In [169]:
s.sort_values()

0       A
3    Aaba
1       B
5    CABA
7     cat
6     dog
2    <NA>
4    <NA>
dtype: string

In [170]:
s.sort_values(na_position='first')

2    <NA>
4    <NA>
0       A
3    Aaba
1       B
5    CABA
7     cat
6     dog
dtype: string

In [172]:
idx = pd.MultiIndex.from_tuples([('a',1), ('a',2),('a',2),('b',2),('b',1),('b',1)])

In [173]:
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           )

In [174]:
idx.names = ['first','second']

In [175]:
df_multi = pd.DataFrame({'A':np.arange(6,0,-1)},index=idx)

In [176]:
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [177]:
df_multi.sort_values(by=['second','A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
