# Pandas Basic

In [2]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Series

In [3]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.values

array([ 4,  7, -5,  3])

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj2 = pd.Series([4,7,-5,3],index=['b','d','a','c'])

In [7]:
obj2

b    4
d    7
a   -5
c    3
dtype: int64

In [8]:
obj2.index

Index(['b', 'd', 'a', 'c'], dtype='object')

In [9]:
obj2['a']

-5

In [10]:
obj2['d']=6
obj2

b    4
d    6
a   -5
c    3
dtype: int64

In [11]:
obj2[obj2>0]

b    4
d    6
c    3
dtype: int64

In [12]:
obj2*2

b     8
d    12
a   -10
c     6
dtype: int64

In [13]:
np.exp(obj2)

b     54.598150
d    403.428793
a      0.006738
c     20.085537
dtype: float64

In [14]:
'b' in obj2

True

In [15]:
sdata = {'MH':71000, 'GJ':30000, 'KA':9000,'MP':23000}

#### passing a dict, the index in the resulting Series will have the dict’s keys in sorted order.

In [16]:
obj3 = pd.Series(sdata)
obj3

GJ    30000
KA     9000
MH    71000
MP    23000
dtype: int64

In [17]:
obj3['MH']

71000

In [18]:
states = ['MH', 'GJ', 'KA','MP']

#### override  the dict keys in the order you want them to appear in the resulting Series by passing index:

In [19]:
obj4 = pd.Series(sdata, index=states)
obj4

MH    71000
GJ    30000
KA     9000
MP    23000
dtype: int64

In [20]:
pd.isnull(obj4)

MH    False
GJ    False
KA    False
MP    False
dtype: bool

In [21]:
pd.notnull(obj4)

MH    True
GJ    True
KA    True
MP    True
dtype: bool

In [22]:
states = ['MH', 'GJ', 'KA','MP','RJ']

In [23]:
obj5 = pd.Series(sdata, index=states)
obj5

MH    71000.0
GJ    30000.0
KA     9000.0
MP    23000.0
RJ        NaN
dtype: float64

In [24]:
obj5.isnull()

MH    False
GJ    False
KA    False
MP    False
RJ     True
dtype: bool

In [25]:
obj4+obj5

GJ     60000.0
KA     18000.0
MH    142000.0
MP     46000.0
RJ         NaN
dtype: float64

In [26]:
obj4.name = 'population'

In [27]:
obj4.index.name = 'state'

In [28]:
obj4

state
MH    71000
GJ    30000
KA     9000
MP    23000
Name: population, dtype: int64

In [29]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [30]:
obj.index = ['Kishor','Aaditya','Anushka','Aaradhya']

In [31]:
obj

Kishor      4
Aaditya     7
Anushka    -5
Aaradhya    3
dtype: int64

# DataFrames

####  A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).
#### The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index.

In [32]:
data = {'state':['GJ','GJ','MH','MH','KA','KA'],
       'year':[2011,2012,2013,2014,2015,2016],
       'pop':[1.5,1.7,3.6,4.6,3.3,5.8]}

In [33]:
frame = pd.DataFrame(data)

In [34]:
frame

Unnamed: 0,pop,state,year
0,1.5,GJ,2011
1,1.7,GJ,2012
2,3.6,MH,2013
3,4.6,MH,2014
4,3.3,KA,2015
5,5.8,KA,2016


In [35]:
frame.head()

Unnamed: 0,pop,state,year
0,1.5,GJ,2011
1,1.7,GJ,2012
2,3.6,MH,2013
3,4.6,MH,2014
4,3.3,KA,2015


In [36]:
frame.tail()

Unnamed: 0,pop,state,year
1,1.7,GJ,2012
2,3.6,MH,2013
3,4.6,MH,2014
4,3.3,KA,2015
5,5.8,KA,2016


If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:

In [37]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2011,GJ,1.5
1,2012,GJ,1.7
2,2013,MH,3.6
3,2014,MH,4.6
4,2015,KA,3.3
5,2016,KA,5.8


In [38]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four','five', 'six'])

In [39]:
frame2

Unnamed: 0,year,state,pop,debt
one,2011,GJ,1.5,
two,2012,GJ,1.7,
three,2013,MH,3.6,
four,2014,MH,4.6,
five,2015,KA,3.3,
six,2016,KA,5.8,


In [40]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [41]:
frame2['state']

one      GJ
two      GJ
three    MH
four     MH
five     KA
six      KA
Name: state, dtype: object

In [42]:
frame2.year

one      2011
two      2012
three    2013
four     2014
five     2015
six      2016
Name: year, dtype: int64

In [43]:
frame2.loc['three']

year     2013
state      MH
pop       3.6
debt      NaN
Name: three, dtype: object

In [44]:
frame2['debt']=np.arange(6.)

In [45]:
frame2

Unnamed: 0,year,state,pop,debt
one,2011,GJ,1.5,0.0
two,2012,GJ,1.7,1.0
three,2013,MH,3.6,2.0
four,2014,MH,4.6,3.0
five,2015,KA,3.3,4.0
six,2016,KA,5.8,5.0


In [46]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [47]:
frame2['debt'] = val

In [48]:
frame2

Unnamed: 0,year,state,pop,debt
one,2011,GJ,1.5,
two,2012,GJ,1.7,-1.2
three,2013,MH,3.6,
four,2014,MH,4.6,-1.5
five,2015,KA,3.3,-1.7
six,2016,KA,5.8,


In [49]:
frame2['eastern']= frame2.state=='MH'

In [50]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2011,GJ,1.5,,False
two,2012,GJ,1.7,-1.2,False
three,2013,MH,3.6,,True
four,2014,MH,4.6,-1.5,True
five,2015,KA,3.3,-1.7,False
six,2016,KA,5.8,,False


In [51]:
frame2.columns

Index(['year', 'state', 'pop', 'debt', 'eastern'], dtype='object')

In [52]:
del frame2['eastern']

In [53]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [54]:
frame2.T

Unnamed: 0,one,two,three,four,five,six
year,2011,2012,2013,2014,2015,2016
state,GJ,GJ,MH,MH,KA,KA
pop,1.5,1.7,3.6,4.6,3.3,5.8
debt,,-1.2,,-1.5,-1.7,


If the nested dict is passed to the DataFrame, pandas will interpret the outer dict keys as the columns and the inner keys as the row indices

In [55]:
pop = {'MH':{2013:3.9,2014:4.2},'GJ':{2011:2.2,2012:2.8}}

In [56]:
frame3 = pd.DataFrame(pop)

In [57]:
frame3

Unnamed: 0,GJ,MH
2011,2.2,
2012,2.8,
2013,,3.9
2014,,4.2


In [58]:
frame3.T

Unnamed: 0,2011,2012,2013,2014
GJ,2.2,2.8,,
MH,,,3.9,4.2


In [59]:
pd.DataFrame(pop,index=[2011,2012,2013,2014])

Unnamed: 0,GJ,MH
2011,2.2,
2012,2.8,
2013,,3.9
2014,,4.2


In [60]:
frame2.values

array([[2011, 'GJ', 1.5, nan],
       [2012, 'GJ', 1.7, -1.2],
       [2013, 'MH', 3.6, nan],
       [2014, 'MH', 4.6, -1.5],
       [2015, 'KA', 3.3, -1.7],
       [2016, 'KA', 5.8, nan]], dtype=object)

# Reindexing

In [61]:
obj = pd.Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])

In [62]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [63]:
obj2 = obj.reindex(['a','b','c','d','e'])

In [64]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [65]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [66]:
obj3 = pd.Series(['blue','purple','yellow'], index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [67]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [69]:
frame = pd.DataFrame(np.arange(9).reshape((3,3,)), index=['a','b','c'],
                    columns=['IN','USA','AUS'])

In [70]:
frame

Unnamed: 0,IN,USA,AUS
a,0,1,2
b,3,4,5
c,6,7,8


In [71]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,IN,USA,AUS
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


## Dropping Entries from an Axis

drop method will return a new object with the indicated value or values deleted from
an axis

In [73]:
obj = pd.Series(np.arange(5.),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [74]:
obj.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [76]:
obj.drop(['c','d'])

a    0.0
b    1.0
e    4.0
dtype: float64

# Indexing, Selection and Filtering

In [77]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [78]:
obj['b']

1.0

In [79]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [80]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [81]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [82]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [83]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [84]:
obj['b':'c']=5

In [85]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [94]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['MH','GJ','RJ','KA'],
                   columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
MH,0,1,2,3
GJ,4,5,6,7
RJ,8,9,10,11
KA,12,13,14,15


In [96]:
data['two']

MH     1
GJ     5
RJ     9
KA    13
Name: two, dtype: int64

In [97]:
data[-2:]

Unnamed: 0,one,two,three,four
RJ,8,9,10,11
KA,12,13,14,15


In [99]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
GJ,4,5,6,7
RJ,8,9,10,11
KA,12,13,14,15


In [100]:
data<5

Unnamed: 0,one,two,three,four
MH,True,True,True,True
GJ,True,False,False,False
RJ,False,False,False,False
KA,False,False,False,False


In [101]:
data.loc['MH',['one','three']]

one      0
three    2
Name: MH, dtype: int64

In [103]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: RJ, dtype: int64

In [105]:
data.iloc[[1,2]]

Unnamed: 0,one,two,three,four
GJ,4,5,6,7
RJ,8,9,10,11


##  Arithmetic and Data Alignment

In [106]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])

In [107]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a', 'c', 'e', 'f', 'g'])

In [108]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [109]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [110]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [113]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list('bcd'),index=['IN','JP','RUS'])

In [114]:
df1

Unnamed: 0,b,c,d
IN,0.0,1.0,2.0
JP,3.0,4.0,5.0
RUS,6.0,7.0,8.0


In [115]:
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'),index=['IN','JP','RUS','ENG'])

In [116]:
df2

Unnamed: 0,b,d,e
IN,0.0,1.0,2.0
JP,3.0,4.0,5.0
RUS,6.0,7.0,8.0
ENG,9.0,10.0,11.0


In [117]:
df1+df2

Unnamed: 0,b,c,d,e
ENG,,,,
IN,0.0,,3.0,
JP,6.0,,9.0,
RUS,12.0,,15.0,


## Function Application and Mapping

In [118]:
frame = pd.DataFrame(np.random.randn(4,3), columns=list('bde'), 
                     index=['MH','GJ','PB','KA'])

In [119]:
frame

Unnamed: 0,b,d,e
MH,-0.403794,-0.355298,-0.603931
GJ,0.403126,-0.101063,-0.855654
PB,1.821795,0.224722,1.1702
KA,0.626097,1.188787,-1.365192


In [121]:
np.abs(frame)

Unnamed: 0,b,d,e
MH,0.403794,0.355298,0.603931
GJ,0.403126,0.101063,0.855654
PB,1.821795,0.224722,1.1702
KA,0.626097,1.188787,1.365192


In [122]:
f = lambda x:x.max() - x.min()

In [123]:
frame.apply(f)

b    2.225588
d    1.544085
e    2.535392
dtype: float64

In [124]:
frame.apply(f,axis='columns')

MH    0.248633
GJ    1.258780
PB    1.597073
KA    2.553979
dtype: float64

## Sorting and Ranking

In [125]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [126]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [127]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'])

In [128]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [129]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [130]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [131]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [132]:
obj = pd.Series([4, 7, -3, 2])

In [133]:
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [134]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [135]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [136]:
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [137]:
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [138]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


## Axis Indexes with Duplicate Labels

In [139]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [140]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [141]:
obj.index.is_unique

False

In [142]:
obj['a']

a    0
a    1
dtype: int64

In [143]:
obj['c']

4

In [144]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [145]:
df

Unnamed: 0,0,1,2
a,0.449848,-0.488384,0.654447
a,-1.67571,1.647289,-0.273376
b,-0.72299,0.328864,1.56722
b,0.997093,-2.768942,-1.168857


In [146]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.72299,0.328864,1.56722
b,0.997093,-2.768942,-1.168857


In [147]:
df.sum()

0   -0.951760
1   -1.281174
2    0.779434
dtype: float64

In [148]:
df.idxmax()

0    b
1    a
2    b
dtype: object

In [149]:
df.describe()

Unnamed: 0,0,1,2
count,4.0,4.0,4.0
mean,-0.23794,-0.320293,0.194859
std,1.197332,1.854447,1.179486
min,-1.67571,-2.768942,-1.168857
25%,-0.96117,-1.058523,-0.497246
50%,-0.136571,-0.07976,0.190536
75%,0.586659,0.65847,0.88264
max,0.997093,1.647289,1.56722


In [150]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [151]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [152]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [153]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [154]:
obj.isin(['b','a'])

0    False
1     True
2    False
3     True
4     True
5     True
6     True
7    False
8    False
dtype: bool