In [1]:
import numpy as np
import pandas as pd

In [2]:
df1 = pd.DataFrame(np.arange(9).reshape((3,3)),
                   columns=['b','c','d'],
                   index=['Mumbai','Pune','Nagpur'])
df1

Unnamed: 0,b,c,d
Mumbai,0,1,2
Pune,3,4,5
Nagpur,6,7,8


In [3]:
df2 = pd.DataFrame(np.arange(12).reshape((4,3)),
                   columns=['b','d','e'],
                   index=['Delhi','Mumbai','Pune','Chennai'])
df2

Unnamed: 0,b,d,e
Delhi,0,1,2
Mumbai,3,4,5
Pune,6,7,8
Chennai,9,10,11


In [4]:
df3 = df1 + df2
df3

Unnamed: 0,b,c,d,e
Chennai,,,,
Delhi,,,,
Mumbai,3.0,,6.0,
Nagpur,,,,
Pune,9.0,,12.0,


In [5]:
df1.add(df2)

Unnamed: 0,b,c,d,e
Chennai,,,,
Delhi,,,,
Mumbai,3.0,,6.0,
Nagpur,,,,
Pune,9.0,,12.0,


In [6]:
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Chennai,9.0,,10.0,11.0
Delhi,0.0,,1.0,2.0
Mumbai,3.0,1.0,6.0,5.0
Nagpur,6.0,7.0,8.0,
Pune,9.0,4.0,12.0,8.0


## Function Application & Mapping

In [9]:
df1 = pd.DataFrame(np.random.randn(4,3),
                   columns=['b','d','e'],
                   index=['Mumbai','Pune','Nagpur','Thane'])
df1

Unnamed: 0,b,d,e
Mumbai,0.067225,0.194721,1.276947
Pune,2.242492,-0.064995,-1.220272
Nagpur,0.233197,-2.091125,-1.314912
Thane,0.59905,-1.348559,0.470903


In [10]:
np.abs(df1)

Unnamed: 0,b,d,e
Mumbai,0.067225,0.194721,1.276947
Pune,2.242492,0.064995,1.220272
Nagpur,0.233197,2.091125,1.314912
Thane,0.59905,1.348559,0.470903


In [11]:
diff = lambda x : x.max() - x.min()    # customise function

In [12]:
df1.apply(diff)

b    2.175267
d    2.285846
e    2.591859
dtype: float64

In [13]:
df1['b'].max() - df1['b'].min()

2.175267246802242

In [14]:
df1.apply(diff, axis='columns')   # doing operation on the basis of rows

Mumbai    1.209721
Pune      3.462764
Nagpur    2.324321
Thane     1.947609
dtype: float64

In [15]:
df1

Unnamed: 0,b,d,e
Mumbai,0.067225,0.194721,1.276947
Pune,2.242492,-0.064995,-1.220272
Nagpur,0.233197,-2.091125,-1.314912
Thane,0.59905,-1.348559,0.470903


In [19]:
def my_diff(x):
    return pd.Series([x.max(),x.min(),x.max()-x.min()],index=['max','min','diff'])

In [20]:
df1.apply(my_diff)

Unnamed: 0,b,d,e
max,2.242492,0.194721,1.276947
min,0.067225,-2.091125,-1.314912
diff,2.175267,2.285846,2.591859


In [21]:
df1.loc['Mumbai'].max()

1.2769466153295301

In [22]:
df1

Unnamed: 0,b,d,e
Mumbai,0.067225,0.194721,1.276947
Pune,2.242492,-0.064995,-1.220272
Nagpur,0.233197,-2.091125,-1.314912
Thane,0.59905,-1.348559,0.470903


In [23]:
df1['e']

Mumbai    1.276947
Pune     -1.220272
Nagpur   -1.314912
Thane     0.470903
Name: e, dtype: float64

In [26]:
format_decimal = lambda x : '%.2f' % x

In [28]:
df1['b'].apply(format_decimal)

Mumbai    0.07
Pune      2.24
Nagpur    0.23
Thane     0.60
Name: b, dtype: object

## Sorting & Ranking

In [33]:
obj = pd.Series(range(4), index=['d','a','b','c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [34]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [35]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),
                    index=['three','one'],
                    columns=['d','a','b','c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [36]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [38]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [39]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [40]:
ser = pd.Series(range(5), index=['a','a','b','b','c'])
ser

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [41]:
ser['a']

a    0
a    1
dtype: int64

In [42]:
ser.index.is_unique

False

## Computing Descriptive Statistics

In [43]:
df = pd.DataFrame([[1.4,np.nan],[7.1,4.5],[np.nan,np.nan],[0.75,-1.3]],
                 index=['a','b','c','d'], columns=['A','B'])
df

Unnamed: 0,A,B
a,1.4,
b,7.1,4.5
c,,
d,0.75,-1.3


In [44]:
df.sum()   # sum of all numbers on basis of column

A    9.25
B    3.20
dtype: float64

In [45]:
df.sum(axis=1)   # on the basis of rows

a     1.40
b    11.60
c     0.00
d    -0.55
dtype: float64

In [46]:
df.mean()

A    3.083333
B    1.600000
dtype: float64

In [47]:
df.mean(axis=1)

a    1.400
b    5.800
c      NaN
d   -0.275
dtype: float64

In [48]:
df.mean(axis=1, skipna=False)

a      NaN
b    5.800
c      NaN
d   -0.275
dtype: float64

In [49]:
df.describe()

Unnamed: 0,A,B
count,3.0,2.0
mean,3.083333,1.6
std,3.493685,4.101219
min,0.75,-1.3
25%,1.075,0.15
50%,1.4,1.6
75%,4.25,3.05
max,7.1,4.5


In [50]:
df.std()

A    3.493685
B    4.101219
dtype: float64

In [51]:
df.std(axis=1)

a         NaN
b    1.838478
c         NaN
d    1.449569
dtype: float64

In [52]:
df.var()

A    12.205833
B    16.820000
dtype: float64

In [53]:
df.mode()

Unnamed: 0,A,B
0,0.75,-1.3
1,1.4,4.5
2,7.1,


In [54]:
df.count()

A    3
B    2
dtype: int64

In [55]:
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [56]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [57]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [58]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [59]:
obj.isin(['c','a'])

0     True
1     True
2    False
3     True
4     True
5    False
6    False
7     True
8     True
dtype: bool

In [60]:
cities = {"name":["mumbai","pune","chennai","delhi"],
          "state":['Maharashtra','Karnataka','Gujrat','Tamil Nadu']}

In [61]:
df = pd.DataFrame(cities, columns=['name','state'])
df

Unnamed: 0,name,state
0,mumbai,Maharashtra
1,pune,Karnataka
2,chennai,Gujrat
3,delhi,Tamil Nadu


In [62]:
df[['state','name']]

Unnamed: 0,state,name
0,Maharashtra,mumbai
1,Karnataka,pune
2,Gujrat,chennai
3,Tamil Nadu,delhi


In [64]:
df['state']

0    Maharashtra
1      Karnataka
2         Gujrat
3     Tamil Nadu
Name: state, dtype: object

In [65]:
df['state'].apply(lambda data : data.upper())

0    MAHARASHTRA
1      KARNATAKA
2         GUJRAT
3     TAMIL NADU
Name: state, dtype: object