# Pandas Essential Functionalities
## Reindex - How To Reindex Pandas Objects

In [1]:
# pd.Series.reindex?
# pd.DataFrame.reindex?

In [2]:
import pandas as pd
import numpy as np
ob = pd.Series([1, 2, 3, 6], index=['d', 'b', 'a', 'c'])

print(ob)
print(ob.reindex(index=['a', 'b', 'c', 'd']))      #int
print(ob.reindex(index=['a', 'b', 'c', 'd', 'e'])) #NaN -> float

d    1
b    2
a    3
c    6
dtype: int64
a    3
b    2
c    6
d    1
dtype: int64
a    3.0
b    2.0
c    6.0
d    1.0
e    NaN
dtype: float64


In [3]:
ob2 = pd.Series([1, 2, 3], index = [0, 1, 2])

print(ob2)
print(ob2.reindex(index=np.arange(6)))
print(ob2.reindex(index=np.arange(6), method = 'ffill'))

0    1
1    2
2    3
dtype: int64
0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    NaN
dtype: float64
0    1
1    2
2    3
3    3
4    3
5    3
dtype: int64


In [4]:
ob3 = pd.DataFrame(np.arange(9).reshape((3, 3)), 
                  index=['a', 'c', 'd'], columns=['Andhra', 'Tamilnadu', 'Kerala'])
print(ob3)
print(ob3.reindex(index=['a', 'b', 'c', 'd']))
print(ob3.reindex(columns=['Andhra', 'Telangana', 'Tamilnadu', 'Kerala']))

   Andhra  Tamilnadu  Kerala
a       0          1       2
c       3          4       5
d       6          7       8
   Andhra  Tamilnadu  Kerala
a     0.0        1.0     2.0
b     NaN        NaN     NaN
c     3.0        4.0     5.0
d     6.0        7.0     8.0
   Andhra  Telangana  Tamilnadu  Kerala
a       0        NaN          1       2
c       3        NaN          4       5
d       6        NaN          7       8


## Drop - Droping Entries From an Axis

In [5]:
# pd.Series.drop?
# pd.DataFrame.drop?

In [6]:
import pandas as pd
import numpy as np

data = pd.Series(np.arange(6), index=['a', 'b', 'c', 'd', 'e', 'f'])

print(data)
print(data.drop('a'))
print(data.drop(['a', 'd']))

a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64
b    1
c    2
d    3
e    4
f    5
dtype: int64
b    1
c    2
e    4
f    5
dtype: int64


In [7]:
dataframe = pd.DataFrame(np.arange(16).reshape((4, 4)), 
                  index=['a', 'b', 'd', 'e'], columns=['Karnataka', 'Andhra', 'Tamilnadu', 'Kerala'])
print(dataframe)
print(dataframe.drop(['a', 'e']))
print(dataframe.drop('Kerala', axis=1))
print(dataframe.drop(['Kerala', 'Andhra'], axis=1))
print(dataframe.drop(['Kerala', 'Tamilnadu'], axis='columns'))
print(dataframe.drop(['Kerala', 'Andhra'], axis=1, inplace=True))

   Karnataka  Andhra  Tamilnadu  Kerala
a          0       1          2       3
b          4       5          6       7
d          8       9         10      11
e         12      13         14      15
   Karnataka  Andhra  Tamilnadu  Kerala
b          4       5          6       7
d          8       9         10      11
   Karnataka  Andhra  Tamilnadu
a          0       1          2
b          4       5          6
d          8       9         10
e         12      13         14
   Karnataka  Tamilnadu
a          0          2
b          4          6
d          8         10
e         12         14
   Karnataka  Andhra
a          0       1
b          4       5
d          8       9
e         12      13
None


## Aggregation - Arithmetic and Data Alignment

In [8]:
# pd.DataFrame.add?

In [9]:
import pandas as pd 
import numpy as np

ser1 = pd.Series([7, 5, 4, 1], index=['a', 'c', 'd', 'e'])
ser2 = pd.Series([7, 5, 4, 1, 3], index=['a', 'c', 'e', 'f', 'g'])
ser1 + ser2

a    14.0
c    10.0
d     NaN
e     5.0
f     NaN
g     NaN
dtype: float64

In [10]:
df1 = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=['a', 'c', 'd'], index=['Andhra', 'Tamilnadu', 'Kerala'])
df2 = pd.DataFrame(np.arange(16).reshape((4, 4)), columns=['a', 'b', 'd', 'e'], index=['Karnataka', 'Andhra', 'Tamilnadu', 'Kerala'])
df1 + df2

Unnamed: 0,a,b,c,d,e
Andhra,4.0,,,8.0,
Karnataka,,,,,
Kerala,18.0,,,22.0,
Tamilnadu,11.0,,,15.0,


In [11]:
df3 = pd.DataFrame({'A': [1, 2]})
df4 = pd.DataFrame({'B': [3, 4]})
df3 + df4

Unnamed: 0,A,B
0,,
1,,


In [12]:
import pandas as pd
import numpy as np

df5 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'))
df6 = pd.DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))
df5 + df6

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [13]:
df5.add(df6, fill_value=0)
# df5.add(df6, fill_value=10)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [14]:
df5 + 10
# df5 - 3
# df5 * 2
# 1/df5

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21


In [15]:
df5.radd(10)
# df5.rdiv(1)
# df5.rmul(2)
# df5.rpow(2)

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21


## Operations Between DataFrame and Series

In [16]:
import pandas as pd
import numpy as np
df7 = pd.DataFrame(np.arange(12.).reshape((4, 3)),  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
ds1 = df7.iloc[0]
df7 - ds1

Unnamed: 0,b,d,e
One,0.0,0.0,0.0
Two,3.0,3.0,3.0
Three,6.0,6.0,6.0
Four,9.0,9.0,9.0


In [17]:
ds2 = pd.Series(range(3), index=['b', 'e', 'f'])
df7 + ser2

Unnamed: 0,a,b,c,d,e,f,g
One,,,,,6.0,,
Two,,,,,9.0,,
Three,,,,,12.0,,
Four,,,,,15.0,,


In [18]:
df7.sub(df7, axis='index')

Unnamed: 0,b,d,e
One,0.0,0.0,0.0
Two,0.0,0.0,0.0
Three,0.0,0.0,0.0
Four,0.0,0.0,0.0


## lambda - Function Application and Mapping

In [19]:
# pd.DataFrame.apply?
# pd.DataFrame.applymap?

In [20]:
import pandas as pd
import numpy as np
df8 = pd.DataFrame(np.random.randn(4, 3),  
                  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
df8

Unnamed: 0,b,d,e
One,1.381425,-1.818751,0.186003
Two,-0.92108,-0.68949,0.010357
Three,-0.690672,0.887023,2.275354
Four,0.611356,-2.731372,-0.047612


In [21]:
abs(df8)

Unnamed: 0,b,d,e
One,1.381425,1.818751,0.186003
Two,0.92108,0.68949,0.010357
Three,0.690672,0.887023,2.275354
Four,0.611356,2.731372,0.047612


In [22]:
f = lambda x: x.max()
#f = lambda x: x.min()
#f = lambda x: x.max() - x.min()
df8.apply(f)

b    1.381425
d    0.887023
e    2.275354
dtype: float64

In [23]:
f = lambda x: x.max()
#f = lambda x: x.max() - x.min()
df8.apply(f, axis='columns')
df8.apply(f, axis='columns')

One      1.381425
Two      0.010357
Three    2.275354
Four     0.611356
dtype: float64

In [24]:
def f(x):  return pd.Series([x.max(), x.min(), x.mean()], index=['max', 'min', 'mean'])
df8.apply(f)

Unnamed: 0,b,d,e
max,1.381425,0.887023,2.275354
min,-0.92108,-2.731372,-0.047612
mean,0.095257,-1.088148,0.606025


In [25]:
f = lambda x: '%.3f' %x
df8.applymap(f)

Unnamed: 0,b,d,e
One,1.381,-1.819,0.186
Two,-0.921,-0.689,0.01
Three,-0.691,0.887,2.275
Four,0.611,-2.731,-0.048


## sort - Sorting and Ranking

In [26]:
# pd.DataFrame.sort_index?
# pd.DataFrame.rank?
# pd.DataFrame.sort_values?

In [27]:
import pandas as pd
import numpy as np

series = pd.Series(range(6), index=['d', 'a', 'b', 'c', 'f', 'g'])
series.sort_index(axis=0, level=None, ascending=True)

a    1
b    2
c    3
d    0
f    4
g    5
dtype: int64

In [28]:
df8 = pd.DataFrame(np.random.randn(4, 3),  
                  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
df8.rank()
# df8.rank(axis='columns')
# df8.loc[:, 'b']

Unnamed: 0,b,d,e
One,3.0,2.0,4.0
Two,4.0,3.0,1.0
Three,1.0,1.0,3.0
Four,2.0,4.0,2.0


In [29]:
df9 = pd.DataFrame(np.random.randn(4, 5),  
                   columns=list('bdeac'), index=['1', '3', '2', '4'])
df9.sort_index(axis=1, level=None, ascending=True)
# df9.sort_index(axis=1, level=None, ascending=False)
# df9.sort_index(axis=0, level=None, ascending=True)
# df9.sort_values(by=['b'])
# df9.sort_values(by=['d'])

Unnamed: 0,a,b,c,d,e
1,0.145402,-0.411877,-0.096924,0.415244,0.130511
3,-0.354809,-5.5e-05,-0.38367,-0.990748,-0.908245
2,-0.529861,1.024043,-0.973033,-0.472369,-0.392479
4,0.580602,0.42358,-0.006799,-0.679914,-0.405907


In [30]:
df10 =pd.DataFrame(np.arange(12).reshape((3, 4)), 
                   index=['1', '3', '2'], columns=['d', 'a', 'b', 'c'])
df10.sort_index(axis='index', level=None, ascending=True)

Unnamed: 0,d,a,b,c
1,0,1,2,3
2,8,9,10,11
3,4,5,6,7


## is_unique - Axis Indexes with Duplicate Labels

In [31]:
# pd.Index.is_unique?

In [32]:
import pandas as pd
import numpy as np
di_s = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
di_s['a']
# di_s['b']

a    0
a    1
dtype: int64

In [33]:
di_s.index.is_unique

False

In [34]:
df11 = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df11.loc['a']

Unnamed: 0,0,1,2
a,-0.274654,-0.983588,0.267194
a,-0.872443,0.820166,0.382975


In [35]:
df11.index.is_unique

False

## describe - How to Summarise and compute Descriptive Statistics?

In [36]:
# pd.Series.describe?

In [37]:
import pandas as pd
import numpy as np
df12 = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], 
                    index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df12

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [38]:
df12.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [39]:
df12.sum()
# df12.sum(axis='columns')
# df12.mean(axis='columns', skipna=False)
# df12.idxmax()
# df12.cumsum()

one    9.25
two   -5.80
dtype: float64

## unique - Unique Values, Value Counts, and Membership

In [40]:
import pandas as pd
ser_u = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
ser_u

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [41]:
ser_u.unique()
# ser_u.value_counts()
# ser_u[ser_u.isin(['d', 'c'])]

array(['c', 'a', 'd', 'b'], dtype=object)

In [42]:
n_dist = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
dist = pd.Series(['c', 'b', 'a'])

pd.Index(dist).get_indexer(n_dist)

array([0, 2, 1, 1, 0, 2])

In [43]:
df13 = pd.DataFrame({'Qu1': [1, 3, 4, 3], 'Qu2': [2, 3, 1, 2], 'Qu3': [1, 5, 2, 4]})
histogram = df13.apply(pd.value_counts)
histogram

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,1.0,
4,1.0,,1.0
5,,,1.0


In [44]:
histogram = df13.apply(pd.value_counts).fillna(0)
histogram

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,1.0,0.0
4,1.0,0.0,1.0
5,0.0,0.0,1.0
