In [1]:
import pandas as pd
import numpy as np

## Series

In [2]:
# Sample inputs for the Series examples.
labels = list("abc")              # index labels
l1 = [10, 20, 30]                 # values held in a plain Python list
arr = np.array((40, 50, 60))      # values held in a NumPy array
d = dict(x=11, y=22, z=33)        # label -> value mapping

In [3]:
pd.Series(l1,index = labels)

a    10
b    20
c    30
dtype: int64

In [4]:
# Same construction as with the list, but the values come from a NumPy array.
pd.Series(data=arr, index=labels)

a    40
b    50
c    60
dtype: int64

In [5]:
pd.Series(data = labels)

0    a
1    b
2    c
dtype: object

In [6]:
pd.Series([sum,len,print])

0      <built-in function sum>
1      <built-in function len>
2    <built-in function print>
dtype: object

In [7]:
# Integer Series labelled by country code; the labels come from splitting one string.
s1 = pd.Series(data=[1, 2, 3, 4], index="IND US CH RS".split())
s1

IND    1
US     2
CH     3
RS     4
dtype: int64

In [8]:
# Second Series: shares the "IND" and "RS" labels with s1, the other two differ.
s2 = pd.Series(data=[1, 2, 3, 4], index="IND IS TT RS".split())
s2

IND    1
IS     2
TT     3
RS     4
dtype: int64

In [9]:
s1["US"]

2

In [10]:
# Positional access: use .iloc so the integer is never treated as a label.
# Plain s1[1] on a string-labelled Series relies on a positional fallback
# that recent pandas versions deprecate.
s1.iloc[1]

2

In [11]:
s1+s2

CH     NaN
IND    2.0
IS     NaN
RS     8.0
TT     NaN
US     NaN
dtype: float64

## DataFrames

### Part 1

In [12]:
from numpy.random import randn
np.random.seed(101)

In [13]:
# 5x4 frame of standard-normal draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [14]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [15]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [16]:
type(df['W'])

pandas.core.series.Series

In [17]:
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [18]:
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [19]:
df['new'] = df['W'] + df['X']
df['new']

A    3.334983
B    0.331800
C   -1.278046
D   -0.570177
E    2.169552
Name: new, dtype: float64

In [20]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [21]:
# columns='new' is the keyword form of drop('new', axis=1); a new frame is returned.
df.drop(columns='new')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [22]:
# index='E' is the keyword form of drop('E') (rows, axis=0); a new frame is returned.
df.drop(index='E')

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177


In [23]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


`drop` does not modify the DataFrame in place; it returns a new one, so `df` above is unchanged.

In [24]:
# Make the removal stick by rebinding df to the frame that drop() returns.
# This has the same effect on df as drop('new', axis=1, inplace=True), but
# keeps the cell in the explicit "new object from old object" style.
df = df.drop(columns='new')

In [25]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


Label-based selection with `.loc` (both slice endpoints are included)

In [26]:
df.loc["A":"B","X":]  

Unnamed: 0,X,Y,Z
A,0.628133,0.907969,0.503826
B,-0.319318,-0.848077,0.605965


Integer-position-based selection with `.iloc` (the slice end is excluded)

In [27]:
df.iloc[0:2,1:]

Unnamed: 0,X,Y,Z
A,0.628133,0.907969,0.503826
B,-0.319318,-0.848077,0.605965


In [28]:
df.loc["A","Z"]

0.5038257538223936

### Part 2

In [29]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [31]:
df["W"]>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [32]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
 df[df["Z"]<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [34]:
# Filter rows (W > 0) and pick columns X and Y in one .loc call
# instead of chaining two separate [] lookups.
df.loc[df["W"] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


In [35]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [36]:
df[(df['W']>0) & (df['X']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


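Python's `and` / `or` keywords cannot combine boolean Series (pandas raises a "truth value of a Series is ambiguous" error).
Use the element-wise operators instead: `&` in place of `and`, `|` in place of `or`, with each comparison wrapped in parentheses.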

In [37]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
df.reset_index(drop=True)

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [39]:
# Five one-letter colour codes, written out as a list literal.
cols = ['r', 'g', 'b', 'y', 'o']
cols

['r', 'g', 'b', 'y', 'o']

In [40]:
df['Cols'] = cols

In [41]:
df

Unnamed: 0,W,X,Y,Z,Cols
A,2.70685,0.628133,0.907969,0.503826,r
B,0.651118,-0.319318,-0.848077,0.605965,g
C,-2.018168,0.740122,0.528813,-0.589001,b
D,0.188695,-0.758872,-0.933237,0.955057,y
E,0.190794,1.978757,2.605967,0.683509,o


In [42]:
df.set_index('Cols')

Unnamed: 0_level_0,W,X,Y,Z
Cols,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
r,2.70685,0.628133,0.907969,0.503826
g,0.651118,-0.319318,-0.848077,0.605965
b,-2.018168,0.740122,0.528813,-0.589001
y,0.188695,-0.758872,-0.933237,0.955057
o,0.190794,1.978757,2.605967,0.683509


In [43]:
df

Unnamed: 0,W,X,Y,Z,Cols
A,2.70685,0.628133,0.907969,0.503826,r
B,0.651118,-0.319318,-0.848077,0.605965,g
C,-2.018168,0.740122,0.528813,-0.589001,b
D,0.188695,-0.758872,-0.933237,0.955057,y
E,0.190794,1.978757,2.605967,0.683509,o


### Part 3

In [44]:
# Two-level row labels: an outer group name paired with an inner number.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the two lists element-wise into (group, number) tuples and build the index.
heir_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [45]:
heir_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [46]:
list(zip(outside,inside))

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [47]:
heir_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [48]:
# 6x2 frame of standard-normal draws, using the two-level index for its rows.
df = pd.DataFrame(
    data=randn(6, 2),
    index=heir_index,
    columns=['A', 'B'],
)

In [49]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [50]:
df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [51]:
df.index.names

FrozenList([None, None])

In [52]:
df.index.names = ['Groups','Num']

In [53]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [54]:
df.loc['G2'].iloc[1]['B']

0.07295967531703869

In [55]:
# One label-based lookup: the row key is the (Groups, Num) tuple, the column is 'B'.
# Returns the same scalar as chaining .loc['G2'].loc[2]['B'], without building
# the intermediate sub-frame and row Series.
df.loc[('G2', 2), 'B']

0.07295967531703869

In [56]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [57]:
df.xs(2,level = 'Num') # cross-section: rows whose 'Num' index level equals 2 (one per group)

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-1.706086,-1.159119
G2,0.807706,0.07296


## Missing Data

In [58]:
d = {'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[4,5,6]}

In [59]:
df = pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,4
1,2.0,,5
2,,,6


In [60]:
df.dropna() 

Unnamed: 0,A,B,C
0,1.0,5.0,4


In [61]:
df.dropna(axis = 1)

Unnamed: 0,C
0,4
1,5
2,6


In [62]:
df.dropna(how='all')

Unnamed: 0,A,B,C
0,1.0,5.0,4
1,2.0,,5
2,,,6


In [63]:
df.dropna(how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,4


In [64]:
df.dropna(thresh = 2) # keep only the rows that have at least 2 non-NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,4
1,2.0,,5


In [65]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,4
1,2.0,,5
2,,,6


In [66]:
df.fillna('Fill value')

Unnamed: 0,A,B,C
0,1,5,4
1,2,Fill value,5
2,Fill value,Fill value,6


In [67]:
df['A'].fillna(value = df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64