# Pandas

## Pandas is an open source library built on top of Numpy 
## It allows for fast analysis and data cleaning and preparation 
## It excels in performance and productivity 
## It also has built-in visualization features 
## It can work with data from a wide variety of sources 


## Things to learn
--------------------------
### Series 
### Dataframes
### Missing Data
### groupby
### Merging, Joining and Concatenating 
### Operations 
### Data Input and Output 

In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
# Shared sample inputs for the Series examples:
# a plain list, matching labels, a NumPy array and a label -> value dict.
my_data = [10, 20, 30]
labels = list('abc')
arr = np.array(my_data)
d = dict(zip(labels, my_data))


In [3]:
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [4]:
# can be indexed 
pd.Series(data=my_data,index=labels)

a    10
b    20
c    30
dtype: int64

In [5]:
# also 
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [6]:
# to create Series , you can pass a numpy array

pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [7]:
#other way is using dictionary
d

{'a': 10, 'b': 20, 'c': 30}

In [8]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [9]:
#Series can hold functions as well 
pd.Series(data=[sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [10]:
# Positional order is (data, index); keywords spell that out explicitly.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=["USA", "GERMANY", "JAPAN", "RUSSIA"],
)

In [11]:
ser1

USA        1
GERMANY    2
JAPAN      3
RUSSIA     4
dtype: int64

In [12]:
# Same values as ser1, but the last label is ITALY instead of RUSSIA.
ser2 = pd.Series(
    data=[1, 2, 3, 4],
    index=["USA", "GERMANY", "JAPAN", "ITALY"],
)

In [13]:
ser2

USA        1
GERMANY    2
JAPAN      3
ITALY      4
dtype: int64

In [14]:
ser1['USA']

1

In [15]:
# Addition aligns on index labels; a label missing from either Series gives NaN, so the integers are converted to float
ser1 + ser2

GERMANY    4.0
ITALY      NaN
JAPAN      6.0
RUSSIA     NaN
USA        2.0
dtype: float64

## Dataframes

In [16]:
import numpy as np 
import pandas as pd 
from numpy.random import randn



In [17]:
#constant random data 
np.random.seed(101)

In [18]:
# DataFrame(data, index=<row labels>, columns=<column labels>)
row_labels = list('ABCDE')
col_labels = list('WXYZ')
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=col_labels)

In [19]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [20]:
# Selecting one column returns a Series, so a DataFrame is a bunch of Series sharing the same index
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [21]:
type(df['W'])

pandas.core.series.Series

In [22]:
type(df)

pandas.core.frame.DataFrame

In [23]:
# another way but prefer df['W']
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [24]:
#display 2 columns 
df[['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965
C,0.740122,-0.589001
D,-0.758872,0.955057
E,1.978757,0.683509


In [25]:
# creating new column
df['new'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [26]:
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
# but the column is not actually removed: drop() returns a new DataFrame and leaves df untouched, to protect the data
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [28]:
# to delete the column from df itself, pass inplace=True
df.drop('new',axis=1,inplace=True)

In [29]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
df.shape

(5, 4)

In [31]:
df[['Z','X']]

Unnamed: 0,Z,X
A,0.503826,0.628133
B,0.605965,-0.319318
C,-0.589001,0.740122
D,0.955057,-0.758872
E,0.683509,1.978757


In [32]:
# How about selecting rows? That also returns a Series.

#method 1
df.loc['A']


W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [33]:
# method 2: use iloc and give the integer row position

df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [34]:
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [35]:
# a specific value

df.loc['B','Y']

-0.8480769834036315

In [36]:
df.loc[['B','C'],['Y','Z']]

Unnamed: 0,Y,Z
B,-0.848077,0.605965
C,0.528813,-0.589001


In [37]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [39]:
# conditional selection 
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [40]:
booldf = df > 0
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [41]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [42]:
# also can be done like 
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [43]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [44]:
# keeps only the rows where W > 0, so row C (False) is removed
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [45]:
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [46]:
df[df['X']<0]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [47]:
df[df['X']<0][['X','Y']]

Unnamed: 0,X,Y
B,-0.319318,-0.848077
D,-0.758872,-0.933237


In [48]:
# multiple conditions: Python's `and` does not work on Series (it raises the ValueError shown below), so don't use `and`
df[df['W']>0 and df['Y']<0  ]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# multiple conditions: use & instead, with parentheses () around each condition
df[(df['W']>0) & (df['Y']<1)  ]

In [None]:
# multiple conditions where either may hold: use | with parentheses () around each condition
df[(df['W']>0) | (df['Z']<1)  ]

In [None]:
df

In [None]:
#convert to numerical index
df.reset_index()

In [None]:
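# reset_index() is not in-place: it returns a new DataFrame, so df itself is unchanged unless you pass inplace=True or reassign the result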
df.reset_index()
df

In [49]:
# New index labels, one state code per existing row.
newind = ['CA', 'NY', 'WY', 'OR', 'CO']
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [50]:
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [51]:
# changing index
df.set_index('States')


Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [52]:
# but the change is not reflected in df unless you use inplace=True
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [53]:
# changing index with inplace
df.set_index('States',inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [54]:
# DataFrames with multi-level indexes



In [55]:
# Index levels: an outer group label paired with an inner number.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the levels element-wise and wrap the pairs in a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [56]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [57]:
list(zip(outside,inside)) # creates a list of (outside, inside) tuples

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [58]:
df = pd.DataFrame(randn(6,2),hier_index,['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [59]:
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [60]:
df.loc['G2']

Unnamed: 0,A,B
1,0.166905,0.184502
2,0.807706,0.07296
3,0.638787,0.329646


In [61]:
df.loc['G1']['A']

1    0.302665
2   -1.706086
3   -0.134841
Name: A, dtype: float64

In [62]:
df.loc['G1'].iloc[0]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [63]:
# check names of multi level index 
df.index.names

FrozenList([None, None])

In [64]:
# Assign a list 
df.index.names = ['Groups','Num']

In [65]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [66]:
df.loc['G2'].loc[2]['B']

0.07295967531703869

In [67]:
df.loc['G1'].loc[3]['A']

-0.13484072060601238

In [68]:
# cross section: xs is a method; without parentheses this cell only displays the bound method
df.xs


<bound method NDFrame.xs of                    A         B
Groups Num                    
G1     1    0.302665  1.693723
       2   -1.706086 -1.159119
       3   -0.134841  0.390528
G2     1    0.166905  0.184502
       2    0.807706  0.072960
       3    0.638787  0.329646>

In [69]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [70]:
df.xs(1,level="Num")

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502


### Missing Data

In [75]:
# Three columns with progressively fewer missing values (B has two NaN, A one, C none).
diction = dict(
    A=[1, 2, np.nan],
    B=[5, np.nan, np.nan],
    C=[1, 2, 3],
)

In [76]:
df = pd.DataFrame(diction)

In [77]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [80]:
df.dropna()


Unnamed: 0,A,B,C
0,1.0,5.0,1


In [81]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [83]:
# dropna() returns a new DataFrame, so none of this is reflected in df itself
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [84]:
# thresh sets the minimum number of non-NaN values a row needs in order to be kept
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [85]:

df.dropna(axis=1,thresh=2)

Unnamed: 0,A,C
0,1.0,1
1,2.0,2
2,,3


In [86]:
df.dropna(axis=1,thresh=1)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [87]:
### Replace Missing values 
df.fillna(value=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3


In [93]:
# you can also fill missing values with a statistic such as the mean or mode
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64