# Pandas

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Shared inputs for the Series examples: the same three values as a list,
# a NumPy array, and a label -> value dict.
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]
arr = np.array(my_data)
d = dict(zip(labels, my_data))

In [5]:
# With no index supplied, the Series gets the default integer index 0..n-1.
pd.Series(my_data)

0    10
1    20
2    30
dtype: int64

In [6]:
# Second positional argument of Series is the index; spell it out for clarity.
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [12]:
# Build from the dict; the index argument selects and orders the keys to use.
ser2 = pd.Series(data=d, index=labels)

In [11]:
# Built from the NumPy array with no index, so it is indexed 0, 1, 2.
ser1 = pd.Series(data=arr)

In [13]:
# Arithmetic aligns on index labels. ser1 is indexed 0..2 and ser2 is indexed a..c,
# so no label matches and every entry of the result is NaN.
ser1 + ser2

0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
dtype: float64

## DataFrames

In [21]:
from numpy.random import randn

In [22]:
# Seed NumPy's global random state so the randn() draws below are reproducible.
np.random.seed(101)

In [24]:
# 5x4 frame of randn() draws with row labels a-e and column labels w-z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['w', 'x', 'y', 'z'],
)
df

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [30]:
# A list of column names returns a DataFrame with the columns in the requested order.
df[['w','z','y']]

Unnamed: 0,w,z,y
a,0.302665,-1.159119,-1.706086
b,-0.134841,0.184502,0.166905
c,0.807706,0.329646,0.638787
d,-0.497104,0.484752,-0.943406
e,-0.116773,1.996652,0.238127


In [43]:
# Element-wise sum of columns w and y, stored as a new column.
df['new'] = df['w'].add(df['y'])

In [44]:
df['new']  # the derived column comes back as a Series named 'new'

a   -1.403420
b    0.032064
c    1.446493
d   -1.440510
e    0.121354
Name: new, dtype: float64

In [45]:
df  # 'new' now appears as the last column

Unnamed: 0,w,x,y,z,new
a,0.302665,1.693723,-1.706086,-1.159119,-1.40342
b,-0.134841,0.390528,0.166905,0.184502,0.032064
c,0.807706,0.07296,0.638787,0.329646,1.446493
d,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
e,-0.116773,1.901755,0.238127,1.996652,0.121354


In [46]:
# drop() returns a new frame without row 'e'; df itself keeps the row
# because the result is not assigned back.
df.drop(index='e')

Unnamed: 0,w,x,y,z,new
a,0.302665,1.693723,-1.706086,-1.159119,-1.40342
b,-0.134841,0.390528,0.166905,0.184502,0.032064
c,0.807706,0.07296,0.638787,0.329646,1.446493
d,-0.497104,-0.75407,-0.943406,0.484752,-1.44051


In [48]:
# Remove the derived column. Rebinding the returned frame (rather than
# inplace=True) keeps the step explicit and chainable; columns= replaces axis=1.
df = df.drop(columns='new')


In [49]:
df  # 'new' is gone; row 'e' is still here because the earlier df.drop('e') result was not kept

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [52]:
df['w'] # select a single column with square brackets; the result is a Series

a    0.302665
b   -0.134841
c    0.807706
d   -0.497104
e   -0.116773
Name: w, dtype: float64

In [53]:
df.loc['a'] # .loc selects rows by label; it is an indexer, so use square brackets, not parentheses

w    0.302665
x    1.693723
y   -1.706086
z   -1.159119
Name: a, dtype: float64

In [54]:
df.iloc[4] # .iloc selects by 0-based integer position; position 4 is the row labelled 'e'

w   -0.116773
x    1.901755
y    0.238127
z    1.996652
Name: e, dtype: float64

In [55]:
df.loc['c','w']  # row label, then column label -> a single scalar value

0.8077059142577141

In [56]:
df.loc[['a','b'],['w','x']]  # lists of row and column labels -> a sub-DataFrame

Unnamed: 0,w,x
a,0.302665,1.693723
b,-0.134841,0.390528


## Conditional Selection

In [57]:
df  # the full frame, shown for reference before filtering

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [58]:
df['w']>0  # element-wise comparison -> boolean Series that can be used as a row mask

a     True
b    False
c     True
d    False
e    False
Name: w, dtype: bool

In [66]:
# One .loc call: the boolean mask keeps rows where w is negative,
# the list keeps only columns y and z.
df.loc[df['w'] < 0, ['y', 'z']]

Unnamed: 0,y,z
b,0.166905,0.184502
d,-0.943406,0.484752
e,0.238127,1.996652


In [68]:
# Wrap each condition in its own parentheses: | (and &) bind more tightly than > and <.
# Use | for "or" and & for "and" when combining boolean Series.
df[(df['w']>0) | (df['y']<0)]

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752


In [69]:
# Returns a new frame with the old index moved into an 'index' column and a fresh 0..n-1 index.
# The result is not assigned, so df keeps its a-e index.
df.reset_index()

Unnamed: 0,index,w,x,y,z
0,a,0.302665,1.693723,-1.706086,-1.159119
1,b,-0.134841,0.390528,0.166905,0.184502
2,c,0.807706,0.07296,0.638787,0.329646
3,d,-0.497104,-0.75407,-0.943406,0.484752
4,e,-0.116773,1.901755,0.238127,1.996652


In [70]:
# One value per row, assigned in row order (a -> 'AB', ..., e -> 'EF').
df['states'] = ['AB','BC','CD','DE','EF']

In [71]:
df  # 'states' is a regular column here; the index is still a-e

Unnamed: 0,w,x,y,z,states
a,0.302665,1.693723,-1.706086,-1.159119,AB
b,-0.134841,0.390528,0.166905,0.184502,BC
c,0.807706,0.07296,0.638787,0.329646,CD
d,-0.497104,-0.75407,-0.943406,0.484752,DE
e,-0.116773,1.901755,0.238127,1.996652,EF


In [74]:
# Promote the 'states' column to the row index (the old a-e labels are discarded).
# Rebinding the returned frame instead of inplace=True keeps the step explicit and chainable.
df = df.set_index('states')

In [75]:
df  # 'states' is now the index and the a-e labels are gone

Unnamed: 0_level_0,w,x,y,z
states,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AB,0.302665,1.693723,-1.706086,-1.159119
BC,-0.134841,0.390528,0.166905,0.184502
CD,0.807706,0.07296,0.638787,0.329646
DE,-0.497104,-0.75407,-0.943406,0.484752
EF,-0.116773,1.901755,0.238127,1.996652


## Multiindexing

In [78]:
# Index levels: build a two-level (hierarchical) index by pairing each
# outer group label with its inner number.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [79]:
hier_index  # two-level MultiIndex of (group, number) pairs

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [80]:
# 6x2 frame of randn() draws on the two-level index.
# Note that this rebinds df, replacing the earlier frame.
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.993263,0.1968
G1,2,-1.136645,0.000366
G1,3,1.025984,-0.156598
G2,1,-0.031579,0.649826
G2,2,2.154846,-0.610259
G2,3,-0.755325,-0.346419


In [93]:
# Single .loc lookup: the (outer, inner) tuple picks the row, 'A' picks the column.
# This replaces three chained lookups whose trailing [2] on a Series with an
# integer index is easy to misread as a position; here 2 is the inner label.
df.loc[('G1', 2), 'A']

-1.1366445936091856

In [94]:
# Name the outer and inner index levels, in that order.
df.index.names = ["Groups",'Names']

In [95]:
df  # the index levels are now labelled Groups and Names

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Names,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.993263,0.1968
G1,2,-1.136645,0.000366
G1,3,1.025984,-0.156598
G2,1,-0.031579,0.649826
G2,2,2.154846,-0.610259
G2,3,-0.755325,-0.346419


In [102]:
# Cross-section: rows whose Groups level equals 'G2'; the Groups level is dropped from the result.
df.xs('G2',level ='Groups')

Unnamed: 0_level_0,A,B
Names,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.031579,0.649826
2,2.154846,-0.610259
3,-0.755325,-0.346419


## Missing Data

In [None]:
# Both calls return a new frame and leave df unchanged; only the last
# expression in the cell is displayed.
df.dropna()         # drops rows containing NaN (axis=1 drops columns; thresh=k keeps rows with at least k non-NaN values)
df.fillna(value=0)  # fills missing elements; a fill value is required, bare fillna() raises ValueError

In [103]:
df  # no missing values yet

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Names,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.993263,0.1968
G1,2,-1.136645,0.000366
G1,3,1.025984,-0.156598
G2,1,-0.031579,0.649826
G2,2,2.154846,-0.610259
G2,3,-0.755325,-0.346419


In [106]:
# Set the whole ('G2', 2) row to NaN with one .loc assignment.
# The chained form df.loc['G2'].loc[2] = ... assigns into a temporary object:
# it triggers SettingWithCopyWarning and leaves df untouched under Copy-on-Write.
df.loc[('G2', 2), :] = np.nan

In [110]:
# A scalar fill value is applied to every column, so the NaN in A is also
# replaced by the mean of column B. To fill each column with its own mean,
# pass df.mean() instead.
df.fillna(value=df['B'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Names,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.993263,0.1968
G1,2,-1.136645,0.000366
G1,3,1.025984,-0.156598
G2,1,-0.031579,0.649826
G2,2,0.068795,0.068795
G2,3,-0.755325,-0.346419


In [None]:
# NOTE(review): df1, df2, df3, left and right are not defined in the visible part of
# this notebook; define them (or confirm they exist elsewhere) before running this cell.
pd.concat([df1,df2,df3],axis=1) # concatenates DataFrames; axis=1 joins them along the columns
pd.merge(df1,df2,how='inner',on = 'Key') # merges DataFrames; use `on` to name the common column to merge on
left.join(right) # similar to merge, but joins on the index instead of a column