In [5]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one;
# the cells below that show several outputs depend on this setting.
InteractiveShell.ast_node_interactivity = "all"

In [6]:
import numpy as np
import pandas as pd

In [7]:
from numpy.random import randn
# Fix the global NumPy seed: anyone who uses the same seed then sees the same "random" numbers.
np.random.seed(101)

In [8]:
# The same five values held in several containers; used below to build Series objects.
lab = list('abcde')                    # index labels 'a'..'e'
arr = [10 * k for k in range(1, 6)]    # 10, 20, 30, 40, 50
np_arr = np.asarray(arr)
dict_arr = dict(zip(lab, arr))         # label -> value, in label order

In [9]:
# No index is supplied, so the values are labelled with a default 0..n-1 integer index.
pd.Series(data=arr)

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [10]:
# Built from a dict: the keys become the index labels and the values become the data.
pd.Series(dict_arr)

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [11]:
# Positional form: data first, index second. The index must contain exactly as many
# labels as there are data values.
pd.Series(arr,lab)

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [12]:
# 5x5 frame of random draws: rows labelled A-E, columns named 'one'..'five'.
df = pd.DataFrame(
    data=randn(5, 5),
    index=list('ABCDE'),
    columns=['one', 'two', 'three', 'four', 'five'],
)

In [13]:
# Show the whole frame; at 5x5 the full display stays readable.
df

Unnamed: 0,one,two,three,four,five
A,2.70685,0.628133,0.907969,0.503826,0.651118
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237
D,0.955057,0.190794,1.978757,2.605967,0.683509
E,0.302665,1.693723,-1.706086,-1.159119,-0.134841


In [14]:
# A DataFrame is a collection of Series that share one index; selecting a single
# column label returns that Series.
df['two']
# df.two works too, but it is not a good habit: a column name can clash with an
# existing DataFrame attribute or method.

A    0.628133
B   -0.848077
C   -0.589001
D    0.190794
E    1.693723
Name: two, dtype: float64

In [15]:
# A new column is created by plain assignment, as if it already existed, and it can be
# any arithmetic combination of the columns that are already defined.
df['sum'] = df['one'] + df['two'] + df['three'] + df['four'] + df['five']
df

Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084
E,0.302665,1.693723,-1.706086,-1.159119,-0.134841,-1.003658


In [16]:
# drop() removes (deletes) labels from a frame.
# df.drop('sum') on its own raises an error: the default is axis=0, which looks for
# the label in the row index, and 'sum' is a column. To drop a column, pass axis=1
# explicitly.

df.drop('sum', axis=1) # Displays the frame without the 'sum' column.
df.drop('E') # The default axis=0 works for row labels: this removes row 'E'.

Unnamed: 0,one,two,three,four,five
A,2.70685,0.628133,0.907969,0.503826,0.651118
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237
D,0.955057,0.190794,1.978757,2.605967,0.683509
E,0.302665,1.693723,-1.706086,-1.159119,-0.134841


Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


In [17]:
df # Still shows row 'E' and column 'sum': the drop() calls above did not change df.
# Why? So data is not lost by accident.
# Pass inplace=True (the boolean, not the string 'True') to modify df itself.
df.shape # Still (5, 6), so no deletion took place.

Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084
E,0.302665,1.693723,-1.706086,-1.159119,-0.134841,-1.003658


(5, 6)

In [18]:
# Remove row 'E' from df itself (inplace=True mutates df instead of returning a copy).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because 'E' is already gone.
df.drop('E', inplace=True, errors='ignore')

In [19]:
df # Row 'E' is gone; the 'sum' column is still present.
df.shape # Now (4, 6): one row fewer than before.

Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


(4, 6)

In [20]:
# Ways to grab rows.
# Two methods: by index label (.loc) or by integer position (.iloc).
df.loc['C'] # Label-based: the row whose index label is 'C', returned as a Series.

one      0.528813
two     -0.589001
three    0.188695
four    -0.758872
five    -0.933237
sum     -1.563601
Name: C, dtype: float64

In [21]:
# Position-based: positions start at 0, so 2 is the third row, which is row 'C' again.
df.iloc[2]

one      0.528813
two     -0.589001
three    0.188695
four    -0.758872
five    -0.933237
sum     -1.563601
Name: C, dtype: float64

In [22]:
# Ways to grab a single value, or a sub-frame, by giving both a row and a column selector.
df.loc['C','three'] # Scalar by labels.
df.iloc[2,2] # The same scalar by integer positions.
df.loc[['A','C'],['three','sum']] # Lists of labels return a smaller DataFrame.

0.18869530944922425

0.18869530944922425

Unnamed: 0,three,sum
A,0.907969,5.397896
C,0.188695,-1.563601


In [23]:
df
# Boolean mask over the whole frame: the shape is kept and every cell where the
# condition is False is shown as NaN.
df[df>0]

Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,,,0.605965,,0.740122,
C,0.528813,,0.188695,,,
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


In [24]:
# Boolean mask built from one column: only the rows where it is True are returned,
# so rows are filtered out instead of being filled with NaN.
df[df['four'] > 0]

Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


In [25]:
# Rows where 'four' is positive, narrowed to chosen columns in a single .loc lookup.
df.loc[df['four'] > 0, 'one'] # column 'one' only -> Series
df.loc[df['four'] > 0, ['one', 'two']] # columns 'one' and 'two' -> DataFrame

A    2.706850
D    0.955057
Name: one, dtype: float64

Unnamed: 0,one,two
A,2.70685,0.628133
D,0.955057,0.190794


In [26]:
df
# df[(df['one']>0) and (df['two']>0)] does not work: Python's `and` operates on single
# boolean values, not on a whole Series of booleans. Use the element-wise operator `&`
# instead, with each condition wrapped in parentheses.
df[(df['one']>0) & (df['two']>0)] # Rows where both 'one' and 'two' are positive.
# Likewise, use | instead of `or`.

Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


Unnamed: 0,one,two,three,four,five,sum
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


In [27]:
# reset_index() turns the existing index into a regular column named 'index' and
# creates a fresh integer index starting from zero. The result is a new frame; df
# itself only changes if inplace=True is passed. To discard the old index instead of
# keeping it as a column, pass drop=True. (Shift+Tab inside the parentheses shows
# the full signature.)
df.reset_index()

Unnamed: 0,index,one,two,three,four,five,sum
0,A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
1,B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
2,C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
3,D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


In [28]:
indexes = ['start', 'first', 'second', 'end']
df['state'] = indexes # One label per remaining row (df has four rows at this point).
df
# set_index() makes an existing column the index; the previous index labels do not
# appear in the result. The result is a new frame; pass inplace=True to keep the new
# index on df itself.
df.set_index('state')

Unnamed: 0,one,two,three,four,five,sum,state
A,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896,start
B,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476,first
C,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601,second
D,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084,end


Unnamed: 0_level_0,one,two,three,four,five,sum
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
start,2.70685,0.628133,0.907969,0.503826,0.651118,5.397896
first,-0.319318,-0.848077,0.605965,-2.018168,0.740122,-1.839476
second,0.528813,-0.589001,0.188695,-0.758872,-0.933237,-1.563601
end,0.955057,0.190794,1.978757,2.605967,0.683509,6.414084


In [29]:
# Build a frame with a two-level (hierarchical) row index.
alpha = ['R1'] * 3 + ['R2'] * 3    # outer level: three rows per group
num = [1, 2, 3] * 2                # inner level: 1..3 inside each group
# Pair the two lists element-wise into (group, sub-group) tuples and turn them
# into a MultiIndex.
alphanum = pd.MultiIndex.from_tuples(list(zip(alpha, num)))
Df = pd.DataFrame(
    data=randn(6, 3),
    index=alphanum,
    columns=['First', 'Second', 'Third'],
)
Df.index.names = ['Group', 'Sub-Group']  # name the two index levels
Df

Unnamed: 0_level_0,Unnamed: 1_level_0,First,Second,Third
Group,Sub-Group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R1,1,0.390528,0.166905,0.184502
R1,2,0.807706,0.07296,0.638787
R1,3,0.329646,-0.497104,-0.75407
R2,1,-0.943406,0.484752,-0.116773
R2,2,1.901755,0.238127,1.996652
R2,3,-0.993263,0.1968,-1.136645


In [30]:
# How to grab data from a multi-indexed DataFrame: an outer-level label returns all of
# its rows, indexed by the remaining inner level.
Df.loc['R1']

Unnamed: 0_level_0,First,Second,Third
Sub-Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.390528,0.166905,0.184502
2,0.807706,0.07296,0.638787
3,0.329646,-0.497104,-0.75407


In [31]:
# Chain a second .loc on the inner level to get one row of group 'R1' as a Series.
Df.loc['R1'].loc[1]

First     0.390528
Second    0.166905
Third     0.184502
Name: 1, dtype: float64

In [32]:
# One value: row (Group 'R1', Sub-Group 1), column 'Second', in a single .loc lookup.
Df.loc[('R1', 1), 'Second']

0.16690463609281317

# Handling missing data

In [41]:
# Small frame with gaps: column 'B' has no value in rows 1 and 2.
d = {
    'A': [5, 2, 1],
    'B': [2, np.nan, np.nan],
    'C': [3, 1, 8],
}
DF = pd.DataFrame(data=d)
DF.dropna(axis=0)  # drop every row that contains at least one NaN
# DF.dropna(axis=1) drops the columns that contain NaN instead.
# thresh=2 keeps only the rows (or columns) that have at least 2 non-NaN values.

Unnamed: 0,A,B,C
0,5,2.0,3


In [43]:
DF
# Fill the gaps with the number 0 rather than the string '00': a string placeholder
# mixes text into a numeric column and gets in the way of later arithmetic on it.
# fillna() returns a new frame; DF itself is left unchanged.
DF.fillna(value=0)

Unnamed: 0,A,B,C
0,5,2.0,3
1,2,,1
2,1,,8


Unnamed: 0,A,B,C
0,5,2,3
1,2,0,1
2,1,0,8


In [46]:
DF
# Fill the gaps in 'B' with the column mean. mean() skips NaN, so here it is 2.0 (the
# only known value). The result is a new Series; DF is not modified.
DF['B'].fillna(value = DF['B'].mean())

Unnamed: 0,A,B,C
0,5,2.0,3
1,2,,1
2,1,,8


0    2.0
1    2.0
2    2.0
Name: B, dtype: float64

#  GROUPBY