# Pandas DataFrame 3

In [1]:
import numpy as np
import pandas as pd

from numpy.random import randn

In [2]:
# Index Levels
#
# Two parallel label lists: `outside` holds the outer (group) label of each
# row and `inside` holds the inner (number) label of the same row.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]

# Pair the labels position by position, then turn the list of
# (outer, inner) tuples into a two-level MultiIndex.
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

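We used the line `hier_index = list(zip(outside,inside))` to pair each outer label with its inner label as a tuple.
`pd.MultiIndex.from_tuples` then turns that list of tuples into a two-level index.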

In [3]:
list(zip(outside,inside))

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [4]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [5]:
# 6x2 frame of standard-normal samples, indexed by the two-level index,
# with columns 'A' and 'B'. No seed is set, so values differ on every run.
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])

In [6]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-2.281127,-1.00393
G1,2,0.405798,-0.245864
G1,3,0.772349,0.89454
G2,1,-0.050176,-0.527995
G2,2,0.741617,-0.001575
G2,3,-0.092755,1.058154


In [7]:
# Select the outer label 'G1' first, then the inner label 1 from the
# resulting sub-frame; the result is that row as a Series.
df.loc['G1'].loc[1]

A   -2.281127
B   -1.003930
Name: 1, dtype: float64

In [8]:
# Name the two index levels so they can be selected by name
# (e.g. level='num' in xs()).

df.index.names = ['Groups','num']

In [9]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-2.281127,-1.00393
G1,2,0.405798,-0.245864
G1,3,0.772349,0.89454
G2,1,-0.050176,-0.527995
G2,2,0.741617,-0.001575
G2,3,-0.092755,1.058154


In [10]:
# Drilling down to a single value: row ('G2', 2), column 'B'

# Step 1: select the outer group 'G2'

df.loc['G2']

Unnamed: 0_level_0,A,B
num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.050176,-0.527995
2,0.741617,-0.001575
3,-0.092755,1.058154


In [11]:
# Step 2: within 'G2', select the row whose inner label is 2

df.loc['G2'].loc[2]

A    0.741617
B   -0.001575
Name: 2, dtype: float64

In [12]:
# Step 3: from that row, pick column 'B' to get the scalar value

df.loc['G2'].loc[2].loc['B']

-0.0015750740380970044

In [13]:
df


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-2.281127,-1.00393
G1,2,0.405798,-0.245864
G1,3,0.772349,0.89454
G2,1,-0.050176,-0.527995
G2,2,0.741617,-0.001575
G2,3,-0.092755,1.058154


Returning cross section of rows or columns


In [14]:
# xs() returns a cross-section of a DataFrame and is mainly useful with a
# multi-level index. Given only a key, it selects on the outermost level.

df.xs(key='G1')

Unnamed: 0_level_0,A,B
num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-2.281127,-1.00393
2,0.405798,-0.245864
3,0.772349,0.89454


In [15]:
# Passing level= makes xs() match the key against a named inner level:
# here we take every row whose 'num' label is 1, across all groups.

df.xs(key=1, level='num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-2.281127,-1.00393
G2,-0.050176,-0.527995


# Pandas MISSING Data

In [19]:
# Build a dictionary with three keys; each key becomes a column of the
# DataFrame created from it.
# np.nan is used to signify a missing (null) value.

d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

In [21]:
# Dictionary keys become column names; the lists become column values.
# Note: this rebinds `df`, replacing the multi-index frame used above.
df = pd.DataFrame(data=d)

In [22]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


Here we are going to look at the `dropna()` method.

Often we want to drop missing values from a data set; for that we use the `dropna()` method.

Called with no arguments, `dropna()` drops every row that contains at least one missing value.

Please see below

In [26]:
# Drop every row that contains at least one null value
# (rows, i.e. axis=0, are the default).

df.dropna(axis=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [27]:
# Drop every column that contains at least one null value.

df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [30]:
# thresh is the minimum number of non-null values a row must contain to be
# kept; rows with fewer non-null values are dropped.
# With thresh=2, row 2 (only 'C' is non-null) is dropped.

df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [31]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [32]:
# fillna() replaces missing values with the given value.
# It returns a new frame; df itself is left unchanged.

df.fillna('Fill Value')

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,Fill Value,2
2,Fill Value,Fill Value,3


In [38]:
# Sometimes we need to fill in (impute) missing values with a statistic
# such as the mean or median.

# Below, the missing values in column 'A' are replaced by the column's mean
# (mean() skips NaN by default).
#
# The filled column is assigned back to df['A'] instead of calling
# fillna(inplace=True) on df['A']: that form is chained assignment on an
# intermediate Series, which recent pandas versions warn about and which
# does not modify df when Copy-on-Write is enabled.

df['A'] = df['A'].fillna(df['A'].mean())

In [39]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,1.5,,3
