# Dataframes

In [1]:
import pandas as pd
import numpy as np

In [2]:
df= pd.DataFrame(data=np.random.randint(0,100,20).reshape(5,4), columns= ["col1","col2","col3","col4",], index= ['row1','row2','row3','row4','row5'])
df

Unnamed: 0,col1,col2,col3,col4
row1,74,77,41,63
row2,66,34,24,3
row3,5,71,58,55
row4,65,99,12,84
row5,71,88,91,47


In [3]:
#Conditional seletion
df>10

Unnamed: 0,col1,col2,col3,col4
row1,True,True,True,True
row2,True,True,True,False
row3,False,True,True,True
row4,True,True,True,True
row5,True,True,True,True


In [4]:
#Filtering the dataframe according to a condition applied on col1
df[df["col1"]>10]

Unnamed: 0,col1,col2,col3,col4
row1,74,77,41,63
row2,66,34,24,3
row4,65,99,12,84
row5,71,88,91,47


In [5]:
#Filtering the dataframe according to a condition applied on col1 and taking out col1 only
df[df["col1"]>10] ['col1']

row1    74
row2    66
row4    65
row5    71
Name: col1, dtype: int32

In [6]:
#Filtering dataframe based on multiple conditions
df[  (df['col2']>50)  & (df['col1']>50)  ]     #using &(and) operator

Unnamed: 0,col1,col2,col3,col4
row1,74,77,41,63
row4,65,99,12,84
row5,71,88,91,47


In [7]:
#Filtering dataframe based on multiple conditions
df[  (df['col2']>50)  | (df['col1']>50)  ]     #using |(or) operator

Unnamed: 0,col1,col2,col3,col4
row1,74,77,41,63
row2,66,34,24,3
row3,5,71,58,55
row4,65,99,12,84
row5,71,88,91,47


In [8]:
#Resetting the index
df.reset_index()

Unnamed: 0,index,col1,col2,col3,col4
0,row1,74,77,41,63
1,row2,66,34,24,3
2,row3,5,71,58,55
3,row4,65,99,12,84
4,row5,71,88,91,47


In [9]:
#setting an index permanently 
df.reset_index(inplace=True)
df

Unnamed: 0,index,col1,col2,col3,col4
0,row1,74,77,41,63
1,row2,66,34,24,3
2,row3,5,71,58,55
3,row4,65,99,12,84
4,row5,71,88,91,47


In [10]:
#setting a custom index
df.set_index('index', inplace= True)
df

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row1,74,77,41,63
row2,66,34,24,3
row3,5,71,58,55
row4,65,99,12,84
row5,71,88,91,47


# Missing Values

In [11]:
#creating a new dataframe with null values from above dataframe.
new_df= df[df>15]
new_df

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row1,74.0,77,41.0,63.0
row2,66.0,34,24.0,
row3,,71,58.0,55.0
row4,65.0,99,,84.0
row5,71.0,88,91.0,47.0


In [12]:
#Dropping missing values from the dataframe
#Mention inplace= True for dropping null values permanently from the dataframe.
#This will drop rows containing the null values
new_df.dropna()

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row1,74.0,77,41.0,63.0
row5,71.0,88,91.0,47.0


In [13]:
#dropping columns containing null values
new_df.dropna(axis=1)

Unnamed: 0_level_0,col2
index,Unnamed: 1_level_1
row1,77
row2,34
row3,71
row4,99
row5,88


In [14]:
#Filling the null values with 5
new_df.fillna(value= 5)

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row1,74.0,77,41.0,63.0
row2,66.0,34,24.0,5.0
row3,5.0,71,58.0,55.0
row4,65.0,99,5.0,84.0
row5,71.0,88,91.0,47.0


In [15]:
#Filling the null valyes with a condition as sum of column1
new_df.fillna(value= new_df["col1"].sum(),inplace=True)
new_df

Unnamed: 0_level_0,col1,col2,col3,col4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row1,74.0,77,41.0,63.0
row2,66.0,34,24.0,276.0
row3,276.0,71,58.0,55.0
row4,65.0,99,276.0,84.0
row5,71.0,88,91.0,47.0


# Group By

In [16]:
countries= {"Countries":['India', 'India', 'USA', 'USA', 'China'], "Sales":[100,200,100,300,400]}
newdf2= pd.DataFrame(countries)
newdf2

Unnamed: 0,Countries,Sales
0,India,100
1,India,200
2,USA,100
3,USA,300
4,China,400


In [17]:
#Grouping by rows based on column names
newdf2.groupby('Countries').sum()

Unnamed: 0_level_0,Sales
Countries,Unnamed: 1_level_1
China,400
India,300
USA,400


In [18]:
newdf2.groupby("Countries").describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Countries,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
China,1.0,400.0,,400.0,400.0,400.0,400.0,400.0
India,2.0,150.0,70.710678,100.0,125.0,150.0,175.0,200.0
USA,2.0,200.0,141.421356,100.0,150.0,200.0,250.0,300.0
