All About DataFrame

# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [234]:
import pandas as pd
import numpy as np

In [235]:
from numpy.random import randn
np.random.seed(101)

In [236]:
table = randn(5,4)
table

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [237]:
df = pd.DataFrame(table, index=['a','b','c','d','e'],columns =['W','X','Y','Z'])
# or
#df = pd.DataFrame(table, index=['a','b','c','d','e'],columns = 'W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [238]:
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

In [239]:
df.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [240]:
# Another example 
# employee data base

emplist = [ [100,'Vipul',33,70000],
            [300,'Vaibhav',23,40000],
            [500,'Aniket',53,1170000],
            [200,'Srini',43,220000], 
            [400,'Abhishek',29,72000] ]

#empdf = pd.DataFrame(emplist)  # can create without column name (column name as 0 (as an index),1,2,3)
#empdf = pd.DataFrame(emplist, columns=['empid','ename','age','salary'])  # with column name but index will be 0,1,2,3 and empid
empdf = pd.DataFrame(emplist, columns=['empid','emp name','age','salary'])   
empdf.set_index('empid',inplace=True)

In [241]:
empdf

Unnamed: 0_level_0,emp name,age,salary
empid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,Vipul,33,70000
300,Vaibhav,23,40000
500,Aniket,53,1170000
200,Srini,43,220000
400,Abhishek,29,72000


In [242]:
type(empdf)

pandas.core.frame.DataFrame

In [243]:
type(empdf['salary'])

pandas.core.series.Series

In [244]:
type(empdf['emp name'])

pandas.core.series.Series

In [245]:
type(empdf['age'])

pandas.core.series.Series

In [246]:
type(empdf['Salary'])     # column name Case sensitive 

KeyError: 'Salary'

In [250]:
empdf.'emp name'

SyntaxError: invalid syntax (<ipython-input-250-42cf58bbafee>, line 1)

In [251]:
empdf['emp name']  # Accessing one column at a time

empid
100       Vipul
300     Vaibhav
500      Aniket
200       Srini
400    Abhishek
Name: emp name, dtype: object

In [253]:
empdf.salary  # another method, drawback if column name contains space this method would not work

empid
100      70000
300      40000
500    1170000
200     220000
400      72000
Name: salary, dtype: int64

In [255]:
empdf[['emp name','salary']]

Unnamed: 0_level_0,emp name,salary
empid,Unnamed: 1_level_1,Unnamed: 2_level_1
100,Vipul,70000
300,Vaibhav,40000
500,Aniket,1170000
200,Srini,220000
400,Abhishek,72000


In [256]:
empdf.index

Int64Index([100, 300, 500, 200, 400], dtype='int64', name='empid')

In [257]:
empdf.iloc[0]       # Accessing rows with 'i'ndex 0 

emp name    Vipul
age            33
salary      70000
Name: 100, dtype: object

In [292]:
empdf.iloc[100]           # there is no index 100

IndexError: single positional indexer is out-of-bounds

In [258]:
empdf.loc[100]       # Accessing row using field index - empid 

emp name    Vipul
age            33
salary      70000
Name: 100, dtype: object

In [259]:
empdf.loc[400]         # Accessing entire row using field index - empid

emp name    Abhishek
age               29
salary         72000
Name: 400, dtype: object

In [260]:
empdf.loc[[100,300]]  # Accessing 2  rows using index - fancy method - multiple rows

Unnamed: 0_level_0,emp name,age,salary
empid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,Vipul,33,70000
300,Vaibhav,23,40000


In [261]:
empdf.iloc[0:3]      # Slicing method to access rows using iloc

Unnamed: 0_level_0,emp name,age,salary
empid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,Vipul,33,70000
300,Vaibhav,23,40000
500,Aniket,53,1170000


In [262]:
empdf.loc[ [100,300], ['ename','age']] 

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,ename,age
empid,Unnamed: 1_level_1,Unnamed: 2_level_1
100,,33
300,,23


In [264]:
empdf.loc[[200],['emp name']]       # Accessing specific row and column value using loc function

Unnamed: 0_level_0,emp name
empid,Unnamed: 1_level_1
200,Srini


In [265]:
empdf.loc[[200],['salary']]   # row and column 

Unnamed: 0_level_0,salary
empid,Unnamed: 1_level_1
200,220000


In [266]:
empdf

Unnamed: 0_level_0,emp name,age,salary
empid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,Vipul,33,70000
300,Vaibhav,23,40000
500,Aniket,53,1170000
200,Srini,43,220000
400,Abhishek,29,72000


In [267]:
# Add new column to existing data frame
empdf['lastname'] = ['Kapoor','Kochar','Modi','shaha','Kumar']
empdf['middlename'] = ['Vikas','Dharmendra','sharad','Palash','Atul']
empdf

Unnamed: 0_level_0,emp name,age,salary,lastname,middlename
empid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100,Vipul,33,70000,Kapoor,Vikas
300,Vaibhav,23,40000,Kochar,Dharmendra
500,Aniket,53,1170000,Modi,sharad
200,Srini,43,220000,shaha,Palash
400,Abhishek,29,72000,Kumar,Atul


In [269]:
empdf = empdf.reset_index(level=None)

In [270]:
empdf

Unnamed: 0,index,empid,emp name,age,salary,lastname,middlename
0,0,100,Vipul,33,70000,Kapoor,Vikas
1,1,300,Vaibhav,23,40000,Kochar,Dharmendra
2,2,500,Aniket,53,1170000,Modi,sharad
3,3,200,Srini,43,220000,shaha,Palash
4,4,400,Abhishek,29,72000,Kumar,Atul


In [271]:
# Concatanate middlename and lastname
empdf['mlname'] = empdf['middlename'] + ' ' + empdf['lastname']

In [272]:
empdf

Unnamed: 0,index,empid,emp name,age,salary,lastname,middlename,mlname
0,0,100,Vipul,33,70000,Kapoor,Vikas,Vikas Kapoor
1,1,300,Vaibhav,23,40000,Kochar,Dharmendra,Dharmendra Kochar
2,2,500,Aniket,53,1170000,Modi,sharad,sharad Modi
3,3,200,Srini,43,220000,shaha,Palash,Palash shaha
4,4,400,Abhishek,29,72000,Kumar,Atul,Atul Kumar


In [273]:
empdf.drop(['middlename', 'index'],axis=1,inplace=True)     # axis 1 mean column

In [274]:
empdf

Unnamed: 0,empid,emp name,age,salary,lastname,mlname
0,100,Vipul,33,70000,Kapoor,Vikas Kapoor
1,300,Vaibhav,23,40000,Kochar,Dharmendra Kochar
2,500,Aniket,53,1170000,Modi,sharad Modi
3,200,Srini,43,220000,shaha,Palash shaha
4,400,Abhishek,29,72000,Kumar,Atul Kumar


In [275]:
empdf['agesalarycol'] = empdf['age'] + empdf['salary']      # Adds sal and age and generates new rows for this column

In [276]:
empdf

Unnamed: 0,empid,emp name,age,salary,lastname,mlname,agesalarycol
0,100,Vipul,33,70000,Kapoor,Vikas Kapoor,70033
1,300,Vaibhav,23,40000,Kochar,Dharmendra Kochar,40023
2,500,Aniket,53,1170000,Modi,sharad Modi,1170053
3,200,Srini,43,220000,shaha,Palash shaha,220043
4,400,Abhishek,29,72000,Kumar,Atul Kumar,72029


In [277]:
# Lets rename agesalarycol to age_sal
empdf.rename(columns = {'agesalarycol':'age_sal'}, inplace = True) 

In [278]:
empdf

Unnamed: 0,empid,emp name,age,salary,lastname,mlname,age_sal
0,100,Vipul,33,70000,Kapoor,Vikas Kapoor,70033
1,300,Vaibhav,23,40000,Kochar,Dharmendra Kochar,40023
2,500,Aniket,53,1170000,Modi,sharad Modi,1170053
3,200,Srini,43,220000,shaha,Palash shaha,220043
4,400,Abhishek,29,72000,Kumar,Atul Kumar,72029


In [279]:
# Add row to data frame

dict = {'empid':600,
'emp name':'new name',
'age': 45,
'salary':99999,
'lastname': 'Jain',
'mlname': 'Vimal',
'ncol':1200,
'age_sal': 99945}


In [280]:
rowlist=[]
rowlist.append(dict)
empdf = empdf.append(rowlist,ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [281]:
empdf

Unnamed: 0,age,age_sal,emp name,empid,lastname,mlname,ncol,salary
0,33,70033,Vipul,100,Kapoor,Vikas Kapoor,,70000
1,23,40023,Vaibhav,300,Kochar,Dharmendra Kochar,,40000
2,53,1170053,Aniket,500,Modi,sharad Modi,,1170000
3,43,220043,Srini,200,shaha,Palash shaha,,220000
4,29,72029,Abhishek,400,Kumar,Atul Kumar,,72000
5,45,99945,new name,600,Jain,Vimal,1200.0,99999


In [216]:
# See the added row has index 0 again
empdf.reset_index(level=None,inplace=True)

In [217]:
empdf

Unnamed: 0,level_0,index,empid,emp name,age,salary,lastname,mlname,ncol,age_sal
0,0,0.0,100,Vipul,33,70000,Kapoor,Vikas Kapoor,70100,70033
1,1,1.0,300,Vaibhav,23,40000,Kochar,Dharmendra Kochar,40300,40023
2,2,2.0,500,Aniket,53,1170000,Modi,sharad Modi,1170500,1170053
3,3,3.0,200,Srini,43,220000,shaha,Palash shaha,220200,220043
4,4,4.0,400,Abhishek,29,72000,Kumar,Atul Kumar,72400,72029
5,0,,600,new name,45,99999,Jain,Vimal,1200,99945


In [282]:
# Try adding one more row using ignore_index=False

# Add row to data frame

dict = {'empid':700,
'emp name':'new name1',
'age': 46,
'salary':88888,
'lastname': 'Vishwas1',
'mlname': 'Nandkumar',
'ncol':1100,
'age_sal': 88845}

rowlist=[]
rowlist.append(dict)
empdf = empdf.append(rowlist,ignore_index=False)

In [283]:
empdf  # Observe index = 0 for last row so whenever you add new rows, ensure to have ignore_index=True to have correct index

Unnamed: 0,age,age_sal,emp name,empid,lastname,mlname,ncol,salary
0,33,70033,Vipul,100,Kapoor,Vikas Kapoor,,70000
1,23,40023,Vaibhav,300,Kochar,Dharmendra Kochar,,40000
2,53,1170053,Aniket,500,Modi,sharad Modi,,1170000
3,43,220043,Srini,200,shaha,Palash shaha,,220000
4,29,72029,Abhishek,400,Kumar,Atul Kumar,,72000
5,45,99945,new name,600,Jain,Vimal,1200.0,99999
0,46,88845,new name1,700,Vishwas1,Nandkumar,1100.0,88888


In [287]:
empdf.drop(0, inplace=True)           # both rows are disappeared as index to both were 0

KeyError: '[0] not found in axis'

In [289]:
empdf.drop(2, inplace=True, axis=0)

In [290]:
empdf

Unnamed: 0,age,age_sal,emp name,empid,lastname,mlname,ncol,salary
1,23,40023,Vaibhav,300,Kochar,Dharmendra Kochar,,40000
3,43,220043,Srini,200,shaha,Palash shaha,,220000
4,29,72029,Abhishek,400,Kumar,Atul Kumar,,72000
5,45,99945,new name,600,Jain,Vimal,1200.0,99999


In [295]:
# set index age
empdf.set_index('age',inplace=True) 

In [296]:
empdf

Unnamed: 0_level_0,age_sal,emp name,empid,lastname,mlname,ncol,salary
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23,40023,Vaibhav,300,Kochar,Dharmendra Kochar,,40000
43,220043,Srini,200,shaha,Palash shaha,,220000
29,72029,Abhishek,400,Kumar,Atul Kumar,,72000
45,99945,new name,600,Jain,Vimal,1200.0,99999


In [299]:
# reset index
empdf.reset_index(level=None, inplace=True)

In [300]:
empdf

Unnamed: 0,age,age_sal,emp name,empid,lastname,mlname,ncol,salary
0,23,40023,Vaibhav,300,Kochar,Dharmendra Kochar,,40000
1,43,220043,Srini,200,shaha,Palash shaha,,220000
2,29,72029,Abhishek,400,Kumar,Atul Kumar,,72000
3,45,99945,new name,600,Jain,Vimal,1200.0,99999


In [None]:
# Will multi hierachical index later 