# The Pandas DataFrame Object

A DataFrame represents a ractangler table of data, and contains an ordered collection of columns, each of which can be a different value type.

In [1]:
import numpy as np 
import pandas as pd

In [2]:
#Create a DataFrame from a 2-D ndarray
df = pd.DataFrame(np.array([ [10,11,12,13],[20,21,22,23] ]))
df

Unnamed: 0,0,1,2,3
0,10,11,12,13
1,20,21,22,23


In [3]:
#Create a DataFrame for a list of series objects
df1 = pd.DataFrame( [pd.Series(np.arange(10, 15)),
                    pd.Series(np.arange(15, 20))] )
df1

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [4]:
#Create a DataFrame with two series objects and a dictionary
s1 = pd.Series(np.arange(1, 6, 1))
s2 = pd.Series(np.arange(6, 11, 1))

df2 = pd.DataFrame({'boys': s1, 'girls': s2})
df2

Unnamed: 0,boys,girls
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [5]:
#Create a DataFrame with dictionary
data = {
            'name':   ["Asad", "Saad", "Fahad", "Ali"], 
            "age":    [23, 34, 23, 21], 
            "grades": ["A", "B", "C", "D"]
       }
data = pd.DataFrame(data)
data

Unnamed: 0,name,age,grades
0,Asad,23,A
1,Saad,34,B
2,Fahad,23,C
3,Ali,21,D


In [6]:
#Specify column name
df3 = pd.DataFrame(np.array([ [10,11], [20,21] ]), columns=["apples", "oranges"])
df3

Unnamed: 0,apples,oranges
0,10,11
1,20,21


In [7]:
#Create a DataFrame with named columns and rows
df4 = pd.DataFrame(np.array([ [10,11,12,13],[20,21,22,23] ]), index=['apples', 'oranges'], columns=['Mon', 'Tue', 'Wed', 'Thu'])

df4

Unnamed: 0,Mon,Tue,Wed,Thu
apples,10,11,12,13
oranges,20,21,22,23


In [8]:
data = {
    'state': ['Ohio','Ohio','Ohio','Neveda','Neveda','Neveda'], 
    'year':  [2000,2001,2002,2001,2002,2003],
    'pop': [1.5,1.7,3.6,2.4,2.9,3.2]
}

frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Neveda,2001,2.4
4,Neveda,2002,2.9
5,Neveda,2003,3.2


In [9]:
#in-place = false
pd.DataFrame(frame, columns=['year','state','pop', 'imports'])

Unnamed: 0,year,state,pop,imports
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Neveda,2.4,
4,2002,Neveda,2.9,
5,2003,Neveda,3.2,


In [10]:
#Select a column
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [11]:
frame['pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [12]:
frame2 = pd.DataFrame(data, columns=['year','state','pop','dept'], index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,year,state,pop,dept
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Neveda,2.4,
five,2002,Neveda,2.9,
six,2003,Neveda,3.2,


In [13]:
#Enter 100 (const) in dept column
frame2.dept = "100"
frame2

Unnamed: 0,year,state,pop,dept
one,2000,Ohio,1.5,100
two,2001,Ohio,1.7,100
three,2002,Ohio,3.6,100
four,2001,Neveda,2.4,100
five,2002,Neveda,2.9,100
six,2003,Neveda,3.2,100


In [14]:
#Enter 0-6 (range) numbers in dept column
frame2['dept'] = np.arange(6)
frame2

Unnamed: 0,year,state,pop,dept
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Neveda,2.4,3
five,2002,Neveda,2.9,4
six,2003,Neveda,3.2,5


In [15]:
#Enter (series) in dept column
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['dept'] = val
frame2

Unnamed: 0,year,state,pop,dept
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Neveda,2.4,-1.5
five,2002,Neveda,2.9,-1.7
six,2003,Neveda,3.2,


In [16]:
#Adding column to dataframe with bool values where state(column) has 'Ohio' value
frame2['eastern'] = frame2.state == 'Ohio' #ture/false
frame2

Unnamed: 0,year,state,pop,dept,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Neveda,2.4,-1.5,False
five,2002,Neveda,2.9,-1.7,False
six,2003,Neveda,3.2,,False


In [17]:
#Adding column to dataframe with bool values where pop(column) has value > 2
frame2['greaterThenTwo'] = frame2['pop'] > 2
frame2

Unnamed: 0,year,state,pop,dept,eastern,greaterThenTwo
one,2000,Ohio,1.5,,True,False
two,2001,Ohio,1.7,-1.2,True,False
three,2002,Ohio,3.6,,True,True
four,2001,Neveda,2.4,-1.5,False,True
five,2002,Neveda,2.9,-1.7,False,True
six,2003,Neveda,3.2,,False,True


In [18]:
#Delete eastern column
del frame2['eastern']

In [19]:
frame2

Unnamed: 0,year,state,pop,dept,greaterThenTwo
one,2000,Ohio,1.5,,False
two,2001,Ohio,1.7,-1.2,False
three,2002,Ohio,3.6,,True
four,2001,Neveda,2.4,-1.5,True
five,2002,Neveda,2.9,-1.7,True
six,2003,Neveda,3.2,,True


In [24]:
#Create a DataFrame with dictionary
data = {
            'name':             ["Asad", "Saad", "Fahad", "Ali"], 
            "age":              [23, 34, 23, 21], 
            "AIforEveryOne":    [89,78,90,98],
            "puthon":           [78,89,87,89],
            "git":              [90,98,87,86],
            "numpy":            [98,87,98,99]
       }
data = pd.DataFrame(data)
data

Unnamed: 0,name,age,AIforEveryOne,puthon,git,numpy
0,Asad,23,89,78,90,98
1,Saad,34,78,89,98,87
2,Fahad,23,90,87,87,98
3,Ali,21,98,89,86,99


In [34]:
#Add new columns by calculation some existing columns
data['total'] = data['AIforEveryOne']+data['puthon']+data['git']+data['numpy']
data['percentage'] = (data['total']/400)*100
data['grade'] = [ 'A+' if percent<90.0 else 'A' if 70.0<=percent<80.0 else 'B' for percent in data['percentage'] ]
data

Unnamed: 0,name,age,AIforEveryOne,puthon,git,numpy,Total,total,persentage,percentage,grade
0,Asad,23,89,78,90,98,355,355,88.75,88.75,A+
1,Saad,34,78,89,98,87,352,352,88.0,88.0,A+
2,Fahad,23,90,87,87,98,362,362,90.5,90.5,B
3,Ali,21,98,89,86,99,372,372,93.0,93.0,B


In [45]:
#Nested dict of dicts
pop = {
    'Nevada' : {2001: 2.4, 2002: 2.9},
    'Ohio':    {2000: 1.5, 2001: 1.7, 2002: 3.6}
}
df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [41]:
#Make Transposes
df3.T

Unnamed: 0,2001,2002,2000
Nedada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [42]:
#Note, after taking Transposes, actual DF never change.
df3

Unnamed: 0,Nedada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [43]:
#Adding some new indexs
pop1 = pd.DataFrame(pop, index=[2001, 2002, 2003])
pop1

Unnamed: 0,Nedada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [49]:
#Use slice to assign new value in dicts
pdata = {
    'Ohio': df3['Ohio'][:-1],   #slice Ohio column and set value to new column Ohio
    'Nevada': df3['Nevada'][:2] #slice Nevada column and set value to new column Nevada
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [52]:
#Set row(index) and column name
df3.index.name = 'year'         #Set row(index) name
df3.columns.name = 'state'      #Set column name
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5
