# Pandas DataFrame overview


# Function Application and Mapping

In [1]:
import numpy as np
import pandas as pd
frame  = pd.DataFrame(np.random.randn(4, 3), 
                             columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

               b         d         e
Utah   -0.409346 -1.196249  1.362264
Ohio    0.583208  1.323443  1.140889
Texas   1.394336 -2.196385  0.114458
Oregon -0.876926 -1.509606 -1.244524


In [2]:
print(np.abs(frame))

               b         d         e
Utah    0.409346  1.196249  1.362264
Ohio    0.583208  1.323443  1.140889
Texas   1.394336  2.196385  0.114458
Oregon  0.876926  1.509606  1.244524


In [3]:
print(frame["d"]. min())

-2.196384743406495


In [4]:
print(frame["d"].max())

1.3234432426948768


In [5]:
print(frame["d"].max() - frame["d"]. min())

3.519827986101372


In [6]:
f = lambda x: x.max() - x.min() 
df = frame.apply(f)
print(df, type(df))

b    2.271262
d    3.519828
e    2.606788
dtype: float64 <class 'pandas.core.series.Series'>


In [7]:
df = frame.apply(f, axis=1)
print(df)

Utah      2.558513
Ohio      0.740235
Texas     3.590721
Oregon    0.632680
dtype: float64


In [8]:
def min_max(x):
  return pd.Series( [x.max() , x.min() ], index=['min', 'max'])    

df = frame.apply(min_max)
print(df, type(df))

            b         d         e
min  1.394336  1.323443  1.362264
max -0.876926 -2.196385 -1.244524 <class 'pandas.core.frame.DataFrame'>


In [9]:
def min_max(x):
  return pd.Series( [x.max() - x.min() ], index=['min-max'])    

df = frame.apply(min_max)
print(df, type(df))

                b         d         e
min-max  2.271262  3.519828  2.606788 <class 'pandas.core.frame.DataFrame'>


# Sorting and Ranking

In [10]:
# Sorting and Ranking
import numpy as np
import pandas as pd
frame  = pd.DataFrame(np.random.randn(4, 3), 
                             columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

               b         d         e
Utah    1.431659 -0.357057 -1.161687
Ohio   -1.099858  1.752138 -0.465720
Texas   0.644284 -0.322173 -0.584513
Oregon  0.468000 -0.498031 -1.390034


In [11]:
# defaults in sort: axis = 0, ascending = True
print(frame.sort_index(axis=1, ascending=True))

               b         d         e
Utah    1.431659 -0.357057 -1.161687
Ohio   -1.099858  1.752138 -0.465720
Texas   0.644284 -0.322173 -0.584513
Oregon  0.468000 -0.498031 -1.390034


In [12]:
print( frame.sort_index())

               b         d         e
Ohio   -1.099858  1.752138 -0.465720
Oregon  0.468000 -0.498031 -1.390034
Texas   0.644284 -0.322173 -0.584513
Utah    1.431659 -0.357057 -1.161687


In [13]:
frame

Unnamed: 0,b,d,e
Utah,1.431659,-0.357057,-1.161687
Ohio,-1.099858,1.752138,-0.46572
Texas,0.644284,-0.322173,-0.584513
Oregon,0.468,-0.498031,-1.390034


In [14]:
# sort by values
frame  = pd.DataFrame(np.random.randn(4, 3), 
                             columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print( frame.sort_values(by='b') )


               b         d         e
Utah    1.232950 -1.985025 -0.679833
Ohio    0.769441  0.548758 -1.266527
Texas   1.026494 -0.601035  0.956384
Oregon -0.026996 -0.157361 -0.243208
               b         d         e
Oregon -0.026996 -0.157361 -0.243208
Ohio    0.769441  0.548758 -1.266527
Texas   1.026494 -0.601035  0.956384
Utah    1.232950 -1.985025 -0.679833


In [15]:
print(frame.rank(ascending=False, method='max'))
print(frame.rank(ascending=True, method='min'))
print( frame.rank(axis='columns'))


#check details from the book
'''
'average' Default: assign the average rank to each entry in the equal group
'min'
'max'
'first'
'dense'
Use the minimum rank for the whole group
Use the maximum rank for the whole group
Assign ranks in the order the values appear in the data
Like method='min' , but ranks always increase by 1 in between groups rather than the number of equal
elements in a group
'''

          b    d    e
Utah    1.0  4.0  3.0
Ohio    3.0  1.0  4.0
Texas   2.0  3.0  1.0
Oregon  4.0  2.0  2.0
          b    d    e
Utah    4.0  1.0  2.0
Ohio    2.0  4.0  1.0
Texas   3.0  2.0  4.0
Oregon  1.0  3.0  3.0
          b    d    e
Utah    3.0  1.0  2.0
Ohio    3.0  2.0  1.0
Texas   3.0  1.0  2.0
Oregon  3.0  2.0  1.0


"\n'average' Default: assign the average rank to each entry in the equal group\n'min'\n'max'\n'first'\n'dense'\nUse the minimum rank for the whole group\nUse the maximum rank for the whole group\nAssign ranks in the order the values appear in the data\nLike method='min' , but ranks always increase by 1 in between groups rather than the number of equal\nelements in a group\n"

# Summarizing and Computing Descriptive Statistics

In [16]:
df = pd.DataFrame([
                    [1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]
                   ], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
print("Data Frame")
print(df)
print()
print("Column sum")
print( df.sum())
print()
print("Row Sum")
print( df.sum(axis='columns'))
# note: use of skipna

Data Frame
    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

Column sum
one    9.25
two   -5.80
dtype: float64

Row Sum
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64


In [17]:
print(df)
x = df.mean(axis='columns', skipna=False)
print()
print(x)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


# unique values

In [18]:
df = pd.DataFrame([
                    [1.4,1.4, 1.5, np.nan], [7.1, -4.5, 1.5, 1.4],
                    [1.4, np.nan, 0.5, np.nan], [0.75, -1.3, 1.3, np.nan]
                   ], index=['a', 'b', 'c', 'd'], columns=['one', 'two', 'three', 'four'])
print(df)
print()
print(df['one'].unique(), df['two'].unique() )
df['one'].value_counts()

    one  two  three  four
a  1.40  1.4    1.5   NaN
b  7.10 -4.5    1.5   1.4
c  1.40  NaN    0.5   NaN
d  0.75 -1.3    1.3   NaN

[1.4  7.1  0.75] [ 1.4 -4.5  nan -1.3]


1.40    2
0.75    1
7.10    1
Name: one, dtype: int64

# Arithmetic and Data Alignment

In [19]:
#print(list('bcd'))
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), 
                   columns=list('bcd'), 
                   index=['Ohio', 'Texas', 'Colorado'])

df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), 
                   columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

print(df1)
print(df2)
print()
# applying plus operation between two data frames
df3 = df1 + df2

print(df3)
# your work is to fill all Nan values of  this df3 with a number, 
# choice of number is yours

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0

            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


# Arithmetic methods with fill values

In [20]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                    columns=list('abcde'))
print(df1)
print(df2)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [21]:
df2.loc[1, 'b'] = np.nan
print(df1)
print(df2)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [22]:
df3 = df1 + df2
print("direct + operation without fill_value")
print(df3)

direct + operation without fill_value
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [23]:
# We can use add method for filling NaN cells with a value
# Nan will be replaced by 0 and then addition operation will apply
print("addition using a method with replacing Nan with 0")
df3 = df1.add(df2, fill_value=0)
print(df3)

addition using a method with replacing Nan with 0
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


# Operations between DataFrame and Series

In [24]:
import numpy as np
import pandas as pd
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
            columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


In [25]:
# iloc syntax
# iloc[start_row_pos:end_row_pos, start_column_pos: end_col_positin ]
series = frame.iloc[0]
print(series)

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


In [26]:
print(series.values)
print(series.index)

[0. 1. 2.]
Index(['b', 'd', 'e'], dtype='object')


In [27]:
print(frame)
print(series)
print(frame - series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


In [28]:
series = pd.Series([1,2,3], index =list('bde') )
print(frame)
print(series)
print(series.index, series.values)
print(frame - series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    1
d    2
e    3
dtype: int64
Index(['b', 'd', 'e'], dtype='object') [1 2 3]
          b    d    e
Utah   -1.0 -1.0 -1.0
Ohio    2.0  2.0  2.0
Texas   5.0  5.0  5.0
Oregon  8.0  8.0  8.0


# Deleting data (row or column from dataframe) 

In [29]:
import numpy as np
import pandas as pd

data_df = pd.DataFrame(np.arange(16).reshape((4, 4)), 
             index=['Ohio', 'Colorado', 'Utah', 'New York'],
             columns=['one', 'two', 'three', 'four'])

print(data_df)
data_df = data_df.drop('two', axis=1)

print(data_df)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


In [30]:
data_df.drop('Colorado', axis=0, inplace = True)
print(data_df)

          one  three  four
Ohio        0      2     3
Utah        8     10    11
New York   12     14    15


# Indexing, Selection, and Filtering

In [31]:
import numpy as np
import pandas as pd

data_df = pd.DataFrame(np.arange(16).reshape((4, 4)), 
             index=['Ohio', 'Colorado', 'Utah', 'New York'],
             columns=['one', 'two', 'three', 'four'])

print(data_df)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [32]:
df2 = data_df[ ["one", "three"] ]
print(df2)

          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [33]:
print( data_df[2:] ) # same like numpy

          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15


In [34]:
print(data_df["one"] )# dicitonary like style of accessing data

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int32


In [35]:
print(data_df.one[2:])  # filter on both row and column

Utah         8
New York    12
Name: one, dtype: int32


In [36]:
# Conditonal Selection
print ( data_df.three[data_df['three'] > 6] )

Utah        10
New York    14
Name: three, dtype: int32


In [37]:
print(data_df, "\n")
f2 = data_df["three"] > 10  # boolean dataframe
print(f2)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15 

Ohio        False
Colorado    False
Utah        False
New York     True
Name: three, dtype: bool


In [38]:
print(df2)

          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [39]:
print( data_df[  data_df["three"] > 10  ] )


          one  two  three  four
New York   12   13     14    15


# Reindexing

In [40]:
# Mind Teaser !!
#Assigning a column that already exist will ________ and 
# assigning a column that does not exist will ________


In [41]:
# how to reindex and usie of ffill value of parameter method 
import pandas as pd
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 3, 6])
print( obj3 )

0      blue
3    purple
6    yellow
dtype: object


In [42]:
# might create a new rows
obj3 =obj3.reindex(range(9) )
print(obj3)

0      blue
1       NaN
2       NaN
3    purple
4       NaN
5       NaN
6    yellow
7       NaN
8       NaN
dtype: object


In [43]:
# how to reindex and usie of ffill value of parameter method 
import pandas as pd
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 3, 6])
print( obj3 )
obj3 =obj3.reindex(range(9), method='ffill' )
print(obj3)

0      blue
3    purple
6    yellow
dtype: object
0      blue
1      blue
2      blue
3    purple
4    purple
5    purple
6    yellow
7    yellow
8    yellow
dtype: object


In [44]:
import pandas as pd
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 3, 6])
print( obj3 )
obj3 =obj3.reindex(range(2,11), method="ffill")
obj3

0      blue
3    purple
6    yellow
dtype: object


2       blue
3     purple
4     purple
5     purple
6     yellow
7     yellow
8     yellow
9     yellow
10    yellow
dtype: object

In [45]:
import numpy as np
import pandas as pd
states = pd.DataFrame(np.arange(9).reshape((3, 3) ),
index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])

print(states)
# for your own working, run following statement without ffill
states = states.reindex(['a', 'b', 'c', 'd'],method = 'ffill')
print(states)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a     0      1           2
b     0      1           2
c     3      4           5
d     6      7           8


In [46]:
# column name changing using reindex method
states = pd.DataFrame(np.arange(9).reshape((3, 3) ),
index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
print(states)
states_name = ['Texas', 'Utah', 'California','Ohio']

# can we use ffill parameter in column reindex mode ?
states =states.reindex(columns=states_name)
print(states)


   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Texas  Utah  California  Ohio
a      1   NaN           2     0
c      4   NaN           5     3
d      7   NaN           8     6


# Selection with loc and iloc

In [47]:
import numpy as np
import pandas as pd

data_df = pd.DataFrame(np.arange(16).reshape((4, 4)), 
             index=['Ohio', 'Colorado', 'Utah', 'New York'],
             columns=['one', 'two', 'three', 'four'])

print(data_df)       

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [48]:
# in the loc method specify row label first 
# then specify column names
# remember! mutiple column names require array notation
print( data_df.loc[['Colorado','Ohio'], ['two',  'three', 'one'] ] )

          two  three  one
Colorado    5      6    4
Ohio        1      2    0


In [49]:
print( data_df.iloc[2, [3, 0, 1] ] ) # using number instead of label

four    11
one      8
two      9
Name: Utah, dtype: int32


In [50]:
print( data_df.iloc[2:, [3, 0, 1] ] ) # using number instead of labels

          four  one  two
Utah        11    8    9
New York    15   12   13


In [51]:
print( data_df.iloc[:])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [52]:
print ( data_df.iloc[ :3 , :3 ] )

          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10


In [53]:
print( data_df.iloc[:, :3])

          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14
