# Pandas Introduction - DataFrame part 4

In [1]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [2]:
# Population figures keyed state -> {year: value}.  Nevada has no entry for
# the year 2000, so that cell is NaN once the frame is built.
pop_data = {
    'Nevada': {2004: 0.0, 2003: 0.0, 2002: 2.9, 2001: 2.4},
    'Ohio': {2004: 0.0, 2003: 0.0, 2002: 3.6, 2001: 1.7, 2000: 1.5},
    'California': {2004: 0.0, 2003: 0.0, 2002: 0.0, 2001: 0.0, 2000: 0.0},
    'Texas': {2004: 0.0, 2003: 0.0, 2002: 0.0, 2001: 0.0, 2000: 0.0},
}

# Outer keys become the columns (states), inner keys the rows (years).
df3 = DataFrame(pop_data)

# Label both axes so the displayed tables are self-describing.
df3.index.name = 'Year'
df3.columns.name = 'State'

# Transpose to one row per state, then blank out a single cell so the
# missing-value examples have a second NaN to work with.
df4 = df3.T
df4.loc['Texas', 2003] = np.nan
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [3]:
# Work on an independent copy so the edits below leave df4 untouched.
df5 = df4.copy()
# Name the two year columns explicitly.  A label slice such as 2004:2003 only
# picks them up while the columns happen to be in descending order; on an
# ascending column index it would select nothing and the assignment would
# silently do nothing.
df5.loc['Nevada', [2004, 2003]] = 1.0
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


## Dealing with Missing Values

In [4]:
# Keep only the rows (states) that have a NaN in at least one year.
row_has_nan = df4.isnull().any(axis='columns')
df4[row_has_nan]

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Texas,0.0,,0.0,0.0,0.0


In [5]:
# Boolean frame marking every NaN cell; assigning through it overwrites those
# cells in df4 itself.
nan_cells = df4.isnull()
df4[nan_cells] = -1.0
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,-1.0
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,-1.0,0.0,0.0,0.0


Now, put the NaN values back (replacing the -1 placeholders) for another example.

In [6]:
# Turn every cell equal to -1 back into NaN, modifying df4 directly.
placeholder_cells = df4 == -1
df4[placeholder_cells] = np.nan
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


The *fillna* method is a simpler way to replace NaN values, but note that, unlike the direct assignment above, it does not modify the dataframe in place (i.e., it returns a modified copy).

In [7]:
# Show a version of df4 with each NaN replaced by -1.
df4.fillna(value=-1)

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,-1.0
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,-1.0,0.0,0.0,0.0


In [8]:
# Re-display df4: its NaN cells are still present after the fillna call above.
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


Now, use *fillna* with the *inplace* option to actually change the dataframe itself.

In [9]:
# inplace=True writes the -1 replacements into df4 itself.
df4.fillna(value=-1, inplace=True)

In [10]:
# df4 itself now holds -1 where the NaN values used to be.
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,-1.0
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,-1.0,0.0,0.0,0.0


In [11]:
# Summary statistics per year.  df4 currently holds -1 in place of its two
# missing values, and those placeholders are counted as real data here.
df4.describe()

Year,2004,2003,2002,2001,2000
count,4.0,4.0,4.0,4.0,4.0
mean,0.0,-0.25,1.625,1.025,0.125
std,0.0,0.5,1.898025,1.21758,1.030776
min,0.0,-1.0,0.0,0.0,-1.0
25%,0.0,-0.25,0.0,0.0,-0.25
50%,0.0,0.0,1.45,0.85,0.0
75%,0.0,0.0,3.075,1.875,0.375
max,0.0,0.0,3.6,2.4,1.5


We could choose to drop all rows with any missing values...

First, though, put the NaN values back in place of the -1 placeholders.

In [12]:
# Swap the -1 placeholders back to NaN so the dropna examples have missing
# values to act on; this changes df4 directly.
is_placeholder = df4 == -1
df4[is_placeholder] = np.nan
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [13]:
# Defaults spelled out: drop every row (state) that contains at least one NaN.
df4.dropna(axis='index', how='any')

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0


...or all columns with any missing values.

In [14]:
# Drop every column (year) that contains at least one NaN.
df4.dropna(axis=1, how='any')

Year,2004,2002,2001
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nevada,0.0,2.9,2.4
Ohio,0.0,3.6,1.7
California,0.0,0.0,0.0
Texas,0.0,0.0,0.0


The *dropna* method has options.  For example, let's only drop rows where *all* values are missing.

In [15]:
# Current contents of df4, before the all-NaN row is added.
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


First, add a row in which every value is null.

In [16]:
# 'Arkansas' is not yet in the index, so this assignment appends a new row;
# the scalar NaN fills every column of it.
df4.loc['Arkansas'] = np.nan

In [17]:
# df4 now ends with an 'Arkansas' row in which every value is NaN.
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0
Arkansas,,,,,


In [18]:
# how='all' removes only rows in which every value is NaN; inplace=True
# applies the removal to df4 itself.
df4.dropna(axis='index', how='all', inplace=True)
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


## Applying User Written Functions to Dataframes

Define a function *f* that operates on an entire column (or row), passed as *x*.

In [19]:
def f(x):
    """Return the spread of *x*: its largest value minus its smallest."""
    return x.max() - x.min()

In [20]:
# df5 is the copy of df4 made earlier, with Nevada's 2004 and 2003 values set to 1.0.
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


Apply the function *f* to every column in the dataframe.

In [21]:
# Default axis spelled out: f receives one column (year) at a time.
df5.apply(f, axis='index')

Year
2004    1.0
2003    1.0
2002    3.6
2001    2.4
2000    1.5
dtype: float64

Now apply the function *f* to every row in the dataframe.

In [22]:
# axis=1 is the numeric spelling of 'columns': f receives one row (state) at a time.
df5.apply(f, axis=1)

State
Nevada        1.9
Ohio          3.6
California    0.0
Texas         0.0
dtype: float64

Define a function (in this case a format function) to apply individually to every cell in the dataframe.

In [23]:
# NOTE: this name shadows Python's built-in format() for the rest of the notebook.
def format(x):
    """Render *x* as a string with exactly three digits after the decimal point."""
    return f'{x:.3f}'

The *applymap* method applies the specified function to each individual cell, rather than to each row or column as *apply* does.

In [24]:
# Apply the formatter to every individual cell.  DataFrame.applymap was
# deprecated in pandas 2.1 (renamed to DataFrame.map) and emits a
# FutureWarning there, so use the new name when this pandas provides it and
# fall back to applymap on older releases.
cellwise = df5.map if hasattr(df5, 'map') else df5.applymap
cellwise(format)

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


## Sorting Dataframes

In [25]:
# Starting point for the sorting examples: df5 in its existing row and column order.
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


The *sort_index* method arranges rows based upon the sort order of index values.

In [26]:
# Default axis spelled out: order the rows by their index labels (state names).
df5.sort_index(axis='index')

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,0.0,0.0,0.0,0.0,0.0
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
Texas,0.0,,0.0,0.0,0.0


In [27]:
# df5 still has its original row order after the sort_index call above.
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


Sort by both the row index labels and the column labels.

In [28]:
# Two passes: first order the rows by state name, then order the columns by year.
rows_sorted = df5.sort_index()
rows_sorted.sort_index(axis='columns')

Year,2000,2001,2002,2003,2004
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,0.0,0.0,0.0,0.0,0.0
Nevada,,2.4,2.9,1.0,1.0
Ohio,1.5,1.7,3.6,0.0,0.0
Texas,0.0,0.0,0.0,,0.0


Sort by the values of a particular column.

In [29]:
# Order the rows by their value in the 2002 column, smallest first (the default).
df5.sort_values(by=2002, ascending=True)

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5


Sort by first column value, then by second column value on ties.

In [30]:
# Primary key is the 2003 column; the 2002 column breaks ties.
sort_keys = [2003, 2002]
df5.sort_values(by=sort_keys)

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,0.0,0.0,0.0,0.0,0.0
Ohio,0.0,0.0,3.6,1.7,1.5
Nevada,1.0,1.0,2.9,2.4,
Texas,0.0,,0.0,0.0,0.0


Put any missing values at the top of the sort order.

In [31]:
# Same two sort keys, but rows with a NaN in a sort column are listed first.
df5.sort_values(
    by=[2003, 2002],
    na_position='first',
)

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Texas,0.0,,0.0,0.0,0.0
California,0.0,0.0,0.0,0.0,0.0
Ohio,0.0,0.0,3.6,1.7,1.5
Nevada,1.0,1.0,2.9,2.4,


Sort the columns by the values in a particular row (here, Ohio), in descending order.

In [32]:
# axis=1 is the numeric spelling of 'columns': reorder the year columns by
# their value in the 'Ohio' row, largest first.
df5.sort_values(by='Ohio', axis=1, ascending=False)

Year,2002,2001,2000,2004,2003
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,2.9,2.4,,1.0,1.0
Ohio,3.6,1.7,1.5,0.0,0.0
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,0.0,0.0,0.0,
