# Pandas Introduction - DataFrame part 3

In [1]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [2]:
# Nested mapping: outer keys are states, inner keys are years.
# Nevada has no entry for 2000, so that cell ends up as NaN in the frame.
pop_data = {
    'Nevada': {2004: 0.0, 2003: 0.0, 2002: 2.9, 2001: 2.4},
    'Ohio': {2004: 0.0, 2003: 0.0, 2002: 3.6, 2001: 1.7, 2000: 1.5},
    'California': {2004: 0.0, 2003: 0.0, 2002: 0.0, 2001: 0.0, 2000: 0.0},
    'Texas': {2004: 0.0, 2003: 0.0, 2002: 0.0, 2001: 0.0, 2000: 0.0},
}

# Outer keys become the columns (states); inner keys become the row index (years).
df3 = DataFrame(pop_data)

# Label both axes so the printed frame is self-explanatory.
df3.columns.name = 'State'
df3.index.name = 'Year'

# Transpose to one row per state, then blank out a single cell so the
# examples below have a second missing value to deal with.
df4 = df3.T
df4.loc['Texas', 2003] = np.nan

## Descriptive Statistics from DataFrames

In [3]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [4]:
# Per-column summary statistics. NaN cells are excluded, which is why the
# 2003 and 2000 columns report a count of 3 rather than 4.
df4.describe()

Year,2004,2003,2002,2001,2000
count,4.0,3.0,4.0,4.0,3.0
mean,0.0,0.0,1.625,1.025,0.5
std,0.0,0.0,1.898025,1.21758,0.866025
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.45,0.85,0.0
75%,0.0,0.0,3.075,1.875,0.75
max,0.0,0.0,3.6,2.4,1.5


In [5]:
# Keep the states whose 2002 value or 2000 value exceeds 1.0.
# Each comparison yields a boolean Series; `|` combines them element-wise.
# The parentheses are required because `|` binds tighter than `>`.
df5 = df4[(df4[2002] > 1.0) | (df4[2000] > 1.0)]
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5


In [6]:
df5.describe()

Year,2004,2003,2002,2001,2000
count,2.0,2.0,2.0,2.0,1.0
mean,0.0,0.0,3.25,2.05,1.5
std,0.0,0.0,0.494975,0.494975,
min,0.0,0.0,2.9,1.7,1.5
25%,0.0,0.0,3.075,1.875,1.5
50%,0.0,0.0,3.25,2.05,1.5
75%,0.0,0.0,3.425,2.225,1.5
max,0.0,0.0,3.6,2.4,1.5


In [7]:
# Same filter as the df5 cell, chained straight into describe() without
# storing the intermediate frame; the summary matches df5.describe().
df4[(df4[2002] > 1.0) | (df4[2000] > 1.0)].describe()

Year,2004,2003,2002,2001,2000
count,2.0,2.0,2.0,2.0,1.0
mean,0.0,0.0,3.25,2.05,1.5
std,0.0,0.0,0.494975,0.494975,
min,0.0,0.0,2.9,1.7,1.5
25%,0.0,0.0,3.075,1.875,1.5
50%,0.0,0.0,3.25,2.05,1.5
75%,0.0,0.0,3.425,2.225,1.5
max,0.0,0.0,3.6,2.4,1.5


In [8]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [9]:
df4.sum()

Year
2004    0.0
2003    0.0
2002    6.5
2001    4.1
2000    1.5
dtype: float64

In [10]:
df4.mean()

Year
2004    0.000
2003    0.000
2002    1.625
2001    1.025
2000    0.500
dtype: float64

In [11]:
df4[2001].mean()

1.025

In [12]:
df4.median()

Year
2004    0.00
2003    0.00
2002    1.45
2001    0.85
2000    0.00
dtype: float64

In [13]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [14]:
df4.mode()

Year,2004,2003,2002,2001,2000
0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Modify df4 in place: 3.6 now appears twice in the 2002 column (Ohio and
# California), which changes that column's mode in the cells that follow.
df4.loc['California', 2002] = 3.6

In [16]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,3.6,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [17]:
df4.mode()

Year,2004,2003,2002,2001,2000
0,0.0,0.0,3.6,0.0,0.0


In [18]:
# Indexing with a boolean DataFrame keeps the original shape and replaces every
# cell that fails the test (zeros and NaN) with NaN, so describe() only sees
# the positive values. Columns with no positive values report a count of 0.
df4[df4 > 0].describe()

Year,2004,2003,2002,2001,2000
count,0.0,0.0,3.0,2.0,1.0
mean,,,3.366667,2.05,1.5
std,,,0.404145,0.494975,
min,,,2.9,1.7,1.5
25%,,,3.25,1.875,1.5
50%,,,3.6,2.05,1.5
75%,,,3.6,2.225,1.5
max,,,3.6,2.4,1.5


In [19]:
df4[2002].describe()

count    4.000000
mean     2.525000
std      1.715372
min      0.000000
25%      2.175000
50%      3.250000
75%      3.600000
max      3.600000
Name: 2002, dtype: float64

In [20]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,3.6,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [21]:
# Select the 2002 column for rows where it is positive in one .loc step
# (row mask, column label), then summarise that Series.
df4.loc[df4[2002] > 0, 2002].describe()

count    3.000000
mean     3.366667
std      0.404145
min      2.900000
25%      3.250000
50%      3.600000
75%      3.600000
max      3.600000
Name: 2002, dtype: float64

In [22]:
# NOTE: this binds a second name to the same DataFrame object; it is not a
# copy. Every assignment made through df6 below also changes df4.
df6 = df4
df6

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,3.6,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [23]:
df6[2001].value_counts()

0.0    2
1.7    1
2.4    1
Name: 2001, dtype: int64

In [24]:
df6[[2002, 2001]].value_counts()

2002  2001
0.0   0.0     1
2.9   2.4     1
3.6   0.0     1
      1.7     1
dtype: int64

In [25]:
# Count identical rows across all columns. Rows that contain a NaN (Nevada and
# Texas here) do not appear in the result.
df6.value_counts()

2004  2003  2002  2001  2000
0.0   0.0   3.6   0.0   0.0     1
                  1.7   1.5     1
dtype: int64

In [26]:
# Put California's 2002 value back to 0.0. df6 is the same object as df4, so
# this also restores df4.
df6.loc['California', 2002] = 0.0
df6

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [27]:
df6.value_counts()

2004  2003  2002  2001  2000
0.0   0.0   0.0   0.0   0.0     1
            3.6   1.7   1.5     1
dtype: int64

In [28]:
# Fill Texas's missing 2003 value so the row contains no NaN; it then matches
# California's all-zero row exactly. (df6 is df4, so df4 changes as well.)
df6.loc['Texas', 2003] = 0.0
df6

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,0.0,0.0,0.0,0.0


In [29]:
df6.value_counts()

2004  2003  2002  2001  2000
0.0   0.0   0.0   0.0   0.0     2
            3.6   1.7   1.5     1
dtype: int64

In [30]:
# Restore the NaN so the frame (shared by df6 and df4) is back in its
# earlier state.
df6.loc['Texas', 2003] = np.nan
df6

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


Let's find the value counts for every column in the dataframe.  The *apply* method calls the given function once per column (the default) or once per row, passing each column or row to it as a Series.

In [47]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [48]:
# Count how often each value occurs within every column. apply() hands each
# column to the function as a Series, so Series.value_counts is passed
# directly; the top-level pd.value_counts helper is deprecated (pandas 2.1+).
# Values absent from a column show up as NaN in the combined table.
df4.apply(pd.Series.value_counts)

Year,2004,2003,2002,2001,2000
0.0,4.0,3.0,2.0,2.0,2.0
1.5,,,,,1.0
1.7,,,,1.0,
2.4,,,,1.0,
2.9,,,1.0,,
3.6,,,1.0,,


In [50]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [49]:
# Same idea along the other axis: with axis='columns' each row (state) is
# passed to the function as a Series, giving per-state value counts.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
df4.apply(pd.Series.value_counts, axis='columns')

Unnamed: 0_level_0,0.0,1.5,1.7,2.4,2.9,3.6
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Nevada,2.0,,,1.0,1.0,
Ohio,2.0,1.0,1.0,,,1.0
California,5.0,,,,,
Texas,4.0,,,,,


In [46]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


To invoke *describe* across the values of each row, use the *apply* method together with *describe* and `axis='columns'`.

In [51]:
# Summarise each row (state) across the year columns. With axis='columns'
# the function receives one row at a time as a Series, so Series.describe is
# the method that matches the argument type. NaN cells are skipped, hence
# the count of 4 for Nevada and Texas.
df4.apply(Series.describe, axis='columns')

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Nevada,4.0,1.325,1.543535,0.0,0.0,1.2,2.525,2.9
Ohio,5.0,1.36,1.487616,0.0,0.0,1.5,1.7,3.6
California,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Texas,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Call *describe* on a specific row.

In [35]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [36]:
df4.loc['Ohio'].describe()

count    5.000000
mean     1.360000
std      1.487616
min      0.000000
25%      0.000000
50%      1.500000
75%      1.700000
max      3.600000
Name: Ohio, dtype: float64

Go a little crazy invoking descriptive methods on a number of conditions...

In [37]:
# Label-based slice on the columns. The year labels run 2004 -> 2000 from left
# to right, so 2002:2000 selects 2002, 2001 and 2000 (both end labels are
# included), giving the three values summarised here.
df4.loc['Ohio', 2002:2000].describe()

count    3.000000
mean     2.266667
std      1.159023
min      1.500000
25%      1.600000
50%      1.700000
75%      2.650000
max      3.600000
Name: Ohio, dtype: float64

In [38]:
df4.loc['Ohio'].sum()

6.8

In [39]:
df4[df4[2002] > 2.0]

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5


In [40]:
df4[df4[2002] > 2.0].describe()

Year,2004,2003,2002,2001,2000
count,2.0,2.0,2.0,2.0,1.0
mean,0.0,0.0,3.25,2.05,1.5
std,0.0,0.0,0.494975,0.494975,
min,0.0,0.0,2.9,1.7,1.5
25%,0.0,0.0,3.075,1.875,1.5
50%,0.0,0.0,3.25,2.05,1.5
75%,0.0,0.0,3.425,2.225,1.5
max,0.0,0.0,3.6,2.4,1.5


In [41]:
# Rebind df5 (previously the filtered frame) to an independent copy of df4,
# so the edits that follow leave df4 untouched.
df5 = df4.copy()
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [42]:
# Label slice 2004:2003 covers both columns (the end label is included), so
# Nevada's 2004 and 2003 cells are both set to 1.0.
df5.loc['Nevada', 2004:2003] = 1.0

In [43]:
df5

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,1.0,1.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0


In [44]:
# Per-column value counts for the modified copy; the new 1.0 entries appear
# as their own row. Series.value_counts is used because the top-level
# pd.value_counts helper is deprecated (pandas 2.1+).
df5.apply(pd.Series.value_counts)

Year,2004,2003,2002,2001,2000
0.0,3.0,2.0,2.0,2.0,2.0
1.0,1.0,1.0,,,
1.5,,,,,1.0
1.7,,,,1.0,
2.4,,,,1.0,
2.9,,,1.0,,
3.6,,,1.0,,


In [45]:
df4

Year,2004,2003,2002,2001,2000
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevada,0.0,0.0,2.9,2.4,
Ohio,0.0,0.0,3.6,1.7,1.5
California,0.0,0.0,0.0,0.0,0.0
Texas,0.0,,0.0,0.0,0.0
