# Data Indexing and Selection Examples

2020-09-28 - Jeff Smith

In [1]:
# Getting things ready
import numpy as np
import pandas as pd
from datetime import datetime

def show(data, show_data = 0):
    """Print a structural summary of a pandas DataFrame or Series.

    Always prints the object's type, index and shape.  For a DataFrame it
    then prints the column index followed by one "name (dtype)" line per
    column; for a Series (which has no ``columns`` attribute) it prints the
    Series name and dtype instead.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to describe.
    show_data : int, default 0
        0 -- summary only;
        1 -- additionally print ``data.head()``;
        2 -- additionally print ``data.values``.
        Any other value behaves like 0.
    """
    print("   Type: {:}".format(type(data)))
    print("  Index: {:}".format(data.index))
    print("  Shape: {:}".format(data.shape))
    if hasattr(data, "columns"):
        print("Columns: {:}".format(data.columns))
        # Walk data.dtypes instead of indexing data[col]: with duplicate
        # column names data[col] is a DataFrame, which has no .dtype.
        for col, dtype in data.dtypes.items():
            print("    {:} ({:})".format(col, dtype))
    else:
        # A Series has a single name/dtype rather than a set of columns.
        print("   Name: {:} ({:})".format(data.name, data.dtype))
    if show_data == 1:
        print(data.head())
    elif show_data == 2:
        print(data.values)
        
print("Pandas version: {:}".format(pd.__version__))

filepath = "../data/"

Pandas version: 1.0.5


## US Economic Data dataset
## Default read_csv()

In [2]:
# Start with the US Economic Data dataset.
df1 = pd.read_csv(filepath + '10_us_economic_data.csv')
show(df1,1)

   Type: <class 'pandas.core.frame.DataFrame'>
  Index: RangeIndex(start=0, stop=128, step=1)
  Shape: (128, 4)
Columns: Index(['Month', 'UnemploymentRate', 'JobsAdded', 'GDP'], dtype='object')
    Month (object)
    UnemploymentRate (float64)
    JobsAdded (float64)
    GDP (float64)
    Month  UnemploymentRate  JobsAdded       GDP
0  1/2008               5.0        8.0  14651.04
1  2/2008               4.9      -81.0       NaN
2  3/2008               5.1      -55.0       NaN
3  4/2008               5.0     -229.0  14805.61
4  5/2008               5.4     -184.0       NaN


In [3]:
# Extracting a single column (as a Series)
# Month - Either syntax works since the column name is a valid Python identifier and does not clash with an existing DataFrame attribute or method.
df1.Month
#df1["Month"]

0      1/2008
1      2/2008
2      3/2008
3      4/2008
4      5/2008
        ...  
123    4/2018
124    5/2018
125    6/2018
126    7/2018
127    8/2018
Name: Month, Length: 128, dtype: object

In [4]:
# Extracting multiple Columns
df1[['Month', 'UnemploymentRate']]
# note the double brackets

Unnamed: 0,Month,UnemploymentRate
0,1/2008,5.0
1,2/2008,4.9
2,3/2008,5.1
3,4/2008,5.0
4,5/2008,5.4
...,...,...
123,4/2018,3.9
124,5/2018,3.8
125,6/2018,4.0
126,7/2018,3.9


In [5]:
# What about the first 12 entries (the first year of data)?
df1.loc[0:11]
# why does the row for index 11 show up?  For normal Python slices, it would not (why not?)
# Try iloc rather than loc.

Unnamed: 0,Month,UnemploymentRate,JobsAdded,GDP
0,1/2008,5.0,8.0,14651.04
1,2/2008,4.9,-81.0,
2,3/2008,5.1,-55.0,
3,4/2008,5.0,-229.0,14805.61
4,5/2008,5.4,-184.0,
5,6/2008,5.6,-154.0,
6,7/2008,5.8,-213.0,14835.19
7,8/2008,6.1,-277.0,
8,9/2008,6.1,-443.0,
9,10/2008,6.5,-475.0,14559.54


In [6]:
# What month had the highest unemployment rate?
rate = df1.UnemploymentRate.max()
df1.loc[df1.UnemploymentRate == rate]

Unnamed: 0,Month,UnemploymentRate,JobsAdded,GDP
21,10/2009,10.0,-209.0,14628.02


In [7]:
# or all at once
df1.loc[df1.UnemploymentRate == df1.UnemploymentRate.max()]

Unnamed: 0,Month,UnemploymentRate,JobsAdded,GDP
21,10/2009,10.0,-209.0,14628.02


In [8]:
# Can also apply the max function to the DataFrame
df1.max()
# Also try min(), sum(), std() and others ..
# https://pandas.pydata.org/pandas-docs/stable/reference/frame.html

Month                9/2017
UnemploymentRate         10
JobsAdded               522
GDP                 20621.1
dtype: object

In [9]:
# What months had the unemployment rate greater than or equal to 9.5
df1[df1.UnemploymentRate >= 9.5]
# Here the index expression is a mask -- see the next cell

Unnamed: 0,Month,UnemploymentRate,JobsAdded,GDP
17,6/2009,9.5,-469.0,
18,7/2009,9.5,-342.0,14420.31
19,8/2009,9.6,-196.0,
20,9/2009,9.8,-229.0,
21,10/2009,10.0,-209.0,14628.02
22,11/2009,9.9,12.0,
23,12/2009,9.9,-277.0,
24,1/2010,9.8,18.0,14721.35
25,2/2010,9.8,-73.0,
26,3/2010,9.9,193.0,


In [10]:
# the mask
df1.UnemploymentRate >= 9.5

0      False
1      False
2      False
3      False
4      False
       ...  
123    False
124    False
125    False
126    False
127    False
Name: UnemploymentRate, Length: 128, dtype: bool

In [11]:
# What about the Unemployment Rate and Jobs Added numbers for 
# the year 2010?
df1[['Month', 'UnemploymentRate','JobsAdded']].loc[24:35]

Unnamed: 0,Month,UnemploymentRate,JobsAdded
24,1/2010,9.8,18.0
25,2/2010,9.8,-73.0
26,3/2010,9.9,193.0
27,4/2010,9.9,221.0
28,5/2010,9.6,522.0
29,6/2010,9.4,-140.0
30,7/2010,9.4,-78.0
31,8/2010,9.5,-16.0
32,9/2010,9.5,-63.0
33,10/2010,9.4,267.0


In [12]:
# or
df1.iloc[24:36, :3]

Unnamed: 0,Month,UnemploymentRate,JobsAdded
24,1/2010,9.8,18.0
25,2/2010,9.8,-73.0
26,3/2010,9.9,193.0
27,4/2010,9.9,221.0
28,5/2010,9.6,522.0
29,6/2010,9.4,-140.0
30,7/2010,9.4,-78.0
31,8/2010,9.5,-16.0
32,9/2010,9.5,-63.0
33,10/2010,9.4,267.0


In [13]:
# or
df1.loc[24:35,['Month', 'UnemploymentRate','JobsAdded']]

Unnamed: 0,Month,UnemploymentRate,JobsAdded
24,1/2010,9.8,18.0
25,2/2010,9.8,-73.0
26,3/2010,9.9,193.0
27,4/2010,9.9,221.0
28,5/2010,9.6,522.0
29,6/2010,9.4,-140.0
30,7/2010,9.4,-78.0
31,8/2010,9.5,-16.0
32,9/2010,9.5,-63.0
33,10/2010,9.4,267.0


In [14]:
# What about the JobsAdded for the 12-month period after the month
# with the maximum unemployment rate?
# From the previous query, I know that max unemployment occurred at loc 21
df1[['Month', 'JobsAdded', 'UnemploymentRate']].loc[22:33]

Unnamed: 0,Month,JobsAdded,UnemploymentRate
22,11/2009,12.0,9.9
23,12/2009,-277.0,9.9
24,1/2010,18.0,9.8
25,2/2010,-73.0,9.8
26,3/2010,193.0,9.9
27,4/2010,221.0,9.9
28,5/2010,522.0,9.6
29,6/2010,-140.0,9.4
30,7/2010,-78.0,9.4
31,8/2010,-16.0,9.5


In [15]:
# or -- by using the idxmax ("index label of the entry with maximum value") function.
# idxmax returns an index *label*, which is what the label-based .loc expects;
# argmax returns an integer *position* and would only agree with .loc while
# df1 keeps its default RangeIndex.
peak = df1.UnemploymentRate.idxmax()
# Rows peak+1 .. peak+12 (label slices include both ends) and the three columns, in one .loc call.
df1.loc[peak + 1:peak + 12, ['Month', 'JobsAdded', 'UnemploymentRate']]

Unnamed: 0,Month,JobsAdded,UnemploymentRate
22,11/2009,12.0,9.9
23,12/2009,-277.0,9.9
24,1/2010,18.0,9.8
25,2/2010,-73.0,9.8
26,3/2010,193.0,9.9
27,4/2010,221.0,9.9
28,5/2010,522.0,9.6
29,6/2010,-140.0,9.4
30,7/2010,-78.0,9.4
31,8/2010,-16.0,9.5


## Enhanced call to read_csv()

In the default version of the DataFrame, the dates were read as string objects.  This makes date arithmetic clunky.  Let's try reading them in as datetime objects and setting the month to be the DataFrame index.

In [16]:
# Here, if I want to use the Month as the index AND specify that Pandas
# parse the dates for the index:
df2 = pd.read_csv(filepath + '10_us_economic_data.csv', parse_dates = ['Month'], index_col=0)
show(df2,1)

   Type: <class 'pandas.core.frame.DataFrame'>
  Index: DatetimeIndex(['2008-01-01', '2008-02-01', '2008-03-01', '2008-04-01',
               '2008-05-01', '2008-06-01', '2008-07-01', '2008-08-01',
               '2008-09-01', '2008-10-01',
               ...
               '2017-11-01', '2017-12-01', '2018-01-01', '2018-02-01',
               '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01',
               '2018-07-01', '2018-08-01'],
              dtype='datetime64[ns]', name='Month', length=128, freq=None)
  Shape: (128, 3)
Columns: Index(['UnemploymentRate', 'JobsAdded', 'GDP'], dtype='object')
    UnemploymentRate (float64)
    JobsAdded (float64)
    GDP (float64)
            UnemploymentRate  JobsAdded       GDP
Month                                            
2008-01-01               5.0        8.0  14651.04
2008-02-01               4.9      -81.0       NaN
2008-03-01               5.1      -55.0       NaN
2008-04-01               5.0     -229.0  14805.61
2008-05-01     

In [17]:
# Note that we can always use the native Python (implicit) referencing.
df2.iloc[0:12]

Unnamed: 0_level_0,UnemploymentRate,JobsAdded,GDP
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-01-01,5.0,8.0,14651.04
2008-02-01,4.9,-81.0,
2008-03-01,5.1,-55.0,
2008-04-01,5.0,-229.0,14805.61
2008-05-01,5.4,-184.0,
2008-06-01,5.6,-154.0,
2008-07-01,5.8,-213.0,14835.19
2008-08-01,6.1,-277.0,
2008-09-01,6.1,-443.0,
2008-10-01,6.5,-475.0,14559.54


In [18]:
# But now we can use dates.  Note also that we're using a string rather
# than a datetime object
df2.loc['2008-02-01']

UnemploymentRate     4.9
JobsAdded          -81.0
GDP                  NaN
Name: 2008-02-01 00:00:00, dtype: float64

In [19]:
# what about a different date format?
df2.loc['02/01/08']

UnemploymentRate     4.9
JobsAdded          -81.0
GDP                  NaN
Name: 2008-02-01 00:00:00, dtype: float64

In [20]:
# Now we can do a direct date range.
df2.loc['2012-01-01':'2012-12-01']

Unnamed: 0_level_0,UnemploymentRate,JobsAdded,GDP
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.3,348.0,16019.76
2012-02-01,8.3,233.0,
2012-03-01,8.2,264.0,
2012-04-01,8.2,72.0,16152.26
2012-05-01,8.2,117.0,
2012-06-01,8.2,68.0,
2012-07-01,8.2,156.0,16257.15
2012-08-01,8.1,173.0,
2012-09-01,7.8,194.0,
2012-10-01,7.8,153.0,16358.86


In [21]:
df2.JobsAdded.loc['2012-01-01':'2012-12-01']

Month
2012-01-01    348.0
2012-02-01    233.0
2012-03-01    264.0
2012-04-01     72.0
2012-05-01    117.0
2012-06-01     68.0
2012-07-01    156.0
2012-08-01    173.0
2012-09-01    194.0
2012-10-01    153.0
2012-11-01    130.0
2012-12-01    243.0
Name: JobsAdded, dtype: float64

## Dealing with Missing Values - None and NaN Values

In [22]:
# what about null values?
df2.isnull().head(10)
# GDP is reported quarterly rather than monthly
# Note that the .head(10) is because I only want to see the first 10 rows.

Unnamed: 0_level_0,UnemploymentRate,JobsAdded,GDP
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-01-01,False,False,False
2008-02-01,False,False,True
2008-03-01,False,False,True
2008-04-01,False,False,False
2008-05-01,False,False,True
2008-06-01,False,False,True
2008-07-01,False,False,False
2008-08-01,False,False,True
2008-09-01,False,False,True
2008-10-01,False,False,False


In [23]:
# NumPy functions do not handle NA and NaN values "well."
x = np.array(df2.GDP)
np.mean(x)
# Why the quotes on "well"?

nan

In [24]:
# Remove the NA values.  Note that the entire
# records (rows) are removed, not just the specific values.
df2.dropna()

Unnamed: 0_level_0,UnemploymentRate,JobsAdded,GDP
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-01-01,5.0,8.0,14651.04
2008-04-01,5.0,-229.0,14805.61
2008-07-01,5.8,-213.0,14835.19
2008-10-01,6.5,-475.0,14559.54
2009-01-01,7.8,-787.0,14394.55
2009-04-01,9.0,-704.0,14352.85
2009-07-01,9.5,-342.0,14420.31
2009-10-01,10.0,-209.0,14628.02
2010-01-01,9.8,18.0,14721.35
2010-04-01,9.9,221.0,14926.1


In [25]:
# Now we can use NumPy on the "good" values
x = np.array(df2.dropna()['GDP'])
np.mean(x)
# Why the quotes on "good"?

16919.106976744188

In [26]:
# Or we can just use Pandas on the Series with the NA and NaN values
df2.GDP.mean()

16919.106976744188

In [27]:
# And we can use the Pandas functions across the DataFrame
df2.mean()

UnemploymentRate        6.784375
JobsAdded              84.906250
GDP                 16919.106977
dtype: float64

In [28]:
# What about non-numeric columns?  Recall that df1 has a Month (object) column.
df1.mean()

UnemploymentRate        6.784375
JobsAdded              84.906250
GDP                 16919.106977
dtype: float64

In [29]:
# We can tell Pandas to apply the function to non-numeric columns (not
# sure why we'd want to do this, but we can).
df1.mean(numeric_only=False)
# See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html for details.

TypeError: could not convert string to float: '1/20082/20083/20084/20085/20086/20087/20088/20089/200810/200811/200812/20081/20092/20093/20094/20095/20096/20097/20098/20099/200910/200911/200912/20091/20102/20103/20104/20105/20106/20107/20108/20109/201010/201011/201012/20101/20112/20113/20114/20115/20116/20117/20118/20119/201110/201111/201112/20111/20122/20123/20124/20125/20126/20127/20128/20129/201210/201211/201212/20121/20132/20133/20134/20135/20136/20137/20138/20139/201310/201311/201312/20131/20142/20143/20144/20145/20146/20147/20148/20149/201410/201411/201412/20141/20152/20153/20154/20155/20156/20157/20158/20159/201510/201511/201512/20151/20162/20163/20164/20165/20166/20167/20168/20169/201610/201611/201612/20161/20172/20173/20174/20175/20176/20177/20178/20179/201710/201711/201712/20171/20182/20183/20184/20185/20186/20187/20188/2018'

## Example Use-case With Pandas and Concatenation
See the slide set for details of the problem.

In [30]:
# start with df2
df2 = pd.read_csv(filepath + '10_us_economic_data.csv', parse_dates = ['Month'], index_col=0)

In [31]:
# Extract the gdp data and drop the nan values
g = df2.GDP.dropna()
g.head(6)

Month
2008-01-01    14651.04
2008-04-01    14805.61
2008-07-01    14835.19
2008-10-01    14559.54
2009-01-01    14394.55
2009-04-01    14352.85
Name: GDP, dtype: float64

In [32]:
# Create a series using the 2008 data (first 4 rows)
y1 = pd.Series(data = g.iloc[0:4].values, index=['Q1', 'Q2', 'Q3', 'Q4'], name="2008")
y1

Q1    14651.04
Q2    14805.61
Q3    14835.19
Q4    14559.54
Name: 2008, dtype: float64

In [33]:
# Replicate for year 2
y2 = pd.Series(data = g.iloc[4:8].values, index=['Q1', 'Q2', 'Q3', 'Q4'], name="2009")
y2

Q1    14394.55
Q2    14352.85
Q3    14420.31
Q4    14628.02
Name: 2009, dtype: float64

In [34]:
# Concatenate them together
gdp = pd.concat([y1, y2], axis=1)
show(gdp,1)

   Type: <class 'pandas.core.frame.DataFrame'>
  Index: Index(['Q1', 'Q2', 'Q3', 'Q4'], dtype='object')
  Shape: (4, 2)
Columns: Index(['2008', '2009'], dtype='object')
    2008 (float64)
    2009 (float64)
        2008      2009
Q1  14651.04  14394.55
Q2  14805.61  14352.85
Q3  14835.19  14420.31
Q4  14559.54  14628.02


In [36]:
# Now just replicate for years 3, 4, and 5 .... or create a small loop that automates the process.