In [2]:
import pandas

In [3]:
# Load the hourly pollution readings from the CSV file; parse_dates asks pandas
# to convert the 'date' column to datetimes while reading.
# Notice the bold column names across the top of the displayed data frame.
pollution = pandas.read_csv(
    'Example 01 Tabular Data/LSTM-Multivariate_pollution.csv',
    parse_dates=['date'],
)
pollution

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,2010-02-01 00:00:00,129,-16,-4.0,1020.0,SE,1.79,0,0
1,2010-02-01 01:00:00,148,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-02-01 02:00:00,159,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-02-01 03:00:00,181,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-02-01 04:00:00,138,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...,...
43795,2014-12-31 19:00:00,8,-23,-2.0,1034.0,NW,231.97,0,0
43796,2014-12-31 20:00:00,10,-22,-3.0,1034.0,NW,237.78,0,0
43797,2014-12-31 21:00:00,10,-22,-3.0,1034.0,NW,242.70,0,0
43798,2014-12-31 22:00:00,8,-22,-4.0,1034.0,NW,246.72,0,0


In [4]:
# We can ask for the names of all the columns; pandas returns them as an Index object.
pollution.columns

Index(['date', 'pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd',
       'snow', 'rain'],
      dtype='object')

In [5]:
# We can use those column names as an index to retrieve just that column.
# Because the columns are indexed by these names, we can perform that lookup very quickly;
# the computer doesn't need to search column by column.
pollution["temp"]

0       -4.0
1       -4.0
2       -5.0
3       -5.0
4       -5.0
        ... 
43795   -2.0
43796   -3.0
43797   -3.0
43798   -4.0
43799   -3.0
Name: temp, Length: 43800, dtype: float64

In [7]:
# If we want to look up the row that has a particular date value, the computer needs to
# compare every row's date against that value (producing a True/False mask) and then keep the matching rows.
pollution[pollution.date == "2014-12-31 20:00:00"]

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
43796,2014-12-31 20:00:00,10,-22,-3.0,1034.0,NW,237.78,0,0


In [9]:
# We can use the %timeit magic command to measure how long that operation takes to execute
# (it runs the statement many times and reports the mean and standard deviation).
%timeit pollution[pollution.date == "2014-12-31 20:00:00"]

303 µs ± 15.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
pollution.date == "2014-12-31 20:00:00"

139 µs ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
# Notice that each of the rows also has an index label, which is shown in bold above.
# In this example the row index labels are integers (which is the default).
# We can use these row and column indexes to very quickly look up a particular value
# based on its labels (43797 for the row and 'temp' for the column).
pollution.loc[43797, 'temp']

-3.0

In [15]:
# Notice how much faster the label-based lookup is than the search through the date column timed earlier.
%timeit pollution.loc[43797, 'temp']

6.18 µs ± 577 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:
# That's very efficient, but we can only look up a row in this manner if we know its row number (which is not particularly meaningful).

In [17]:
# Instead, we can tell pandas to use the 'date' column as the row index labels.
# NOTE(review): this rebinds `pollution`, so the earlier cells that use pollution.date or
# integer row labels presumably stop working until the first load cell is re-run - confirm
# before executing cells out of order.
pollution = pandas.read_csv(
    'Example 01 Tabular Data/LSTM-Multivariate_pollution.csv',
    parse_dates=['date'],
    index_col='date',
)

In [18]:
# Notice how we don't have a regular date column anymore; it is now the row index
# (info() reports it as a DatetimeIndex rather than listing it among the data columns).
pollution.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43800 entries, 2010-02-01 00:00:00 to 2014-12-31 23:00:00
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pollution  43800 non-null  int64  
 1   dew        43800 non-null  int64  
 2   temp       43800 non-null  float64
 3   press      43800 non-null  float64
 4   wnd_dir    43800 non-null  object 
 5   wnd_spd    43800 non-null  float64
 6   snow       43800 non-null  int64  
 7   rain       43800 non-null  int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 3.0+ MB


In [19]:
# We can see the new row index labels (the date-time values) in bold below.
pollution

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-02-01 00:00:00,129,-16,-4.0,1020.0,SE,1.79,0,0
2010-02-01 01:00:00,148,-15,-4.0,1020.0,SE,2.68,0,0
2010-02-01 02:00:00,159,-11,-5.0,1021.0,SE,3.57,0,0
2010-02-01 03:00:00,181,-7,-5.0,1022.0,SE,5.36,1,0
2010-02-01 04:00:00,138,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...
2014-12-31 19:00:00,8,-23,-2.0,1034.0,NW,231.97,0,0
2014-12-31 20:00:00,10,-22,-3.0,1034.0,NW,237.78,0,0
2014-12-31 21:00:00,10,-22,-3.0,1034.0,NW,242.70,0,0
2014-12-31 22:00:00,8,-22,-4.0,1034.0,NW,246.72,0,0


In [20]:
# We don't have a date column anymore, but we can still get at the date values through the .index attribute.
pollution.index

DatetimeIndex(['2010-02-01 00:00:00', '2010-02-01 01:00:00',
               '2010-02-01 02:00:00', '2010-02-01 03:00:00',
               '2010-02-01 04:00:00', '2010-02-01 05:00:00',
               '2010-02-01 06:00:00', '2010-02-01 07:00:00',
               '2010-02-01 08:00:00', '2010-02-01 09:00:00',
               ...
               '2014-12-31 14:00:00', '2014-12-31 15:00:00',
               '2014-12-31 16:00:00', '2014-12-31 17:00:00',
               '2014-12-31 18:00:00', '2014-12-31 19:00:00',
               '2014-12-31 20:00:00', '2014-12-31 21:00:00',
               '2014-12-31 22:00:00', '2014-12-31 23:00:00'],
              dtype='datetime64[ns]', name='date', length=43800, freq=None)

In [21]:
# We can now look up a particular row and column directly by a more meaningful row label
# (i.e. a date time) instead of building a mask over every row.
pollution.loc["2014-12-31 20:00:00", "temp"]

-3.0

In [22]:
# Compare this timing with the search through all the date column values timed earlier.
# In the recorded run it is faster than that search, but slower than the integer-label lookup;
# presumably part of the cost is converting the string label to a timestamp on every call - TODO confirm.
%timeit pollution.loc["2014-12-31 20:00:00", "temp"]

203 µs ± 14.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
