In [2]:
import pandas as pd
import pickle
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import download_airnow_web
import pyproj
%matplotlib inline
import numpy as np
from calibration.dataprocessing import plotsensorencounters,build_encounters,combinedatasets,loaddata,processBAMdata

# Download and organise data

The following code can take a while as it downloads data from servers and merges big dataframes. If you want to analyse the data and have already downloaded it - you might be best skipping straight to the analysis step below.

The `loaddata` call below takes a long time, as it downloads data from GCP.
Typically I comment it out and just load the local pickled copy instead.

In [4]:
import pickle

# Service-account key file handed to loaddata for the GCP download.
gcp_key_path = "/home/mike/.gcp_keys/airqo-250220-e1dc140554a6.json"
airqodf = loaddata(gcp_key_path)

# Local-copy toggles: uncomment the first line to save a copy after a download,
# or the second to load that copy instead of calling loaddata.
#pickle.dump(airqodf,open('airqodf.p','wb'))
#airqodf = pickle.load(open('airqodf.p','rb'))

Found cache file containing data up to 2020-10-28
Downloading more data to date 2020-11-02
SQL query:
SELECT * FROM `airqo-250220.thingspeak.clean_feeds_pms` WHERE (created_at >= DATETIME('2020-10-28')) AND (created_at < DATETIME('2020-11-02'))
Query complete, converting to dataframe...
done
                created_at  channel_id  entry_id  pm2_5    pm10  s2_pm2_5  \
0      2020-10-28 04:35:25      930427    193825  95.28  103.70     99.37   
1      2020-10-28 01:25:45      912222    190421  40.12   47.93     38.10   
2      2020-10-28 01:12:12      912222    190411  31.68   35.40     32.03   
3      2020-10-28 02:08:03      689749    454517  19.17   21.47     22.17   
4      2020-10-28 03:54:56      930426    241839  46.93   56.00     46.97   
...                    ...         ...       ...    ...     ...       ...   
225036 2020-11-01 11:47:39      730014    516366  10.77   11.12      9.73   
225037 2020-11-01 11:46:18      730014    516365   9.92    9.92     11.35   
225038 2020-11

In [11]:
# Count the rows whose channel_id is 930432.
airqodf['channel_id'].eq(930432).sum()

0

In [5]:
# Most recent timestamp present in the AirQo data.
airqodf['created_at'].max()

Timestamp('2020-10-20 23:59:59')

Download the latest embassy data, then add its latitude, longitude and channel_id (-1), and copy its PM2.5 and timestamp columns into new columns named according to our conventions.

In [3]:
# Fetch the Kampala embassy readings and add the columns the rest of the
# notebook expects: copies of the concentration and local-time columns under
# our names, a fixed channel_id of -1, and the monitor's coordinates.
embassydf = download_airnow_web.download('Kampala').assign(
    pm2_5=lambda raw: raw['Raw Conc.'],
    created_at=lambda raw: raw['Date (LT)'],
    channel_id=-1,
    latitude=0.299333,
    longitude=32.592539,
)

The BAM data has been copied from the Google folder Joel set up. This method parses that data and converts it to a dataframe.

In [4]:
# BAM device id -> numeric id passed to processBAMdata.
# NOTE(review): presumably these become negative channel_ids, like the
# embassy's -1 - confirm against processBAMdata.
bam_device_ids = {'Y24517': -24517, 'Y24516': -24516}
bamdf = processBAMdata('/home/mike/Documents/Research/air/BAM Data', bam_device_ids)

Device Ids: 
['Y24517' 'Y24516']


I found that I ran out of memory doing some of this, so I drop many of the columns from the GCP airqo database.

In [5]:
# Shed columns the analysis does not use, to keep memory usage down.
airqodf = airqodf.drop(
    columns=['entry_id', 'pm10', 'voltage', 's2_pm10', 'altitude', 'wind', 'no_sats', 'hdope']
)

Here we move everything to UTC:
- the AirQo data is already in UTC
- the BAM and Embassy data are in local time, so they get converted.

We then call our 'combinedatasets' method to merge the three sets together, and save them.

In [6]:
# Put all three sources on timezone-aware UTC timestamps before merging.

# AirQo timestamps are already UTC but naive, so they only need labelling.
# tz_localize raises on a column that is already tz-aware, so run this cell
# once per freshly loaded airqodf.
airqodf['created_at'] = airqodf['created_at'].dt.tz_localize('UTC')

# BAM and embassy timestamps are in local (Kampala) time: parse them, label
# them as local time, then convert to UTC.
bamdf['created_at'] = (
    pd.to_datetime(bamdf['Time'], format="%Y-%m-%d %H:%M:%S")
    .dt.tz_localize('Africa/Kampala')
    .dt.tz_convert('UTC')
)
embassydf['created_at'] = (
    pd.to_datetime(embassydf['created_at'], format="%Y-%m-%d %I:%M %p")
    .dt.tz_localize('Africa/Kampala')
    .dt.tz_convert('UTC')
)

# Merge AirQo with BAM first, then add the embassy data.
df = combinedatasets(airqodf, bamdf)
df = combinedatasets(df, embassydf)

# Save the merged frame. The with-block closes the file as soon as the dump
# finishes instead of leaving an open handle behind.
with open('alldataprocessed.p', 'wb') as fh:
    pickle.dump(df, fh)

  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))


# Analysis

Collecting and processing the data above is quite time-consuming, so to use a saved version we can just start here.

In [7]:
# Reload the merged dataset saved by the processing section.
# pickle.load can execute arbitrary code, so only load a file you created yourself.
# The with-block makes sure the file handle is closed after reading.
with open('alldataprocessed.p', 'rb') as fh:
    df = pickle.load(fh)

In [8]:
# Keep only readings taken after midnight UTC on 2020-07-15.
df = df.loc[df['created_at'].gt(pd.Timestamp('2020-07-15', tz='UTC'))]

Here we build the encounters dataframe, which records all the times two sensors are near each other.
It can take a few minutes, so if you've already run it, skip to loading it on the next line.

In [9]:
# Earlier approach, kept for reference: relabel embassy rows after the
# midpoint of the date range as channel -2.
#min(df['created_at']) + (max(df['created_at'])-min(df['created_at']))/2
#2020-07-15 to 2020-10-22
#2020-09-02 is the middle
#df.loc[(df['created_at']>pd.Timestamp('2020-09-02',tz='UTC')) & (df['channel_id']==-1),'channel_id'] = -2

# Current approach: split the embassy rows (channel_id -1) into alternating
# 24-hour blocks. Rows in the first 24 hours of every 48-hour period, counted
# from 2020-07-15 UTC, are relabelled as channel -2.
# NOTE(review): presumably channel -2 stands in for a second, uncalibrated
# sensor at the embassy site (cf. `simnoncalsensor`, `_embassycopy`) - confirm.
td=(df['created_at']-pd.Timestamp('2020-07-15',tz='UTC'))
# Hours elapsed since the reference date, folded into a 48-hour cycle.
simnoncalsensor = (td/pd.Timedelta(hours=1))%48<24
df.loc[simnoncalsensor & (df['channel_id']==-1),'channel_id'] = -2

# Save the relabelled frame. The with-block closes the file once the dump is
# done instead of leaving an open handle behind.
with open('alldataprocessed_embassycopy.p', 'wb') as fh:
    pickle.dump(df, fh)

In [10]:
# Build the encounters table from the merged data (this can take a few minutes).
encounters = build_encounters(df)

# Save the result so later sessions can skip the rebuild. The with-block
# closes the file once the dump is done instead of leaving an open handle.
with open('encounters_embassycopy.p', 'wb') as fh:
    pickle.dump(encounters, fh)

1 of 49 (181 encounters recorded)
2 of 49 (181 encounters recorded)
3 of 49 (181 encounters recorded)
4 of 49 (183 encounters recorded)
5 of 49 (184 encounters recorded)
6 of 49 (188 encounters recorded)
7 of 49 (188 encounters recorded)
8 of 49 (188 encounters recorded)
9 of 49 (188 encounters recorded)
10 of 49 (633 encounters recorded)
11 of 49 (633 encounters recorded)
12 of 49 (633 encounters recorded)
13 of 49 (633 encounters recorded)
14 of 49 (633 encounters recorded)
15 of 49 (633 encounters recorded)
16 of 49 (756 encounters recorded)
17 of 49 (756 encounters recorded)
18 of 49 (939 encounters recorded)
19 of 49 (939 encounters recorded)
20 of 49 (939 encounters recorded)
21 of 49 (1100 encounters recorded)
22 of 49 (1100 encounters recorded)
23 of 49 (1454 encounters recorded)
24 of 49 (1454 encounters recorded)
25 of 49 (1454 encounters recorded)
26 of 49 (1454 encounters recorded)
27 of 49 (1687 encounters recorded)
28 of 49 (1712 encounters recorded)
29 of 49 (1712 encoun