In [2]:
import numpy
import pandas as pd
import requests
import unittest
from datetime import datetime

### Preprocessing

In [3]:
df = pd.read_csv('../dataset/sudo_tasmania_crash_2010_2020.csv')

In [4]:
df.head(2)

Unnamed: 0,light_condition,crash_date,severity,location_description,centre_line,visited,id,report_date,description,vcrn,longitude,latitude,surface_type,speed_zone
0,Daylight,2013-04-09T00:00:00.000+0000,Serious,"West Tamar Highway, Riverside, West Tamar",Single broken,Yes,4073,2013-04-09T00:00:00.000+0000,171 - Left off carriageway into object or park...,13000144.0,147.071676,-41.390335,Sealed,100
1,Daylight,2013-04-08T00:00:00.000+0000,Property Damage Only,Intersection of Campbell Street and Liverpool ...,Single Continuous,No,4086,2013-04-09T00:00:00.000+0000,136 - Vehicles in parallel lane/ right turn si...,13000145.0,147.329838,-42.879023,Sealed,50


In [5]:
df.columns

Index(['light_condition', 'crash_date', 'severity', 'location_description',
       'centre_line', 'visited', 'id', 'report_date', 'description', 'vcrn',
       'longitude', 'latitude', 'surface_type', 'speed_zone'],
      dtype='object')

In [6]:
to_keep = ['crash_date', 'latitude', 'light_condition', 'longitude', 'severity']
df.drop(set(df.columns)-set(to_keep), axis=1)
df.head(2)

Unnamed: 0,light_condition,crash_date,severity,location_description,centre_line,visited,id,report_date,description,vcrn,longitude,latitude,surface_type,speed_zone
0,Daylight,2013-04-09T00:00:00.000+0000,Serious,"West Tamar Highway, Riverside, West Tamar",Single broken,Yes,4073,2013-04-09T00:00:00.000+0000,171 - Left off carriageway into object or park...,13000144.0,147.071676,-41.390335,Sealed,100
1,Daylight,2013-04-08T00:00:00.000+0000,Property Damage Only,Intersection of Campbell Street and Liverpool ...,Single Continuous,No,4086,2013-04-09T00:00:00.000+0000,136 - Vehicles in parallel lane/ right turn si...,13000145.0,147.329838,-42.879023,Sealed,50


In [29]:
df['severity'].unique()

array(['Serious', 'Property Damage Only', 'First Aid', 'Minor', 'Fatal',
       'Not known'], dtype=object)

In [None]:
{'Not known':-1, 'Property Damage Only':0, 'Minor':1, 'First Aid':2, 'Serious':3, 'Fatal':4}

### API request

In [12]:
headers = {
            'Cache-Control': 'no-cache',
            'X-API-Key': 'aab76f73f089453aab4c3e8fea5a7cec',
            'User-agent' : 'CloudComp'}

r = requests.get('https://gateway.api.epa.vic.gov.au/environmentMonitoring/v1/sites/parameters?environmentalSegment=air',headers=headers )
r.text

200

In [22]:
d1 = datetime.strptime('2019-07-14T23:00:00Z', '%Y-%m-%dT%H:%M:%SZ')
d2 = datetime.strptime('2019-07-12T23:00:00Z', '%Y-%m-%dT%H:%M:%SZ')

In [23]:
d1 < d2

False

### Tests on fetch_epa()

In [13]:
type(datetime.strptime('2010-11-23T11:45:33Z', '%Y-%m-%dT%H:%M:%SZ')) == datetime

True

In [14]:
datetime.strptime('2010-11-23T11:45:33Z', '%Y-%m-%dT%H:%M:%SZ')

datetime.datetime(2010, 11, 23, 11, 45, 33)

In [16]:
datetime(2010, 11, 23, 11, 45, 33)

datetime.datetime(2010, 11, 23, 11, 45, 33)

In [None]:
class Tests(unittest.TestCase):
    
    def test_fetch_epa(self):

        bad_keys_dict1 = {'records':[{'not_parameters':1}]}
        bad_keys_dict2 = {'records':[{'parameters':[{'not_name':1,'timeSeriesReadings':2}],
                                      'geometry':{'coordinates':3}}]}
        bad_keys_dict3 = {'records':[{'parameters':[{'name':1,'not_timeSeriesReadings':2}],
                                      'geometry':{'coordinates':3}}]}
        bad_keys_dict4 = {'records':[{'parameters':[{'name':1,'timeSeriesReadings':{'not_readings':4}}],
                                      'geometry':{'coordinates':3}}]}
        
        good_keys_dict = {'records':[{'parameters':[{'name':1,'timeSeriesReadings':{'readings':{'since':'2010-11-22T10:45:33Z',
                                                                                                'until':'2011-12-23T11:46:34Z',
                                                                                                'averageValue':5}}}],
                                      'geometry':{'coordinates':3}}]}


        self.assertEqual(fetch_epa(bad_keys_dict1), [])
        self.assertEqual(fetch_epa(bad_keys_dict2), [])
        self.assertEqual(fetch_epa(bad_keys_dict3), [])
        self.assertEqual(fetch_epa(bad_keys_dict4), [])

        self.assertEqual(fetch_epa(good_keys_dict), [{'name':1,'location':3,
                                                      'start':datetime(2010, 11, 22, 10, 45, 33),
                                                      'end':datetime(2011, 12, 23, 11, 46, 34),
                                                      'value':5}])



### Filtering the already downloaded data

In [None]:
def accepting_new_data(new_data, current_data):
    #readings is a list of dictionary, each one is a reading of the air quality 

    latest_current_df = current_data.groupby(['name', 'location'])['end'].max()

    kept_data = new_data.copy()

    for index in new_data.index:
        if new_data.loc[index,'end'] <= latest_current_df.loc[new_data.loc[index,'name']
                                                                          [new_data.loc[index,'location']],'end'] :
            kept_data.drop(index, axis='index')
    
    return kept_data

#    current_data[current_data['end'] > latest_current_df.loc[[current_data['name']][current_data['location']],'end']] 



In [9]:
df2 = df.groupby(['light_condition','severity'])['id'].max()
df2

light_condition                  severity            
Darkness (with street light)     Fatal                   50743981
                                 First Aid               50891686
                                 Minor                   50892562
                                 Not known               50851906
                                 Property Damage Only    50893517
                                 Serious                 50850565
Darkness (without street light)  Fatal                   50887142
                                 First Aid               50883347
                                 Minor                   50890335
                                 Not known               50893551
                                 Property Damage Only    50892483
                                 Serious                 50877777
Dawn / Dusk                      Fatal                   50599762
                                 First Aid               50861392
                      

In [11]:
df2.loc['Daylight']['Fatal']

50881284

### 

### Posting the new data

In [14]:
import elasticsearch as es

In [None]:

#the data is dict
es.index(index='airquality',document=data)