In [2]:
import os
import pandas as pd
import numpy as np

import locale
from locale import atof

In [3]:
# Load the measurements workbook from the datasets folder.
# '.' is declared as the thousands separator for numeric parsing.
besiktas_path = os.path.join("datasets", "besiktas.xlsx")
df = pd.read_excel(besiktas_path, engine='openpyxl', parse_dates=True, thousands='.')
print (df)

                    Tarih PM10 ( µg/m3 ) SO2 ( µg/m3 ) CO ( µg/m3 )  \
0     2012-01-01 01:00:56              -             -            -   
1     2012-01-01 02:00:56              -             -            -   
2     2012-01-01 03:00:56              -             -            -   
3     2012-01-01 04:00:56              -             -            -   
4     2012-01-01 05:00:56              -             -            -   
...                   ...            ...           ...          ...   
81690 2021-04-26 19:00:56          11,10          1,00       296,70   
81691 2021-04-26 20:00:56          17,40          0,80       254,10   
81692 2021-04-26 21:00:56          30,00          0,70       180,00   
81693 2021-04-26 22:00:56          18,60          1,00       399,00   
81694 2021-04-26 23:00:56              -          1,10       446,60   

      NO2 ( µg/m3 ) NOX ( µg/m3 ) O3 ( µg/m3 ) PM 2.5 ( µg/m3 )  
0                 -             -            -                -  
1              

In [4]:
# The sheet marks a missing reading with '-'. Swap it for the '-1' sentinel
# (not NaN) so the column can still be parsed as a number; the sentinel is
# handled in the cells below.
df = df.replace ('-', '-1')

# Values use '.' as the thousands separator and ',' as the decimal mark:
# drop the former, turn the latter into '.', then cast str -> float.
# regex=False is essential: read as a regular expression, '.' matches *every*
# character, and the default for `regex` differs between pandas versions.
# The cleanup is skipped when the column is already numeric, so re-running
# this cell cannot strip the decimal point from converted values (26.1 -> 261).
pm10_col = 'PM10 ( µg/m3 )'
if not pd.api.types.is_numeric_dtype(df[pm10_col]):
    df[pm10_col] = pd.to_numeric(
        df[pm10_col]
        .astype(str)
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False),
        downcast="float",
    )

In [5]:
# Per-column tally of cells that still hold the '-1' string sentinel.
# A column already cast to a numeric dtype cannot equal the string, so it
# reports 0 here even if it contains the numeric -1.0 sentinel.
(df == '-1').sum()

Tarih                   0
PM10 ( µg/m3 )          0
SO2 ( µg/m3 )       61745
CO ( µg/m3 )        62154
NO2 ( µg/m3 )       63773
NOX ( µg/m3 )       67718
O3 ( µg/m3 )        62743
PM 2.5 ( µg/m3 )    67638
dtype: int64

In [6]:
# Filling empty rows
# https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e
# 1- We can't just delete them because we need consistent timestamps
# 2- If too many empty rows exist we should discard them
# 3- Replacing missing data with mean/median
# 3.1- This does not capture the covariance between features
#
# The median is taken over valid readings only (everything except the -1
# sentinel). The mask is built on this single column rather than on the whole
# frame, which avoids comparing the datetime and string columns against -1.
pm10 = df['PM10 ( µg/m3 )']
pm10_median = pm10[pm10 != -1].median ()
df['PM10 ( µg/m3 )'] = pm10.replace (-1.0, pm10_median)

In [7]:
# Show the PM10 column after the sentinel values were replaced.
df.loc[:, 'PM10 ( µg/m3 )']

0        26.1
1        26.1
2        26.1
3        26.1
4        26.1
         ... 
81690    11.1
81691    17.4
81692    30.0
81693    18.6
81694    26.1
Name: PM10 ( µg/m3 ), Length: 81695, dtype: float32

## Sensor Locations

In [49]:
from math import cos, asin, sqrt, pi

# Distance between two coordinates taken from
# https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula
def coord_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere. The default (6371.0) reproduces the original
        hard-coded diameter of 12742, i.e. the mean Earth radius in km.

    Returns
    -------
    float
        Distance along the sphere's surface, in the same unit as `radius_km`.
    """
    p = pi/180  # degrees -> radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/asin raise "math domain error".
    # `a` is passed first to max/min so a NaN input still propagates as NaN.
    a = min(max(a, 0.0), 1.0)
    return 2 * radius_km * asin(sqrt(a))

In [50]:
# Load the sensor coordinate table (one row per sensor) and keep the row
# count around for sizing the matrices built from it.
sensor_l = pd.read_excel("sensor_locations.xlsx", engine='openpyxl', parse_dates=True)
n_sensors = len(sensor_l)
print (sensor_l)
print (sensor_l.shape)

          İlçe    Enlem   Boylam
0      aksaray  41.0244  29.0997
1     besiktas  41.0520  29.0094
2     buyukada  40.8521  29.1180
3  catladıkapı  41.0023  28.9751
4      esenler  41.0368  28.8880
5      kadıkoy  40.9908  29.0333
6     kandilli  41.0624  29.0582
7       kartal  40.9110  29.1830
8  mecidiyekoy  41.0659  28.9944
9     umraniye  41.0126  29.1618
(10, 3)


In [67]:
# Preparing the weighted adjacency matrix for the sensor nodes.

# Read the coordinates once, by column name. Looking each sensor up again by
# its 'İlçe' value inside the loops (and indexing .values by position) is
# slow, depends on column order, and returns the wrong row for duplicate names.
latitudes = sensor_l['Enlem'].tolist ()
longitudes = sensor_l['Boylam'].tolist ()

# Pairwise distance from every sensor (row) to every other sensor (column)
distance_matrix = np.zeros ((n_sensors, n_sensors))
for ix, (s_lat, s_lon) in enumerate (zip (latitudes, longitudes)):
    for o_ix, (o_lat, o_lon) in enumerate (zip (latitudes, longitudes)):
        distance_matrix [ix][o_ix] = coord_distance (s_lat, s_lon, o_lat, o_lon)

# Convert distances into weights: weight = (max_val - distance) / max_val,
# so a sensor paired with itself gets 1 and weights shrink with distance.
# max_val is the largest distance plus 1, so that even the farthest pair
# keeps a small non-zero weight instead of dropping to 0.
max_val = np.max (distance_matrix) + 1
distance_matrix = (max_val - distance_matrix)/max_val
distance_matrix

array([[1.        , 0.72246006, 0.34720973, 0.63525996, 0.39507831,
        0.77216223, 0.81407726, 0.51027434, 0.66159891, 0.81752721],
       [0.72246006, 1.        , 0.18399078, 0.78839024, 0.64948686,
        0.75905202, 0.85558757, 0.27301488, 0.93232307, 0.54105276],
       [0.34720973, 0.18399078, 1.        , 0.30142131, 0.04239443,
        0.42311837, 0.18768574, 0.71030258, 0.11898075, 0.38111595],
       [0.63525996, 0.78839024, 0.30142131, 1.        , 0.71969965,
        0.828519  , 0.67203794, 0.31409166, 0.75359924, 0.46650847],
       [0.39507831, 0.64948686, 0.04239443, 0.71969965, 1.        ,
        0.5509857 , 0.50571456, 0.03396331, 0.67766267, 0.21457354],
       [0.77216223, 0.75905202, 0.42311837, 0.828519  , 0.5509857 ,
        1.        , 0.72044807, 0.47736705, 0.6954975 , 0.62461969],
       [0.81407726, 0.85558757, 0.18768574, 0.67203794, 0.50571456,
        0.72044807, 1.        , 0.32657829, 0.8178543 , 0.65005373],
       [0.51027434, 0.27301488, 0.7103025