In [2]:
import os
import pandas as pd
import numpy as np

import locale
from locale import atof
import xlsxwriter

from PreProcessingUtil import preprocessing

In [77]:
# Load the raw hourly measurements of the Besiktas station and show them.
besiktas_path = os.path.join("../datasets", "pollutants", "besiktas.xlsx")
df = pd.read_excel(besiktas_path, engine='openpyxl', parse_dates=True, thousands='.')
print (df)

                    Tarih PM10 ( µg/m3 ) SO2 ( µg/m3 ) CO ( µg/m3 )  \
0     2012-01-01 01:00:56              -             -            -   
1     2012-01-01 02:00:56              -             -            -   
2     2012-01-01 03:00:56              -             -            -   
3     2012-01-01 04:00:56              -             -            -   
4     2012-01-01 05:00:56              -             -            -   
...                   ...            ...           ...          ...   
81690 2021-04-26 19:00:56          11,10          1,00       296,70   
81691 2021-04-26 20:00:56          17,40          0,80       254,10   
81692 2021-04-26 21:00:56          30,00          0,70       180,00   
81693 2021-04-26 22:00:56          18,60          1,00       399,00   
81694 2021-04-26 23:00:56              -          1,10       446,60   

      NO2 ( µg/m3 ) NOX ( µg/m3 ) O3 ( µg/m3 ) PM 2.5 ( µg/m3 )  
0                 -             -            -                -  
1              

In [78]:
# Missing measurements are stored as the string '-'. They are replaced by the
# sentinel string '-1' (not NaN) so the column can still be parsed as a number.
df = df.replace('-', '-1')

# Values are formatted with '.' as thousands separator and ',' as decimal mark.
# regex=False forces literal replacements, so '.' can never be read as the
# regex wildcard, whatever the default of the installed pandas version is.
pm10_text = df['PM10 ( µg/m3 )'].astype(str)
pm10_text = pm10_text.str.replace('.', '', regex=False).str.replace(',', '.', regex=False)

# Parse the normalised text; downcast="float" picks the smallest float dtype that fits.
df['PM10 ( µg/m3 )'] = pd.to_numeric(pm10_text, downcast="float")

In [79]:
# Per-column tally of the cells that still hold the '-1' missing-value marker
(df == '-1').sum()

Tarih                   0
PM10 ( µg/m3 )          0
SO2 ( µg/m3 )       61745
CO ( µg/m3 )        62154
NO2 ( µg/m3 )       63773
NOX ( µg/m3 )       67718
O3 ( µg/m3 )        62743
PM 2.5 ( µg/m3 )    67638
dtype: int64

In [80]:
# Strategy for the missing rows, following
# https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e
# 1- Rows cannot simply be dropped: the timestamps must stay consistent
# 2- A column with too many missing rows should be discarded
# 3- Otherwise the missing data is replaced with the mean/median
# 3.1- This ignores the covariance between features
pm10 = df['PM10 ( µg/m3 )']
pm10_median = pm10[pm10 != -1].median()
df['PM10 ( µg/m3 )'] = pm10.replace(-1.0, pm10_median)

In [81]:
# Inspect the PM10 column after the sentinel values were replaced
df.loc[:, 'PM10 ( µg/m3 )']

0        26.1
1        26.1
2        26.1
3        26.1
4        26.1
         ... 
81690    11.1
81691    17.4
81692    30.0
81693    18.6
81694    26.1
Name: PM10 ( µg/m3 ), Length: 81695, dtype: float32

## Town - Pollution Dataset

In [86]:
def _read_town(file_name):
    """Read one town's pollutant workbook from ../datasets/pollutants."""
    return pd.read_excel(
        os.path.join("../", "datasets", "pollutants", file_name),
        engine='openpyxl',
        parse_dates=True,
        thousands='.',
    )

# One raw dataframe per measurement station
dfAksaray = _read_town("aksaray.xlsx")
dfBesiktas = _read_town("besiktas.xlsx")
dfBuyukada = _read_town("buyukada.xlsx")
dfCatladıkapı = _read_town("catladıkapı.xlsx")
dfEsenler = _read_town("esenler.xlsx")
dfKadıkoy = _read_town("kadıkoy.xlsx")
dfKandilli = _read_town("kandilli.xlsx")
dfKartal = _read_town("kartal.xlsx")
dfMecidiyekoy = _read_town("mecidiyekoy.xlsx")
dfUmraniye = _read_town("umraniye.xlsx")

In [87]:
import os
import pandas as pd
import numpy as np

def fixValues(df, columns=None):
    """Convert the pollutant columns from formatted text to floats.

    Missing measurements, stored as the string '-', become the sentinel -1
    (they are NOT turned into NaN). Every value is then read as text with '.'
    as thousands separator and ',' as decimal mark, e.g. '1.234,50' -> 1234.5.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the pollutant columns. It is not modified;
        ``DataFrame.replace`` returns a new frame that is filled and returned.
    columns : iterable of str, optional
        Columns to convert. Defaults to the seven pollutant columns
        (PM10, SO2, CO, NO2, NOX, O3, PM 2.5).

    Returns
    -------
    pandas.DataFrame
        New frame whose selected columns are numeric (downcast floats).
    """
    if columns is None:
        columns = (
            'PM10 ( µg/m3 )',
            'SO2 ( µg/m3 )',
            'CO ( µg/m3 )',
            'NO2 ( µg/m3 )',
            'NOX ( µg/m3 )',
            'O3 ( µg/m3 )',
            'PM 2.5 ( µg/m3 )',
        )

    # '-' marks a missing measurement; use the string '-1' so it parses as a number.
    df = df.replace('-', '-1')

    for column in columns:
        text = df[column].astype(str)
        # regex=False: both replacements are literal, so '.' is never treated
        # as the regex wildcard regardless of the pandas default.
        text = text.str.replace('.', '', regex=False).str.replace(',', '.', regex=False)
        df[column] = pd.to_numeric(text, downcast="float")

    return df
    
def fillEmptyRows(df):
    """Fill the -1 (missing) entries of every pollutant column.

    Background on the strategy:
    https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e
    Rows cannot be dropped because the timestamps must stay consistent, and a
    plain mean/median fill ignores the covariance between features.

    A column without a single value different from -1 is set to 0 entirely.
    Afterwards every column is passed through fillWithSameHourValue.
    The frame is modified in place and also returned.
    """
    pollutant_columns = (
        'PM10 ( µg/m3 )',
        'SO2 ( µg/m3 )',
        'CO ( µg/m3 )',
        'NO2 ( µg/m3 )',
        'NOX ( µg/m3 )',
        'O3 ( µg/m3 )',
        'PM 2.5 ( µg/m3 )',
    )

    # First pass: blank out the columns that carry no measurement at all.
    for column in pollutant_columns:
        measured = df[column][df[column] != -1].count()
        if not (measured > 0):
            df[column] = 0

    # Second pass: fill the remaining gaps column by column.
    for column in pollutant_columns:
        df = fillWithSameHourValue(df, column)
    return df

def fillWithSameHourValue(df, pollutantColumnName):
    """Replace the -1 entries of one column with a value 24 rows away.

    For the first 24 rows the replacement is the first non-missing value found
    at the same offset in later 24-row steps (see findValue); for every other
    row it is the (already filled) value 24 rows earlier.

    The work is done on a plain array that is written back with a single
    column assignment. The previous chained form ``df[col].iloc[x] = value``
    raises SettingWithCopyWarning and does not write through to the frame when
    pandas Copy-on-Write is active.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    pollutantColumnName : str
        Name of the column to fill.

    Returns
    -------
    pandas.DataFrame
        The same frame object, for chaining.
    """
    values = df[pollutantColumnName].to_numpy(copy=True)
    for x in range(len(values)):
        if values[x] != -1:
            continue
        if x < 24:
            # Nothing earlier to copy from: look ahead in 24-row steps.
            values[x] = findValue(values, x)
        else:
            # Rows are processed in order, so position x-24 is already filled.
            values[x] = values[x - 24]
    df[pollutantColumnName] = values
    return df

def findValue(df, i):
    """Return the first value different from -1 at positions i, i+24, i+48, ...

    Parameters
    ----------
    df : pandas.Series or array-like
        One-dimensional sequence of measurements, addressed by position.
    i : int
        Position to start from.

    Returns
    -------
    The first value that is not -1, or 0 when the end of the data is reached.

    Implemented as a loop: the former recursive version needed one stack frame
    per 24-row step and could exceed the recursion limit on long series.
    """
    values = np.asarray(df)
    while i < values.shape[0]:
        if values[i] != -1:
            return values[i]
        i += 24
    return 0

def preprocessingx(df):
    """Run both cleaning stages: numeric conversion, then gap filling."""
    numeric_frame = fixValues(df)
    return fillEmptyRows(numeric_frame)

In [88]:
# Run the imported preprocessing (value fixing, gap filling) on every town
# frame, rebinding each variable to its cleaned version.
(
    dfAksaray,
    dfBesiktas,
    dfBuyukada,
    dfCatladıkapı,
    dfEsenler,
    dfKadıkoy,
    dfKandilli,
    dfKartal,
    dfMecidiyekoy,
    dfUmraniye,
) = [
    preprocessing(town_frame)
    for town_frame in (
        dfAksaray,
        dfBesiktas,
        dfBuyukada,
        dfCatladıkapı,
        dfEsenler,
        dfKadıkoy,
        dfKandilli,
        dfKartal,
        dfMecidiyekoy,
        dfUmraniye,
    )
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, val

In [91]:
# Settings: number of trailing samples kept per town, pollutant column, target file
lastNItems = 20000
columnName = 'NO2 ( µg/m3 )'
excelName = '../datasets/training/NO2.xlsx'

# Town label -> cleaned frame, in the column order of the exported sheet
town_frames = [
    ('Aksaray', dfAksaray),
    ('Besiktas', dfBesiktas),
    ('Buyukada', dfBuyukada),
    ('Catladıkapı', dfCatladıkapı),
    ('Esenler', dfEsenler),
    ('Kadıkoy', dfKadıkoy),
    ('Kandilli', dfKandilli),
    ('Kartal', dfKartal),
    ('Mecidiyekoy', dfMecidiyekoy),
    ('Umraniye', dfUmraniye),
]

# Last `lastNItems` values of the chosen pollutant for every town
d = {town: frame[columnName][-lastNItems:].tolist() for town, frame in town_frames}

# One column per town
dframe = pd.DataFrame(d, columns = [town for town, _ in town_frames])

# Show the table, then write it to the training workbook
print(dframe)
dframe.to_excel(excelName)

          Aksaray    Besiktas  Buyukada  Catladıkapı     Esenler     Kadıkoy  \
0       65.300003   92.400002         0    49.099998   97.400002   35.099998   
1       55.500000   86.699997         0    64.599998  100.300003   30.600000   
2       75.500000  108.400002         0    81.000000  106.599998   30.299999   
3       83.000000  133.500000         0    95.300003  119.000000   39.200001   
4       98.500000  129.000000         0   102.099998  128.199997   35.299999   
...           ...         ...       ...          ...         ...         ...   
19995  300.399994   15.900000         0    98.199997   64.000000   61.700001   
19996  322.500000   15.800000         0   125.099998   84.099998   74.300003   
19997  318.500000   18.900000         0    98.900002   99.199997   73.500000   
19998  317.700012   23.299999         0    73.900002  101.199997   98.900002   
19999  309.200012   22.100000         0    99.400002  109.400002  104.500000   

       Kandilli     Kartal  Mecidiyekoy

In [None]:
## Pollution Diff

In [None]:
# Town labels; these match the column names of `dframe`
sehirList = [
    'Aksaray',
    'Besiktas',
    'Buyukada',
    'Catladıkapı',
    'Esenler',
    'Kadıkoy',
    'Kandilli',
    'Kartal',
    'Mecidiyekoy',
    'Umraniye',
]

In [None]:
# Number of graph nodes: one per entry of sehirList
n_sensors = len(sehirList)

In [None]:
# Element-wise difference between two towns, as a quick sanity check
dframe['Aksaray'].sub(dframe['Besiktas'])

In [None]:
# Adjacency matrix for the sensor nodes, based on pollutant differences.
# Sized from sehirList itself so it does not depend on whatever value the
# global n_sensors currently holds.
pol_diff_matrix = np.zeros ((len (sehirList), len (sehirList)))

# Entry [ix, o_ix] is the absolute value of the mean difference between the
# two towns' series. The difference is kept in a local variable: writing it
# to dframe['diff'] would leave a scratch column behind in the exported frame.
for ix, s_sehir in enumerate (sehirList):
    for o_ix, o_sehir in enumerate (sehirList):
        difference = dframe[s_sehir] - dframe[o_sehir]
        pol_diff_matrix [ix][o_ix] = abs(difference.mean())

# Turn the differences into weights: (max - value) / max.
# 1 is added to the maximum so that the largest difference still gets a
# non-zero weight.
max_val = np.max (pol_diff_matrix) + 1
pol_diff_matrix = (max_val - pol_diff_matrix)/max_val
pol_diff_matrix

In [None]:
# Export to Excel. The path starts with '../' like every other dataset path
# in this notebook; without it the file would be looked up relative to the
# notebook's own folder.
workbook = xlsxwriter.Workbook('../datasets/training/pol_diff_adj.xlsx')
worksheet = workbook.add_worksheet()

row = 0

# Each row of the matrix is written downwards into spreadsheet column `col`.
for col, data in enumerate(pol_diff_matrix):
    worksheet.write_column(row, col, data)

workbook.close()

## Sensor Locations

In [3]:
from math import cos, asin, sqrt, pi

# Distance between two coordinates taken from
# https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula
def coord_distance(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two points.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float
        Latitude and longitude of the two points, in degrees.

    Returns
    -------
    float
        Distance in kilometres (12742 = 2 * R, with R = 6371 km).
    """
    p = pi/180  # degrees -> radians
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return 12742 * asin(sqrt(a)) #2*R*asin...

In [4]:
# Sensor coordinates: one row per station (columns İlçe, Enlem, Boylam)
sensor_l = pd.read_excel(
    "../datasets/adjacency/sensor_locations.xlsx", engine='openpyxl', parse_dates=True
)
n_sensors = len(sensor_l)
print (sensor_l)
print (sensor_l.shape)

          İlçe    Enlem   Boylam
0      aksaray  41.0244  29.0997
1     besiktas  41.0520  29.0094
2     buyukada  40.8521  29.1180
3  catladıkapı  41.0023  28.9751
4      esenler  41.0368  28.8880
5      kadıkoy  40.9908  29.0333
6     kandilli  41.0624  29.0582
7       kartal  40.9110  29.1830
8  mecidiyekoy  41.0659  28.9944
9     umraniye  41.0126  29.1618
(10, 3)


In [9]:
# Adjacency matrix for the sensor nodes, based on geographic distance.
# Coordinates are read once, by column name ('Enlem' = latitude,
# 'Boylam' = longitude), instead of filtering the frame by district name and
# indexing positionally for every pair. Each station therefore always uses
# its own row, even if a district name were to appear twice.
sensor_lats = sensor_l['Enlem'].to_numpy()
sensor_lons = sensor_l['Boylam'].to_numpy()
n_stations = len(sensor_l)

distance_matrix = np.zeros ((n_stations, n_stations))

# Pairwise distance from each sensor to every other one
for ix in range (n_stations):
    for o_ix in range (n_stations):
        distance_matrix [ix][o_ix] = coord_distance (
            sensor_lats [ix], sensor_lons [ix], sensor_lats [o_ix], sensor_lons [o_ix]
        )

# Turn the distances into weights: (max - value) / max.
# 1 is added to the maximum so that the farthest pair still gets a non-zero weight.
max_val = np.max (distance_matrix) + 1
distance_matrix = (max_val - distance_matrix)/max_val
distance_matrix

array([[1.        , 0.72246006, 0.34720973, 0.63525996, 0.39507831,
        0.77216223, 0.81407726, 0.51027434, 0.66159891, 0.81752721],
       [0.72246006, 1.        , 0.18399078, 0.78839024, 0.64948686,
        0.75905202, 0.85558757, 0.27301488, 0.93232307, 0.54105276],
       [0.34720973, 0.18399078, 1.        , 0.30142131, 0.04239443,
        0.42311837, 0.18768574, 0.71030258, 0.11898075, 0.38111595],
       [0.63525996, 0.78839024, 0.30142131, 1.        , 0.71969965,
        0.828519  , 0.67203794, 0.31409166, 0.75359924, 0.46650847],
       [0.39507831, 0.64948686, 0.04239443, 0.71969965, 1.        ,
        0.5509857 , 0.50571456, 0.03396331, 0.67766267, 0.21457354],
       [0.77216223, 0.75905202, 0.42311837, 0.828519  , 0.5509857 ,
        1.        , 0.72044807, 0.47736705, 0.6954975 , 0.62461969],
       [0.81407726, 0.85558757, 0.18768574, 0.67203794, 0.50571456,
        0.72044807, 1.        , 0.32657829, 0.8178543 , 0.65005373],
       [0.51027434, 0.27301488, 0.7103025

In [None]:
# Export to Excel. The path starts with '../' like every other dataset path
# in this notebook; without it the file would be looked up relative to the
# notebook's own folder.
workbook = xlsxwriter.Workbook('../datasets/training/sensor_dist_adj.xlsx')
worksheet = workbook.add_worksheet()

row = 0

# Each row of the matrix is written downwards into spreadsheet column `col`.
for col, data in enumerate(distance_matrix):
    worksheet.write_column(row, col, data)

workbook.close()

## Population

In [None]:
# Preview of the population sheet.
# It is stored in its own variable: assigning it to `sensor_l` / `n_sensors`
# would overwrite the sensor coordinates and node count that the gas, waste
# facility and park cells further down still rely on.
# The path uses the '../' prefix like the other dataset paths in this notebook.
population_pd = pd.read_excel(
     os.path.join("../datasets/adjacency/population.xlsx"),
     engine='openpyxl',
     parse_dates=True,
)
print (population_pd)
print (population_pd.shape)

## Randomized Data

In [14]:
# Create a uniform random matrix with values in [0, 1).
# A seeded generator makes the exported baseline identical on every run.
n_cities = 10
rng = np.random.default_rng (42)
randomized_matrix = rng.random ((n_cities, n_cities))

# The diagonal is 1: each node compared with itself
np.fill_diagonal(randomized_matrix, 1)

# Export to Excel; each matrix row is written downwards into one column
workbook = xlsxwriter.Workbook('../datasets/training/random_dist_adj.xlsx')
worksheet = workbook.add_worksheet()

row = 0

for col, data in enumerate(randomized_matrix):
    worksheet.write_column(row, col, data)

workbook.close()

## Gas Usage For Year

In [55]:
# Natural gas consumption per district (NO2 and SO2 are the pollutants that
# matter for natural gas usage)
gas_pd = pd.read_excel(
    "../datasets/adjacency/natural_gas_consumption.xlsx",
    sheet_name='Tüketim verileri(m3)',
    engine='openpyxl',
    parse_dates=True,
)

In [72]:
# District list, one entry per sensor node ('FATİH' appears twice)
sehirList = ['FATİH', 'BEŞİKTAŞ','ADALAR','FATİH','ESENLER','KADIKÖY','ÜSKÜDAR','KARTAL','ŞİŞLİ','ÜMRANİYE']

# 2019 consumption of every listed district, keyed by district name
district_gas_dict = {
    district: gas_pd.loc[gas_pd['İlçe'] == district, 2019].values[0]
    for district in gas_pd['İlçe']
    if district in sehirList
}

# Consumption per node in sehirList order, scaled by the largest value
final_dict = np.asarray ([district_gas_dict [district] for district in sehirList])
final_dict = final_dict / final_dict.max ()

# Adjacency matrix for the sensor nodes: absolute difference between the
# scaled consumptions, with 1 on the diagonal. It is sized from sehirList
# rather than the global n_sensors, which other cells reassign.
gas_matrix = np.zeros ((len (sehirList), len (sehirList)))
gas_matrix [:] = np.abs (final_dict [:, None] - final_dict [None, :])
np.fill_diagonal (gas_matrix, 1)

In [73]:
# Write the gas adjacency matrix to its workbook; matrix row `col` is written
# downwards into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/gas_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(gas_matrix)):
    worksheet.write_column(first_row, col, gas_matrix[col])
workbook.close()

## Waste Processing Locations

In [5]:
# Waste facility table (the code below reads its LATITUDE / LONGTITUDE columns)
waste_pd = pd.read_excel(
    "../datasets/adjacency/waste_facility.xlsx", engine='openpyxl', parse_dates=True
)

In [37]:
# For every sensor in sensor_l, add up its distance to all waste facilities
waste_dist = []
for station in sensor_l ['İlçe']:
    # Coordinates of the current sensor (first row matching the district name)
    station_row = sensor_l [sensor_l ['İlçe'] == station]
    sensor_lat = station_row ['Enlem'].values [0]
    sensor_lon = station_row ['Boylam'].values [0]
    # Total distance to every facility
    total_distance = sum (
        coord_distance (sensor_lat, sensor_lon, facility ['LATITUDE'], facility ['LONGTITUDE'])
        for _, facility in waste_pd.iterrows ()
    )
    waste_dist.append (total_distance)

# A larger total distance is taken to mean less pollution from the
# facilities, so the reciprocal is used and scaled by its maximum
waste_dist = 1 / np.asarray (waste_dist)
waste_dist = waste_dist / waste_dist.max ()

# Absolute difference between the scaled scores, with 1 on the diagonal
waste_adj = np.zeros ((len (waste_dist), len (waste_dist)))
waste_adj [:] = np.abs (waste_dist [:, None] - waste_dist [None, :])
np.fill_diagonal (waste_adj, 1)

In [38]:
# Write the waste facility adjacency matrix; matrix row `col` is written
# downwards into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/waste_facilities_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(waste_adj)):
    worksheet.write_column(first_row, col, waste_adj[col])
workbook.close()

## Park Counts

In [5]:
# Park locations (name, coordinates, neighbourhood and county of each park)
park_pd = pd.read_excel(
    "../datasets/adjacency/park_location.xlsx", engine='openpyxl', parse_dates=True
)
park_pd

Unnamed: 0,NAME,LONGITUDE,LATITUDE,NEIGHBORHOOD_NAME,COUNTY_NAME
0,Yayla Sokak Parkı,28.925082,41.081198,AKŞEMSETTİN,EYÜPSULTAN
1,Stad Parkı,28.914355,40.994647,TELSİZ,ZEYTİNBURNU
2,Kadriye Gök Parkı,28.899872,40.990061,NURİPAŞA,ZEYTİNBURNU
3,Çukurbostan Parkı,28.933638,41.011341,ŞEHREMİNİ,FATİH
4,Tutya Parkı,28.937360,41.009753,SEYYİD ÖMER,FATİH
...,...,...,...,...,...
3622,1415. Sok . Parkı,28.895567,41.104201,GAZİ,SULTANGAZİ
3623,Yedikule Surları Parkı,28.922778,40.993955,YEDİKULE,FATİH
3624,Bağlarbaşı Çocuk Parkı,29.035293,41.024379,SELAMİ ALİ,ÜSKÜDAR
3625,Park,29.027359,41.085210,AKAT,BEŞİKTAŞ


In [25]:
# For every sensor in sensor_l, count the parks closer than 2.5
# (same unit as coord_distance returns)
park_dist = []
for station in sensor_l ['İlçe']:
    # Coordinates of the current sensor (first row matching the district name)
    station_row = sensor_l [sensor_l ['İlçe'] == station]
    sensor_lat = station_row ['Enlem'].values [0]
    sensor_lon = station_row ['Boylam'].values [0]
    # Number of parks inside the radius
    close_park_count = sum (
        1
        for _, park_row in park_pd.iterrows ()
        if coord_distance (sensor_lat, sensor_lon, park_row ['LATITUDE'], park_row ['LONGITUDE']) < 2.5
    )
    park_dist.append (close_park_count)

# The park count is treated as inversely related to pollution, so the
# reciprocal is used and scaled by its maximum.
# NOTE(review): a sensor with zero nearby parks gives a division by zero
# (inf, then NaN after scaling) - confirm this cannot happen with the data.
park_dist = 1 / np.asarray (park_dist)
park_dist = park_dist / park_dist.max ()

# Absolute difference between the scaled scores, with 1 on the diagonal
park_adj = np.zeros ((len (park_dist), len (park_dist)))
park_adj [:] = np.abs (park_dist [:, None] - park_dist [None, :])
np.fill_diagonal (park_adj, 1)
                 

In [26]:
# Write the park adjacency matrix; matrix row `col` is written downwards
# into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/park_location_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(park_adj)):
    worksheet.write_column(first_row, col, park_adj[col])
workbook.close()

## Amount of Waste

In [32]:
# Amount of waste per district
waste_amount_pd = pd.read_excel(
    "../datasets/adjacency/amount_of_waste.xlsx", engine='openpyxl', parse_dates=True
)

In [38]:
# District list, one entry per sensor node ('Fatih' appears twice)
sehirList = ['Fatih', 'Beşiktaş','Adalar','Fatih','Esenler','Kadıköy','Üsküdar','Kartal','Şişli','Ümraniye']

# 2020 waste amount of every listed district, keyed by district name
district_column = waste_amount_pd['İlçe (Disticts)']
waste_amount_dict = {
    name: waste_amount_pd.loc[district_column == name, '2020'].values[0]
    for name in district_column
    if name in sehirList
}

# Amount per node in sehirList order, scaled by the largest value
final_dict = np.asarray ([waste_amount_dict [name] for name in sehirList])
final_dict = final_dict / final_dict.max ()

# Adjacency matrix: absolute difference between the scaled amounts, with 1
# on the diagonal
waste_amount_matrix = np.zeros ((len (sehirList), len (sehirList)))
waste_amount_matrix [:] = np.abs (final_dict [:, None] - final_dict [None, :])
np.fill_diagonal (waste_amount_matrix, 1)

In [40]:
# Write the waste amount adjacency matrix; matrix row `col` is written
# downwards into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/waste_amount_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(waste_amount_matrix)):
    worksheet.write_column(first_row, col, waste_amount_matrix[col])
workbook.close()

## Population

In [2]:
# Population per district
population_pd = pd.read_excel(
    "../datasets/adjacency/population.xlsx", engine='openpyxl', parse_dates=True
)

In [3]:
# District list, one entry per sensor node ('Fatih' appears twice)
sehirList = ['Fatih', 'Beşiktaş','Adalar','Fatih','Esenler','Kadıköy','Üsküdar','Kartal','Şişli','Ümraniye']

# Population of every listed district, keyed by district name
district_column = population_pd ['İlçe']
adj_dict = {
    name: population_pd.loc[district_column == name, 'İlçe Nüfusu'].values[0]
    for name in district_column
    if name in sehirList
}

# Population per node in sehirList order, scaled by the largest value
final_dict = np.asarray ([adj_dict [name] for name in sehirList])
final_dict = final_dict / final_dict.max ()

# Adjacency matrix: absolute difference between the scaled populations,
# with 1 on the diagonal
adj_matrix = np.zeros ((len (sehirList), len (sehirList)))
adj_matrix [:] = np.abs (final_dict [:, None] - final_dict [None, :])
np.fill_diagonal (adj_matrix, 1)

In [4]:
# Write the population adjacency matrix; matrix row `col` is written
# downwards into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/population_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(adj_matrix)):
    worksheet.write_column(first_row, col, adj_matrix[col])
workbook.close()

## Readers and Writers Percentage

In [9]:
# Literate / illiterate head counts per district (2019)
writer_pd = pd.read_excel(
    "../datasets/adjacency/2019-yl-ilce-bazl-okuma-yazma-bilen-bilmeyen-kii-says.xlsx",
    engine='openpyxl',
    parse_dates=True,
)

In [10]:
# District list, one entry per sensor node ('Fatih' appears twice)
sehirList = ['Fatih', 'Beşiktaş','Adalar','Fatih','Esenler','Kadıköy','Üsküdar','Kartal','Şişli','Ümraniye']

# Share of people who cannot read and write, keyed by district name
adj_dict = {}
district_column = writer_pd ['İlçeler']
for name in district_column:
    if name not in sehirList:
        continue
    is_district = district_column == name
    non_writers = writer_pd.loc[is_district, ' Okuma Yazma Bilmeyen'].values [0]
    writers = writer_pd.loc[is_district, 'Okuma Yazma Bilen'].values [0]
    adj_dict [name] = (1.0 * non_writers) / (non_writers + writers)

# Share per node in sehirList order (already a ratio, so no rescaling)
final_dict = np.asarray ([adj_dict [name] for name in sehirList])

# Adjacency matrix: absolute difference between the shares, with 1 on the diagonal
adj_matrix = np.zeros ((len (sehirList), len (sehirList)))
adj_matrix [:] = np.abs (final_dict [:, None] - final_dict [None, :])
np.fill_diagonal (adj_matrix, 1)

In [11]:
# Write the literacy adjacency matrix; matrix row `col` is written downwards
# into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/non_writer_percentage_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(adj_matrix)):
    worksheet.write_column(first_row, col, adj_matrix[col])
workbook.close()

## Building Counts

In [19]:
# Building counts per district, read from the 'Sum' sheet
data_pd = pd.read_excel(
    "../datasets/adjacency/building_counts_2017.xlsx",
    sheet_name='Sum',
    engine='openpyxl',
    parse_dates=True,
)
data_pd

Unnamed: 0,İlçe,Total
0,Adalar,6393.0
1,Arnavutköy,31941.0
2,Ataşehir,27583.0
3,Avcılar,26762.0
4,Bağcılar,42439.0
...,...,...
832,,
833,,
834,,
835,,


In [26]:
# District list, one entry per sensor node ('Fatih' appears twice)
sehirList = ['Fatih', 'Beşiktaş','Adalar','Fatih','Esenler','Kadıköy','Üsküdar','Kartal','Şişli','Ümraniye']

# Building total of every listed district, keyed by district name
district_column = data_pd ['İlçe']
adj_dict = {
    name: data_pd.loc[district_column == name, 'Total'].values [0]
    for name in district_column
    if name in sehirList
}

# Total per node in sehirList order, scaled by the largest value
final_dict = np.asarray ([adj_dict [name] for name in sehirList])
final_dict = final_dict / np.max (final_dict)

# Adjacency matrix: absolute difference between the scaled totals, with 1
# on the diagonal
adj_matrix = np.zeros ((len (sehirList), len (sehirList)))
adj_matrix [:] = np.abs (final_dict [:, None] - final_dict [None, :])
np.fill_diagonal (adj_matrix, 1)
         

In [27]:
# Write the building count adjacency matrix; matrix row `col` is written
# downwards into spreadsheet column `col`
workbook = xlsxwriter.Workbook('../datasets/training/neighbor_adj.xlsx')
worksheet = workbook.add_worksheet()
first_row = 0
for col in range(len(adj_matrix)):
    worksheet.write_column(first_row, col, adj_matrix[col])
workbook.close()