# Data Dictionary
#### DATE_TIME 
It is the field that contains date and time information. Data format YYYY-MM-DD HH24: MI: SS format. The date break is hourly.
#### OBSERVATORY_NAME
It indicates the type number of the sensor where the measurement is made.
#### AVERAGE_TEMPERATURE
Average temperature (& ordm; C) measured from the respective sensor at the given hour.
#### AVERAGE_HUMIDITY
Average wind speed (km / h) measured from the relevant sensor for the given hour.
#### AVERAGE_WIND
Average wind direction (km / h) measured from the respective sensor at the given hour.
#### AVERAGE_DIRECTIONOFWIND
Average precipitation amount (kg / m²) measured from the relevant sensor in the given hour.
#### AVERAGE_PRECIPITATION
Average amount of precipitation (kg / m²) measured from the relevant sensor in the given hour.
#### AVERAGE_ROAD_TEMPERATURE
Average road temperature (& ordm; C) measured from the respective sensor at the given hour.

In [1]:
import os
import pandas as pd
import numpy as np

import locale
from locale import atof
import xlsxwriter

from PreProcessingUtil import preprocessing

In [2]:
# Select rows in a DataFrame between two dates
def selectedDateFrame(dfPol, dfMet):
    dfPol = dfPol[(startDate<=dfPol['Tarih']) & (dfPol['Tarih']<endDate)]
    dfMet = dfMet[(startDate<=dfMet['DATE_TIME']) & (dfMet['DATE_TIME']<endDate)]
    return dfPol, dfMet

# Function to fill missing rows
def fillingEmptyRows(dfTrue,dfFalse):
    for x in range (dfTrue.shape[0]):
        if not (str(dfTrue['Tarih'].iloc[x])[:13] == str(dfFalse['DATE_TIME'].iloc[x])[:13]):
            temp = dfFalse.iloc[x-1] # get previous value
            temp['DATE_TIME'] = dfTrue['Tarih'].iloc[x] # set true date
            dfFalse = Insert_row(x, dfFalse, temp) # insert missing value        
    return dfTrue, dfFalse

# Function to insert row in the dataframe
def Insert_row(row_number, df, row_value):
    # Starting value of upper half
    start_upper = 0
   
    # End value of upper half
    end_upper = row_number
   
    # Start value of lower half
    start_lower = row_number
   
    # End value of lower half
    end_lower = df.shape[0]
   
    # Create a list of upper_half index
    upper_half = [*range(start_upper, end_upper, 1)]
   
    # Create a list of lower_half index
    lower_half = [*range(start_lower, end_lower, 1)]
   
    # Increment the value of lower half by 1
    lower_half = [x.__add__(1) for x in lower_half]
   
    # Combine the two lists
    index_ = upper_half + lower_half
   
    # Update the index of the dataframe
    df.index = index_
   
    # Insert a row at the end
    df.loc[row_number] = row_value
    
    # Sort the index labels
    df = df.sort_index()
   
    # return the dataframe
    return df

In [3]:
# creating dataframes per town
df1  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202001.csv"))
df2  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202002.csv"))
df3  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202003.csv"))
df4  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202004.csv"))
df5  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202005.csv"))
df6  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202006.csv"))
df7  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202007.csv"))
df8  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202008.csv"))
df9  = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202009.csv"))
df10 = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202010.csv"))
df11 = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202011.csv"))
df12 = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202012.csv"))
df13 = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202101.csv"))
df14 = pd.read_csv(os.path.join("../", "datasets", "meteorology_observation", "meteorology_observation_202102.csv"))

In [4]:
# concat all dataframes
bigdata = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14], ignore_index=True, sort=False)

In [5]:
# distinct town names
bigdata['OBSERVATORY_NAME'].unique()

array(['BEYKOZ_ANADOLU_FENERI', 'ISTOC_MAHMUTBEY',
       'GUNGOREN_DAVUTPASA_MARMARA', 'KILYOS', 'SILE_DARLIK', 'PASAKOY',
       'SANCAKTEPE_MGM', 'ESENLER', 'HACIOSMAN_SARIYER', 'SILE_ISAKOY',
       'DURUSU', 'SUBASI', 'CAMLICA_TUNEL_K', 'BAYRAMOGLU_TUZLA',
       'KAMILOBA', 'CAMLICA_LIBADIYE', 'SARIYER_ITU_MASLAK', 'CATALCA',
       'BUYUKADA', 'SABIHAGOKCEN', 'EMINONU', 'OLIMPIYAT', 'G_O_PASA',
       'BESIKTAS_YILDIZ', 'SILIVRI_ORMAN_SAHASI', 'USKUDAR_MGM', 'SILE',
       'RIVA_TUNEL_G', 'BUYUKCEKMECE_MGM', 'KARTAL_AYDOS_DAGI',
       'UMRANIYE', 'ODAYERI', 'ARNAVUTKOY_MGM', 'HADIMKOY', 'BEYKOZ',
       'SUREYYAPASA', 'SAMANDIRA', 'SARIYER', 'AKOM', 'CATALCA_MGM',
       'B_CEKMECE_SVIRAJLARI', 'BAHCESEHIR_I_KULE', 'SILIVRI_MGM',
       'TERKOS', 'CAVUSBASI', 'CANTA', 'TERKOS_BARAJI', 'PENDIK',
       'CEKMEKOY_OMERLI_MGM', 'SISLİ_MGM', 'KARTAL', 'CIFTALAN', 'SILE_2',
       'KADIKOY_GOZTEPE_MGM', 'GOZTEPE', 'BEYLİKDUZU_MGM', 'YSS_KOPRUSU',
       'BEYKOZ_MGM', 'AHL_BAKIRKOY', 

In [6]:
# selected columns
reducedData = bigdata[["DATE_TIME", "OBSERVATORY_NAME", "AVERAGE_TEMPERATURE", "AVERAGE_HUMIDITY", "AVERAGE_WIND", "AVERAGE_DIRECTIONOFWIND", "AVERAGE_PRECIPITATION", "AVERAGE_ROAD_TEMPERATURE"]]

In [7]:
# town names different in meteorology and pollutant datasets, so find and give the convenient
polTownName = 'kartal'
metTownName = 'KARTAL'

In [8]:
# all pollutant values according to a particular town (sorted by date time)
polPath = polTownName + '.xlsx'
dfPollutant = pd.read_excel(os.path.join("../", "datasets", "pollutants", polPath),engine='openpyxl',parse_dates=True,thousands='.')

In [9]:
# all meteorology values according to a particular town (sorted by date time)
dfMeteorology = reducedData[reducedData['OBSERVATORY_NAME']==metTownName].sort_values('DATE_TIME')

In [10]:
# determining dates
startDate = '2020-01-01 00:00:00'
endDate = '2021-02-21 00:00:00'

In [11]:
# pooling selected rows according to determined date time
dfPol, dfMet = selectedDateFrame(dfPollutant, dfMeteorology)

In [12]:
# filling the missing rows with previous row value
dfPol, dfMet = fillingEmptyRows(dfPol, dfMet)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
# making preprocessing (fixing values, filling empty rows)
dfPol = preprocessing(dfPol)

In [14]:
dfPol

Unnamed: 0,Tarih,PM10 ( µg/m3 ),SO2 ( µg/m3 ),CO ( µg/m3 ),NO2 ( µg/m3 ),NOX ( µg/m3 ),O3 ( µg/m3 ),PM 2.5 ( µg/m3 )
8759,2020-01-01 00:00:56,38.299999,4.9,1658.199951,53.700001,135.199997,61.299999,24.000000
8760,2020-01-01 01:00:56,36.599998,5.1,1709.500000,62.000000,239.199997,171.199997,31.000000
8761,2020-01-01 02:00:56,48.500000,4.6,1495.000000,48.900002,84.800003,70.800003,24.400000
8762,2020-01-01 03:00:56,43.000000,4.5,1298.599976,31.400000,39.599998,137.000000,32.900002
8763,2020-01-01 04:00:56,34.299999,3.6,1183.599976,33.200001,41.099998,145.199997,23.000000
...,...,...,...,...,...,...,...,...
18762,2021-02-20 19:00:56,40.599998,3.0,729.299988,71.400002,20.000000,20.900000,12.300000
18763,2021-02-20 20:00:56,34.799999,2.9,928.599976,91.599998,19.600000,10.200000,21.600000
18764,2021-02-20 21:00:56,25.299999,3.3,812.200012,98.599998,19.400000,18.700001,282.299988
18765,2021-02-20 22:00:56,26.900000,3.5,616.099976,102.099998,25.900000,44.799999,187.199997


In [15]:
# reset index values
dfMet = dfMet.reset_index(drop=True)
dfPol = dfPol.reset_index(drop=True)

In [16]:
# concat dataframes
result = pd.concat([dfPol, dfMet], axis=1, join='inner')
display(result)

Unnamed: 0,Tarih,PM10 ( µg/m3 ),SO2 ( µg/m3 ),CO ( µg/m3 ),NO2 ( µg/m3 ),NOX ( µg/m3 ),O3 ( µg/m3 ),PM 2.5 ( µg/m3 ),DATE_TIME,OBSERVATORY_NAME,AVERAGE_TEMPERATURE,AVERAGE_HUMIDITY,AVERAGE_WIND,AVERAGE_DIRECTIONOFWIND,AVERAGE_PRECIPITATION,AVERAGE_ROAD_TEMPERATURE
0,2020-01-01 00:00:56,38.299999,4.9,1658.199951,53.700001,135.199997,61.299999,24.000000,2020-01-01 00:00:00,KARTAL,7.91071,72.82143,2.12679,302.69643,-99.0,-99.0
1,2020-01-01 01:00:56,36.599998,5.1,1709.500000,62.000000,239.199997,171.199997,31.000000,2020-01-01 01:00:00,KARTAL,7.65000,74.05172,2.24138,307.05172,-99.0,-99.0
2,2020-01-01 02:00:56,48.500000,4.6,1495.000000,48.900002,84.800003,70.800003,24.400000,2020-01-01 02:00:00,KARTAL,7.22295,70.57377,2.38033,320.54098,-99.0,-99.0
3,2020-01-01 03:00:56,43.000000,4.5,1298.599976,31.400000,39.599998,137.000000,32.900002,2020-01-01 03:00:00,KARTAL,6.66607,58.62500,1.23571,313.35714,-99.0,-99.0
4,2020-01-01 04:00:56,34.299999,3.6,1183.599976,33.200001,41.099998,145.199997,23.000000,2020-01-01 04:00:00,KARTAL,6.52500,60.05000,1.17167,277.20000,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10003,2021-02-20 19:00:56,40.599998,3.0,729.299988,71.400002,20.000000,20.900000,12.300000,2021-02-20 19:00:00,KARTAL,2.89167,57.66667,1.37708,118.62500,-99.0,-99.0
10004,2021-02-20 20:00:56,34.799999,2.9,928.599976,91.599998,19.600000,10.200000,21.600000,2021-02-20 20:00:00,KARTAL,2.75714,58.30357,1.49464,94.91071,-99.0,-99.0
10005,2021-02-20 21:00:56,25.299999,3.3,812.200012,98.599998,19.400000,18.700001,282.299988,2021-02-20 21:00:00,KARTAL,2.82321,60.17857,2.14643,86.00000,-99.0,-99.0
10006,2021-02-20 22:00:56,26.900000,3.5,616.099976,102.099998,25.900000,44.799999,187.199997,2021-02-20 22:00:00,KARTAL,2.82407,61.22222,1.99074,87.48148,-99.0,-99.0


In [17]:
# create excel writer object
path = '../datasets/training/'+polTownName+'_combined.xlsx'
writer = pd.ExcelWriter(path)
# write dataframe to excel
result.to_excel(writer)
# save the excel
writer.save()