# Loading data and converting to dataframe

# Load data from .xlsx file

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

## Solar intensity data

In [22]:
#Load initial data
hourly_sun_intensity = pd.read_excel('2-10_21_524-2 Andmed.xlsx', sheet_name = 'tunni sum.kiirgus', header = 1)

In [23]:
hourly_sun_intensity.head()

Unnamed: 0,Aasta,Kuu,Päaev,Kell (UTC),"Haapsalu summaarne kiirgus, W/m²","Tallinn-Harku summaarne kiirgus, W/m²","Narva summaarne kiirgus, W/m²","Narva-Jõesuu summaarne kiirgus, W/m²","Pärnu summaarne kiirgus, W/m²","Pärnu-Sauga summaarne kiirgus, W/m²","Roomassaare summaarne kiirgus, W/m²","Tartu-Tõravere summaarne kiirgus, W/m²","Tiirikoja summaarne kiirgus, W/m²","Vilsandi summaarne kiirgus, W/m²"
0,2010,1,1,00:00:00,,0,,0.0,,0.0,0.0,0,0.0,0.0
1,2010,1,1,01:00:00,,0,,1.0,,1.0,0.0,0,0.0,0.0
2,2010,1,1,02:00:00,,0,,0.0,,1.0,0.0,0,0.0,0.0
3,2010,1,1,03:00:00,,0,,0.0,,1.0,0.0,0,0.0,0.0
4,2010,1,1,04:00:00,,0,,0.0,,0.0,0.0,0,0.0,0.0


In [24]:
#Update column names by shortening them and converting to English
newColumnNames = dict()
newColumnNames["Aasta"] = "y"
newColumnNames["Kuu"] = "m"
newColumnNames["Päaev"] = "d"
newColumnNames["Kell (UTC)"] = "time"
for columnName in hourly_sun_intensity.columns:
    if "kiirgus" in columnName:
        newColumnNames[columnName] = "solar_"+columnName.replace(" summaarne kiirgus, W/m²", "")
#newColumnNames = ["y", "m", "d", "time"]+["solar_"+columnName.replace(" summaarne kiirgus, W/m²", "") for columnName in hourly_sun_intensity.columns if "kiirgus" in columnName]
hourly_sun_intensity = hourly_sun_intensity.rename(columns=newColumnNames)
#hourly_sun_intensity.columns = newColumnNames

In [25]:
hourly_sun_intensity.head()

Unnamed: 0,y,m,d,time,solar_Haapsalu,solar_Tallinn-Harku,solar_Narva,solar_Narva-Jõesuu,solar_Pärnu,solar_Pärnu-Sauga,solar_Roomassaare,solar_Tartu-Tõravere,solar_Tiirikoja,solar_Vilsandi
0,2010,1,1,00:00:00,,0,,0.0,,0.0,0.0,0,0.0,0.0
1,2010,1,1,01:00:00,,0,,1.0,,1.0,0.0,0,0.0,0.0
2,2010,1,1,02:00:00,,0,,0.0,,1.0,0.0,0,0.0,0.0
3,2010,1,1,03:00:00,,0,,0.0,,1.0,0.0,0,0.0,0.0
4,2010,1,1,04:00:00,,0,,0.0,,0.0,0.0,0,0.0,0.0


In [26]:
# Some weather stations have changed locations over time, as the differences between their locations are rather small (less than 8 km)
# We at first do not make separation between them

def join_columns(c1, c2, nc, df, column_id): # Function for joining columns, where an area has two weather measuring points
    data = []
    cs = [c1, c2]
    for i, rows in df[cs].iterrows():
        if (pd.isna(rows[0]) == True) & (pd.isna(rows[1]) == False):
            data.append(round(rows[1], 2))
        elif (pd.isna(rows[0]) == False) & (pd.isna(rows[1]) == True):
            data.append(round(rows[0], 2))
        elif (pd.isna(rows[0]) == False) & (pd.isna(rows[1]) == False):
            data.append(round(rows.mean(), 2))
        elif (pd.isna(rows[0]) == True) & (pd.isna(rows[1]) == True):
            data.append(rows[0])

    df = df.drop(columns = [c1, c2])
    df.insert(column_id, nc, data)
    
    return df

In [27]:
#Merge columns, which are due to weather station moving
hourly_sun_intensity = join_columns('solar_Narva', 'solar_Narva-Jõesuu', 'solar_Narva', hourly_sun_intensity, 4)
hourly_sun_intensity = join_columns('solar_Pärnu-Sauga', 'solar_Pärnu', 'solar_Pärnu', hourly_sun_intensity, 5)

In [29]:
hourly_sun_intensity.head()

Unnamed: 0,y,m,d,time,solar_Narva,solar_Pärnu,solar_Haapsalu,solar_Tallinn-Harku,solar_Roomassaare,solar_Tartu-Tõravere,solar_Tiirikoja,solar_Vilsandi
0,2010,1,1,00:00:00,0.0,0.0,,0,0.0,0,0.0,0.0
1,2010,1,1,01:00:00,1.0,1.0,,0,0.0,0,0.0,0.0
2,2010,1,1,02:00:00,0.0,1.0,,0,0.0,0,0.0,0.0
3,2010,1,1,03:00:00,0.0,1.0,,0,0.0,0,0.0,0.0
4,2010,1,1,04:00:00,0.0,0.0,,0,0.0,0,0.0,0.0


In [116]:
hourly_sun_intensity.time.iloc[0]

datetime.time(12, 0)

In [32]:
#Drop rows where some value is missing
hourly_sun_intensity = hourly_sun_intensity.dropna()
#If value is -1 it corresponds to night, set it to 0
hourly_sun_intensity = hourly_sun_intensity.replace(-1, 0)

In [36]:
hourly_sun_intensity.describe()

Unnamed: 0,y,m,d,solar_Narva,solar_Pärnu,solar_Haapsalu,solar_Tallinn-Harku,solar_Roomassaare,solar_Tartu-Tõravere,solar_Tiirikoja,solar_Vilsandi
count,88788.0,88788.0,88788.0,88788.0,88788.0,88788.0,88788.0,88788.0,88788.0,88788.0,88788.0
mean,2015.158907,6.587872,15.792078,112.948293,117.667951,114.756544,113.180328,126.094033,115.880603,112.650786,124.480527
std,3.081945,3.403052,8.781745,192.71594,198.79072,196.087648,193.198241,209.548204,191.901672,189.882892,206.588459
min,2010.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2013.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2015.0,7.0,16.0,3.0,3.0,1.0,2.0,3.0,6.0,4.0,5.0
75%,2018.0,9.0,23.0,140.0,153.0,146.0,144.0,168.0,153.0,147.0,161.0
max,2020.0,12.0,31.0,932.0,925.0,941.0,902.0,952.0,886.0,921.0,890.0


## Data from different weather stations

In [100]:
selectedColumns = ["Aasta", "Kuu", "Päev", "Kell (UTC)", "Õhutemperatuur °C"]

In [98]:
#Update column names by shortening them and converting to English

def updateColumnNames(df, location):
    newColumnNames = dict()
    newColumnNames["Aasta"] = "y"
    newColumnNames["Kuu"] = "m"
    newColumnNames["Päev"] = "d"
    newColumnNames["Kell (UTC)"] = "time"
    newColumnNames["Õhutemperatuur °C"] = f"temp_{location}"
    df = df.rename(columns=newColumnNames)
    return df

In [99]:
def getFromXlsx(filename, columns, location):
    #Load xlsx
    df = pd.read_excel(filename, header = 1)
    df = df[columns]
    #Drop rows where data is missing
    #df = df.dropna()
    #Update column names for clarity
    df = updateColumnNames(df, location)
    return df

In [101]:
#Get Tallinn
data_tallinn = getFromXlsx("./data/Tallinn-Harku_2004-2020.xlsx", selectedColumns, "tallinn")

#Get Roomassaare
data_roomassaare = getFromXlsx("./data/Roomassaare_2008-2020.xlsx", selectedColumns, "roomassaare")

#Merge tables
data_weather = data_tallinn.merge(data_roomassaare, how='left', on=["y", "m", "d", "time"])

In [106]:
#Get Parnu
data_parnu1 = getFromXlsx("./data/Parnu-Sauga_01.12.2004-31.03.2019.xlsx", selectedColumns, "parnu")
data_parnu2 = getFromXlsx("./data/Parnu_01.04.2019-2020.xlsx", selectedColumns, "parnu")
data_parnu = data_parnu1.append(data_parnu2)

#Merge tables
data_weather = data_weather.merge(data_parnu, how='left', on=["y", "m", "d", "time"])

In [107]:
data_Tartu = getFromXlsx("./data/Tartu-Toravere_2004-2020.xlsx", selectedColumns, "tartu")

#Merge tables
data_weather = data_weather.merge(data_Tartu, how='left', on=["y", "m", "d", "time"])

In [110]:
#Once again drop all rows where some row is missing
data_weather = data_weather.dropna()

In [111]:
data_weather

Unnamed: 0,y,m,d,time,temp_tallinn,temp_roomassaare,temp_parnu_x,temp_parnu_y
35064,2008,1,1,00:00:00,-0.5,1.3,-1.3,-1.3
35065,2008,1,1,01:00:00,-0.7,1.2,-1.3,-1.3
35066,2008,1,1,02:00:00,-0.9,1.3,-1.4,-1.4
35067,2008,1,1,03:00:00,-0.9,0.8,-1.4,-1.4
35068,2008,1,1,04:00:00,-0.9,0.8,-1.5,-1.5
...,...,...,...,...,...,...,...,...
149035,2020,12,31,19:00:00,0.5,2.8,1.8,1.8
149036,2020,12,31,20:00:00,0.2,2.6,1.9,1.9
149037,2020,12,31,21:00:00,0.2,2.7,1.8,1.8
149038,2020,12,31,22:00:00,0.1,2.9,1.7,1.7
