# This model is trained on weather data (temperature and wind speed) from 4 selected weather stations to predict the solar intensity at one given weather station (Pärnu)

## Loading data and converting to dataframe

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import datetime
import sklearn

### Solar intensity data

In [3]:
# Read the hourly solar intensity sheet of the raw workbook.
hourly_sun_intensity = pd.read_excel(
    '2-10_21_524-2 Andmed.xlsx',
    sheet_name='tunni sum.kiirgus',
    header=1,  # the column names are taken from row index 1 of the sheet
)

In [4]:
# Shorten the column names and translate them to English.
# NOTE(review): "Päaev" (sic) presumably mirrors the header exactly as it is
# spelled in this sheet (the station files use "Päev") - confirm against the workbook.
newColumnNames = {
    "Aasta": "y",
    "Kuu": "m",
    "Päaev": "d",
    "Kell (UTC)": "time",
}
# Every radiation column becomes "solar_<station name>".
for columnName in hourly_sun_intensity.columns:
    if "kiirgus" not in columnName:
        continue
    newColumnNames[columnName] = "solar_" + columnName.replace(" summaarne kiirgus, W/m²", "")
hourly_sun_intensity = hourly_sun_intensity.rename(columns=newColumnNames)

In [5]:
# Some weather stations were relocated over time. The old and the new site are
# close to each other (less than 8 km apart), so for now their measurements
# are treated as coming from a single station.

def join_columns(c1, c2, nc, df, column_id):
    """Merge two columns that describe the same area into a single column.

    For every row the merged value is:
      * the only available value when exactly one of the two is present,
      * the mean of both values when both are present,
      * NaN when both are missing.
    Merged values are rounded to 2 decimals.

    Parameters
    ----------
    c1, c2 : str
        Names of the columns to merge; both are removed from the result.
    nc : str
        Name of the merged column (it may reuse ``c1`` or ``c2``).
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_id : int
        Position at which the merged column is inserted once ``c1`` and
        ``c2`` have been dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``c1`` and ``c2`` are replaced by ``nc``.
    """
    # A row-wise mean that skips NaN covers all four cases at once. It replaces
    # the slow iterrows() loop and the positional ``rows[0]`` look-ups on a
    # label-indexed Series, which recent pandas versions deprecate.
    merged = df[[c1, c2]].mean(axis=1, skipna=True).round(2)

    result = df.drop(columns=[c1, c2])
    # Insert a plain array so the values are placed by position, never
    # re-aligned on the index.
    result.insert(column_id, nc, merged.to_numpy())

    return result

In [6]:
# Collapse the column pairs that only exist because a station was moved.
hourly_sun_intensity = join_columns(
    c1='solar_Narva',
    c2='solar_Narva-Jõesuu',
    nc='solar_Narva',
    df=hourly_sun_intensity,
    column_id=4,
)
hourly_sun_intensity = join_columns(
    c1='solar_Pärnu-Sauga',
    c2='solar_Pärnu',
    nc='solar_Pärnu',
    df=hourly_sun_intensity,
    column_id=5,
)

In [7]:
# Keep only the rows without missing values, then turn the night-time
# marker -1 into an intensity of 0.
hourly_sun_intensity = hourly_sun_intensity.dropna().replace(-1, 0)

In [8]:
# Shift the timestamps by a number of hours (negative = earlier) to facilitate
# predicting future solar intensity from existing measurements.
def shiftDateTime(df, numberOfHours):
    """Return a copy of ``df`` whose timestamp columns are shifted by hours.

    The timestamp of each row is assembled from the columns ``y``, ``m``,
    ``d`` (integers) and ``time`` (``datetime.time``), moved by
    ``numberOfHours`` and written back into the same four columns, so day,
    month and year roll over correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``y``, ``m``, ``d`` and ``time``; it is not
        modified.
    numberOfHours : int or float
        Offset in hours; negative values move the timestamps back in time.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with updated ``y``, ``m``, ``d`` and ``time`` columns.
    """
    offset = datetime.timedelta(hours=numberOfHours)
    # Walk the four timestamp columns directly instead of building a Series
    # per row with df.iloc[i]. int() keeps this working when the date parts
    # were upcast to float by pandas.
    dateTimes = [
        datetime.datetime.combine(datetime.date(int(year), int(month), int(day)), clock) + offset
        for year, month, day, clock in zip(df["y"], df["m"], df["d"], df["time"])
    ]

    df2 = df.copy(deep=True)
    df2["y"] = [moment.year for moment in dateTimes]
    df2["m"] = [moment.month for moment in dateTimes]
    df2["d"] = [moment.day for moment in dateTimes]
    df2["time"] = [moment.time() for moment in dateTimes]

    return df2
    
    

In [9]:
# Move every solar timestamp one hour back.
hourly_sun_intensity_Shifted = shiftDateTime(df=hourly_sun_intensity, numberOfHours=-1)

### Data from different weather stations

#### Locations


Tallinn-Harku
Laius: N 59°23´53´´
Pikkus: E 24°36´10´´
Decimal
Lat: 59.398055
Long: 24.602778


Haapsalu meteoroloogiajaam
Laius N 58°56´40´´
Pikkus E 23°33´18´´
Decimal
Lat: 58.944444
Long: 23.555

Narva
Laius: N 59°23´22´´
Pikkus: E 28°06´33´´
Decimal
Lat: 59.389444
Long: 28.109167

Pärnu
Laius: N 58°23´4,44´´
Pikkus: E 24°29´6,71´´
Decimal
Lat: 58.384556
Long: 24.485197

Roomassaare
Laius: N 58°13’05”
Pikkus: E 22°30’23”
Decimal
Lat: 58.218056 
Long: 22.506389 

Tartu-Tõravere meteoroloogiajaam
Laius: N 58°15´51´´
Pikkus: E 26°27´41´´
Decimal
Lat: 58.264167
Long: 26.461389

Tiirikoja järvejaam
Laius: N 58°51´55´´
Pikkus: E 26°57´08´´
Decimal
Lat: 58.865278
Long: 26.952222

Vilsandi rannikujaam
Laius: N 58°22´58”
Pikkus: E 21°48´51”
Decimal
Lat: 58.382778
Long: 21.814167

In [10]:
# Decimal [latitude, longitude] of every weather station.
weather_station_coordinates = {
    "tallinn": [59.398055, 24.602778],
    "haapsalu": [58.944444, 23.555],
    "narva": [59.389444, 28.109167],
    # Pärnu previously carried a copy of the Narva coordinates.
    "parnu": [58.384556, 24.485197],
    "roomassaare": [58.218056, 22.506389],
    "tartu": [58.264167, 26.461389],
    "tiirikoja": [58.865278, 26.952222],
    "vilsandi": [58.382778, 21.814167],
}

In [11]:
# Columns read from every station workbook: the timestamp parts, the air
# temperature and the 10-minute mean wind speed.
selectedColumns = [
    "Aasta",
    "Kuu",
    "Päev",
    "Kell (UTC)",
    "Õhutemperatuur °C",
    "10 minuti keskmine tuule kiirus m/s",
]

In [12]:
#Update column names by shortening them and converting to English

def updateColumnNames(df, location):
    """Return ``df`` with the Estonian headers replaced by short English names.

    The timestamp columns become ``y``, ``m``, ``d`` and ``time``; the
    temperature and wind-speed columns are suffixed with ``location`` so that
    several stations can live in one table. Other columns keep their names.
    """
    return df.rename(
        columns={
            "Aasta": "y",
            "Kuu": "m",
            "Päev": "d",
            "Kell (UTC)": "time",
            "Õhutemperatuur °C": f"temp_{location}",
            "10 minuti keskmine tuule kiirus m/s": f"wind_speed_{location}",
        }
    )

In [13]:
def getFromXlsx(filename, columns, location):
    """Load the given columns of one station workbook with English column names.

    The column names are read from row index 1 of the sheet. Rows with missing
    values are kept as they are.
    """
    selected = pd.read_excel(filename, header=1)[columns]
    # Rename for clarity; station specific columns are suffixed with `location`.
    return updateColumnNames(selected, location)

In [14]:
# Tallinn-Harku measurements form the base table.
data_tallinn = getFromXlsx(
    filename="./data/Tallinn-Harku_2004-2020.xlsx",
    columns=selectedColumns,
    location="tallinn",
)
# Roomassaare measurements.
data_roomassaare = getFromXlsx(
    filename="./data/Roomassaare_2008-2020.xlsx",
    columns=selectedColumns,
    location="roomassaare",
)
# Left join on the timestamp: every Tallinn row is kept.
data_weather = pd.merge(
    data_tallinn,
    data_roomassaare,
    how='left',
    on=["y", "m", "d", "time"],
)

In [15]:
# Vilsandi measurements.
data_vilsandi = getFromXlsx(
    filename="./data/Vilsandi_2004-2020.xlsx",
    columns=selectedColumns,
    location="vilsandi",
)
# Attach them to the combined table by timestamp.
data_weather = pd.merge(
    data_weather,
    data_vilsandi,
    how='left',
    on=["y", "m", "d", "time"],
)

In [16]:
##Get Parnu
#data_parnu1 = getFromXlsx("./data/Parnu-Sauga_01.12.2004-31.03.2019.xlsx", selectedColumns, "parnu")
#data_parnu2 = getFromXlsx("./data/Parnu_01.04.2019-2020.xlsx", selectedColumns, "parnu")
#data_parnu = data_parnu1.append(data_parnu2)

##Merge tables
#data_weather = data_weather.merge(data_parnu, how='left', on=["y", "m", "d", "time"])

In [17]:
# Tartu-Tõravere measurements.
data_Tartu = getFromXlsx(
    filename="./data/Tartu-Toravere_2004-2020.xlsx",
    columns=selectedColumns,
    location="tartu",
)
# Attach them to the combined table by timestamp.
data_weather = pd.merge(
    data_weather,
    data_Tartu,
    how='left',
    on=["y", "m", "d", "time"],
)


In [18]:
# Once again keep only the rows in which no value is missing.
data_weather = data_weather.dropna(axis=0, how="any")

### Join weather and solar data

In [40]:
# Attach the weather measurements to the shifted solar rows by timestamp;
# the left join keeps every solar row.
data_solar_weather = pd.merge(
    hourly_sun_intensity_Shifted,
    data_weather,
    how='left',
    on=["y", "m", "d", "time"],
)

In [41]:
# Hour of day of every row, taken from the `time` column. Iterating the column
# directly avoids building a full row Series per record with .iloc[i].
hours = [moment.hour for moment in data_solar_weather["time"]]
data_solar_weather["h"] = hours


In [42]:
# Drop the solar rows for which no complete weather record was found.
data_solar_weather = data_solar_weather.dropna(axis=0, how="any")

In [43]:
# Show the column names of the joined solar + weather table.
data_solar_weather.columns

Index(['y', 'm', 'd', 'time', 'solar_Narva', 'solar_Pärnu', 'solar_Haapsalu',
       'solar_Tallinn-Harku', 'solar_Roomassaare', 'solar_Tartu-Tõravere',
       'solar_Tiirikoja', 'solar_Vilsandi', 'temp_tallinn',
       'wind_speed_tallinn', 'temp_roomassaare', 'wind_speed_roomassaare',
       'temp_vilsandi', 'wind_speed_vilsandi', 'temp_tartu',
       'wind_speed_tartu', 'h'],
      dtype='object')

# Let's train a model

In [22]:
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so the fitted tree is reproducible: the features are randomly
# permuted at each split, so equally good splits can otherwise be resolved
# differently between runs.
dtr = DecisionTreeRegressor(max_depth=100, random_state=111)

In [44]:
# Features: month, day and hour plus the temperature and wind speed of the
# four selected stations.
X = data_solar_weather[
    ['m', 'd', 'h']
    + [
        f'{quantity}_{station}'
        for station in ('tallinn', 'roomassaare', 'vilsandi', 'tartu')
        for quantity in ('temp', 'wind_speed')
    ]
]

# Target: the Pärnu solar intensity column, kept as a one-column DataFrame.
y = data_solar_weather[['solar_Pärnu']]

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out a third of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=111,
)

In [47]:
# Fit the regression tree on the training part.
dtr.fit(X=X_train, y=y_train)

DecisionTreeRegressor(max_depth=100)

In [48]:
from sklearn.metrics import mean_squared_error
# RMSE on the held-out rows. The square root is taken explicitly because the
# `squared=False` shortcut was deprecated in scikit-learn 1.4 and removed in 1.6.
float(np.sqrt(mean_squared_error(y_test, dtr.predict(X_test))))

114.13963731178066

In [49]:
# Display the held-out target values.
y_test

Unnamed: 0,solar_Pärnu
40555,525.0
29248,0.0
34223,39.0
83520,226.0
43926,0.0
...,...
11427,237.0
78569,0.0
11882,36.0
73513,283.0
