In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Pre-Processing & Training

In [24]:
# Importing libraries
import pandas as pd
import numpy as np
import math

In [25]:
# Reading csv files
# generation.csv and temperature.csv are read with ';' as the field separator;
# sample_submission.csv is read with the pandas default separator.
generation, temperature = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "../input/enerjisa-enerji-veri-maratonu/generation.csv",
        "../input/enerjisa-enerji-veri-maratonu/temperature.csv",
    )
)
sample_submission = pd.read_csv("../input/enerjisa-enerji-veri-maratonu/sample_submission.csv")


In [26]:
# Pre-processing temperature data
# Drop incomplete rows, parse the timestamp column, index by it and keep only the
# rows up to and including 2021-11-30 23:00:00.
temperature = (
    temperature
    .dropna()
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .set_index("DateTime")
    .loc[:"2021-11-30 23:00:00"]
)

temperature.head()

In [27]:
# Pre-processing generation data
# Drop incomplete rows, parse and index by the timestamp, then turn the
# comma-decimal "Generation" strings into a float Series.
generation = (
    generation
    .dropna()
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime']))
    .set_index("DateTime")
    ['Generation']
    .str.replace(',','.')
    .astype(float)
)

generation.head()

In [28]:
# Joining temperature with generation into a unified dataframe
# Complete hourly grid for the training period. The end stamp is the same cutoff
# used when trimming the temperature data (25560 hourly stamps in total). A
# Timedelta is used for the frequency so no version-dependent alias is needed.
full_hours = pd.date_range(
    "2019-01-01 00:00:00", "2021-11-30 23:00:00", freq=pd.Timedelta(hours=1)
)

# Hours of the grid that have no temperature row get an all-NaN placeholder row.
missing_hours = full_hours.difference(temperature.index)
temperature_missing = pd.DataFrame(columns=temperature.columns, index=missing_hours)

# Placeholder rows are appended after the observed rows (the result is not sorted),
# then generation is left-joined on the timestamp index.
temperature_ = pd.concat([temperature, temperature_missing])
df = temperature_.join(generation)
df

In [29]:
# Creating datetime features
# The timestamps are wrapped in a DatetimeIndex once and every calendar feature is
# read from that single object.
df["Datetime"] = df.index
timestamps = pd.DatetimeIndex(df["Datetime"])
df["year"] = timestamps.year
df["month"] = timestamps.month
df["day_of_month"] = timestamps.day
df["hour"] = timestamps.hour
df["quarter"] = timestamps.quarter
df["day_of_year"] = timestamps.dayofyear
df["day_of_week"] = timestamps.dayofweek
# DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement. It returns a Series indexed by the timestamps, so it is converted to
# a plain array to keep the assignment positional like the columns above.
df["week_of_year"] = timestamps.isocalendar().week.to_numpy(dtype="int64")

In [30]:
#konya_ges["Datetime"].dtype

In [31]:
#konya_ges["Datetime"] = pd.to_datetime(konya_ges['Datetime'])
#konya_ges.index = konya_ges["Datetime"]
#konya_ges = konya_ges.drop("Datetime",axis=1)
#df = df.join(konya_ges,how="left")

In [32]:
# Train-test split (done before mean encoding so the hold-out rows do not
# contribute to the encoded averages)
from sklearn.model_selection import train_test_split

holdout_fraction = 0.33
split_seed = 42
df, test = train_test_split(df, random_state=split_seed, test_size=holdout_fraction)

In [33]:
# Mean encoding by using datetime features and generation data
# Each entry maps a new column name to the key(s) it is grouped by; the new column
# holds the mean "Generation" of the group that the row belongs to.
mean_encoding_keys = {
    "hour_avg": "hour",
    "day_of_week_avg": "day_of_week",
    "month_avg": "month",
    "year_avg": "year",
    "hour_year_avg": ["hour", "year"],
    "hour_day_of_week_avg": ["hour", "day_of_week"],
    "hour_month_avg": ["hour", "month"],
    "day_of_week_month_avg": ["day_of_week", "month"],
    "day_of_week_year_avg": ["day_of_week", "year"],
    "WWCode_avg": "WWCode",
}
for encoded_col, group_keys in mean_encoding_keys.items():
    df[encoded_col] = df.groupby(group_keys)["Generation"].transform("mean")

In [34]:
# Dropping datetime and reformatting temperature values
# The weather columns hold comma-decimal strings; swap the comma for a dot so the
# whole frame can be cast to float.
train_weather_cols = ["AirTemperature","ComfortTemperature","RelativeHumidity","WindSpeed","EffectiveCloudCover"]
df = df.drop(columns="Datetime")
df[train_weather_cols] = df[train_weather_cols].apply(lambda col: col.str.replace(',','.'))
df = df.astype("float")

In [35]:
# Defining X and y for ML model: the target is "Generation", the features are
# every other column.
y = df["Generation"]
X = df.drop(columns="Generation")

In [36]:
# importing necessary libraries 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [37]:
# Creating LightGBM model
# Fit a regressor with default hyper-parameters and keep its predictions on the
# training rows for the in-sample error computed in the next cell.
lgb_model = lgb.LGBMRegressor().fit(X, y)
y_pred = lgb_model.predict(X)

In [38]:
# Calculating root mean squared error of training (RMSE)
# Negative predictions are replaced by 0 before scoring.
y_pred = np.where(y_pred < 0, 0, y_pred)
train_rmse = mean_squared_error(y, y_pred) ** 0.5
train_rmse

In [39]:
# Calculating root mean squared error of validation (10-fold cross validation)
# The scorer returns negated RMSE values, so the mean is negated back before printing.
from sklearn.model_selection import cross_val_score

fold_scores = cross_val_score(lgb_model, X, y, scoring="neg_root_mean_squared_error", cv=10)
print(-fold_scores.mean())

# Test Set

In [40]:
# Joining test data with the mean encodings that were computed on the training set
# Each entry maps an encoded column (already present in the training frame `df`)
# to the key column(s) it was grouped by.
test_encoding_keys = {
    "hour_avg": ["hour"],
    "day_of_week_avg": ["day_of_week"],
    "month_avg": ["month"],
    "year_avg": ["year"],
    "hour_year_avg": ["year", "hour"],
    "hour_day_of_week_avg": ["day_of_week", "hour"],
    "hour_month_avg": ["month", "hour"],
    "day_of_week_month_avg": ["month", "day_of_week"],
    "day_of_week_year_avg": ["year", "day_of_week"],
    "WWCode_avg": ["WWCode"],
}
for avg_col, keys in test_encoding_keys.items():
    # One row per key combination: the encoded value is constant within a group.
    lookup = df[keys + [avg_col]].drop_duplicates()
    # Left join so a hold-out row whose key never occurs in the training rows is
    # kept (with a NaN encoding) instead of being silently dropped; this is the
    # same join type the submission set uses. validate guards against a
    # non-unique lookup multiplying rows. Any stale copy of the column is removed
    # first so the cell can be re-run.
    test = test.drop(columns=avg_col, errors="ignore").merge(
        lookup, how="left", on=keys, validate="many_to_one"
    )

In [41]:
# Reformatting temperature data and dropping datetime column
# The "Datetime" column becomes the index; the comma-decimal weather strings get a
# dot as decimal separator so the whole frame can be cast to float.
test_weather_cols = ["AirTemperature","ComfortTemperature","RelativeHumidity","WindSpeed","EffectiveCloudCover"]
test = test.set_index("Datetime")
test[test_weather_cols] = test[test_weather_cols].apply(lambda col: col.str.replace(',','.'))
test = test.astype("float")

In [42]:
# Defining X and y for test set, using the same feature columns (and order) as X
test_y = test["Generation"]
test_X = test.loc[:, X.columns]

In [43]:
# Calculating root mean squared error (RMSE) of test set
test_pred = lgb_model.predict(test_X)
# Negative predictions are set to 0 before scoring, exactly as is done for the
# training RMSE and for the submitted predictions, so the hold-out score measures
# the predictions in the form they are actually submitted.
test_pred[test_pred < 0] = 0
mean_squared_error(test_y, test_pred) ** 0.5

# Preparing Kaggle Submission

In [44]:
# Reading sample submission (fresh copy of the file, default separator)
sample_submission_path = "../input/enerjisa-enerji-veri-maratonu/sample_submission.csv"
sample_submission = pd.read_csv(sample_submission_path)

In [45]:
# Reformatting datetime: parse the "DateTime" column and use it as the index
sample_submission = sample_submission.assign(
    DateTime=pd.to_datetime(sample_submission['DateTime'])
).set_index("DateTime")

In [46]:
# Preparing temperature data for submission
# Re-read the full temperature file (the copy prepared for training was cut off at
# the end of November), drop incomplete rows and index by the parsed timestamp.
temperature = (
    pd.read_csv("../input/enerjisa-enerji-veri-maratonu/temperature.csv", sep=";")
    .dropna()
    .assign(DateTime=lambda frame: pd.to_datetime(frame["DateTime"]))
    .set_index("DateTime")
)

# Keep a single weather row per timestamp so the join below cannot multiply rows.
weather_by_hour = temperature[~temperature.index.duplicated(keep="first")]

# Fail loudly if the two files do not share any timestamp: a join would otherwise
# produce nothing but NaN weather features.
if not sample_submission.index.isin(weather_by_hour.index).any():
    raise ValueError("temperature.csv has no rows for any sample_submission timestamp")

# Left-join from the submission frame so df2 has exactly one row per requested
# timestamp, in submission order. Hours without a weather row keep NaN features
# rather than disappearing, which keeps the predictions aligned with the rows of
# the submission file when they are written back by position.
df2 = sample_submission.join(weather_by_hour, how="left")
# Weather columns first, then the submission columns.
df2 = df2[list(weather_by_hour.columns) + list(sample_submission.columns)]

In [47]:
# Creating datetime features for submission set
# The timestamps are wrapped in a DatetimeIndex once and every calendar feature is
# read from that single object.
df2["Datetime"] = df2.index
submission_timestamps = pd.DatetimeIndex(df2["Datetime"])
df2["year"] = submission_timestamps.year
df2["month"] = submission_timestamps.month
df2["day_of_month"] = submission_timestamps.day
df2["hour"] = submission_timestamps.hour
df2["quarter"] = submission_timestamps.quarter
df2["day_of_year"] = submission_timestamps.dayofyear
df2["day_of_week"] = submission_timestamps.dayofweek
# DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement. It returns a Series indexed by the timestamps, so it is converted to
# a plain array to keep the assignment positional like the columns above.
df2["week_of_year"] = submission_timestamps.isocalendar().week.to_numpy(dtype="int64")

In [48]:
# Reformatting temperature data
# Drop the helper "Datetime" column, give the comma-decimal weather strings a dot
# as decimal separator and cast the whole frame to float.
submission_weather_cols = ["AirTemperature","ComfortTemperature","RelativeHumidity","WindSpeed","EffectiveCloudCover"]
df2 = df2.drop(columns="Datetime")
df2[submission_weather_cols] = df2[submission_weather_cols].apply(lambda col: col.str.replace(',','.'))
df2 = df2.astype("float")

In [49]:
# Joining submission set with mean encoding data that we created in training set
# Each entry maps an encoded column of the training frame `df` to the key
# column(s) used to look it up; every lookup is left-joined onto df2.
submission_encoding_keys = {
    "hour_avg": ["hour"],
    "day_of_week_avg": ["day_of_week"],
    "month_avg": ["month"],
    "year_avg": ["year"],
    "hour_year_avg": ["year", "hour"],
    "hour_day_of_week_avg": ["day_of_week", "hour"],
    "hour_month_avg": ["month", "hour"],
    "day_of_week_month_avg": ["month", "day_of_week"],
    "day_of_week_year_avg": ["year", "day_of_week"],
    "WWCode_avg": ["WWCode"],
}
for encoded_name, join_keys in submission_encoding_keys.items():
    encoding_lookup = df[join_keys + [encoded_name]].drop_duplicates()
    # Any existing copy of the column is discarded first, so the joined values
    # always end up as the last column under the plain column name.
    df2 = pd.merge(
        df2.drop(columns=encoded_name, errors="ignore"),
        encoding_lookup,
        how="left",
        on=join_keys,
    )



In [50]:
# Applying ML model to submission set
# Features are taken in the same column order as the training matrix X; negative
# predictions are replaced by 0.
X_submission = df2[X.columns]
raw_submission = lgb_model.predict(X_submission)
submission = np.where(raw_submission < 0, 0, raw_submission)

In [51]:
# Preparing final kaggle submission file
# Re-read the template, overwrite its "Generation" column with the predictions
# (assigned by position) and write it out without the row index.
sample_submission = pd.read_csv(
    "../input/enerjisa-enerji-veri-maratonu/sample_submission.csv"
).assign(Generation=submission)
sample_submission.to_csv("kaggle_submission.csv", index=False)