In [245]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np



In [246]:

# Load the datasets
train = pd.read_csv("train.csv")
weather = pd.read_csv("weather.csv")
holidays = pd.read_csv("holidays.csv")
#test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [247]:
def aggregate_weather_data(daily_weather):
        
        daily_weather['date'] = pd.to_datetime(daily_weather['date'])
        daily_weather['day'] = daily_weather['date'].dt.date
        daily_weather['hour'] = daily_weather['date'].dt.hour


        daily_weather = daily_weather.groupby(['ilce', pd.Grouper(freq='D', key='date')])
        ## Apply the aggregations
        daily_weather= daily_weather.agg({
            't_2m:C': ['max', 'min', 'mean','std'],  # temperature
            'prob_precip_1h:p': ['sum', 'max' ,'mean'],  # precipitation
            'wind_speed_10m:ms': ['max', 'mean','std'],  # wind speed
            'wind_dir_10m:d': 'mean',  # wind direction
            'global_rad:W': 'sum',  # sunshine duration
            'effective_cloud_cover:p': ['mean','std'],  # cloud cover
            'relative_humidity_2m:p': ['max', 'min', 'mean']  # humidity
        })

        # Flatten the MultiIndex columns
        daily_weather.columns = ['_'.join(col).strip() for col in daily_weather.columns.values]
        daily_weather= daily_weather.reset_index() 
        return daily_weather
def create_unique_id(df):
    df['unique_id'] = df['date'].astype(str) +  '-' +df['ilce'].astype(str) 
    return df

In [248]:
train.columns = ['date' , 'ilce' , 'bildirimsiz_sum', 'bildirimli_sum']
train = create_unique_id(train)
train['date'] = pd.to_datetime(train['date'])
 #Process data and train model

In [249]:
weather.columns = ['date', 'lat', 'lon', 't_2m:C', 'effective_cloud_cover:p', 'global_rad:W', 'relative_humidity_2m:p', 'wind_dir_10m:d', 'wind_speed_10m:ms', 'prob_precip_1h:p', 't_apparent:C', 'ilce']
weather = aggregate_weather_data(weather)
weather = create_unique_id(weather)


In [250]:
holidays = pd.read_csv('holidays.csv')
holidays.columns =['year', 'month' ,'day' , 'holiday']
add_columns = holidays['holiday']
# Create a new column for each unique holiday name

holidays['date'] = pd.to_datetime(holidays[['year', 'month', 'day']])
for new in holidays['holiday'].unique():
    holidays[new] = (holidays['holiday'] == new).astype(int)
holidays.drop(['holiday', 'year', 'month', 'day'] , axis=1 , inplace=True)
holidays.columns
holidays.head()


Unnamed: 0,date,New Year's Day,National Sovereignty and Children's Day,Labour Day,"Commemoration of Ataturk, Youth and Sports Day",Democracy and National Unity Day,Victory Day,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday
0,2021-01-01,1,0,0,0,0,0,0,0,0,0,0,0
1,2021-04-23,0,1,0,0,0,0,0,0,0,0,0,0
2,2021-05-01,0,0,1,0,0,0,0,0,0,0,0,0
3,2021-05-19,0,0,0,1,0,0,0,0,0,0,0,0
4,2021-07-15,0,0,0,0,1,0,0,0,0,0,0,0


In [251]:

merged_data = pd.merge(train, holidays, on='date', how='left')

merged_data[merged_data['Victory Day'] == 1]


Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id,New Year's Day,National Sovereignty and Children's Day,Labour Day,"Commemoration of Ataturk, Youth and Sports Day",Democracy and National Unity Day,Victory Day,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday
239,2021-08-30,izmir-aliaga,3,0,2021-08-30-izmir-aliaga,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
591,2022-08-30,izmir-aliaga,1,0,2022-08-30-izmir-aliaga,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
952,2023-08-30,izmir-aliaga,4,0,2023-08-30-izmir-aliaga,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1347,2021-08-30,izmir-bayindir,3,0,2021-08-30-izmir-bayindir,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1709,2022-08-30,izmir-bayindir,3,3,2022-08-30-izmir-bayindir,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46718,2021-08-30,manisa-kula,3,1,2021-08-30-manisa-kula,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47046,2022-08-30,manisa-kula,2,0,2022-08-30-manisa-kula,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47380,2023-08-30,manisa-kula,4,0,2023-08-30-manisa-kula,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47658,2021-08-30,manisa-ahmetli,2,0,2021-08-30-manisa-ahmetli,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [252]:
test.columns= ['date','ilce','bildirimli_sum']
test = create_unique_id(test)
test.head()


Unnamed: 0,date,ilce,bildirimli_sum,unique_id
0,2024-02-01,izmir-aliaga,0,2024-02-01-izmir-aliaga
1,2024-02-01,izmir-bayindir,1,2024-02-01-izmir-bayindir
2,2024-02-01,izmir-bayrakli,0,2024-02-01-izmir-bayrakli
3,2024-02-01,izmir-bergama,1,2024-02-01-izmir-bergama
4,2024-02-01,izmir-bornova,1,2024-02-01-izmir-bornova


In [253]:

sample_submission.head()

Unnamed: 0,unique_id,bildirimsiz_sum
0,2024-02-01-izmir-aliaga,
1,2024-02-01-izmir-bayindir,
2,2024-02-01-izmir-bayrakli,
3,2024-02-01-izmir-bergama,
4,2024-02-01-izmir-bornova,


In [254]:
train.sample()

Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id
31090,2023-03-11,manisa-demirci,2,0,2023-03-11-manisa-demirci


In [255]:
##@@@ TRAIN Data


train['date'] = pd.to_datetime(train['date'])
print(train['date'].dt.weekday)
train['days_in_week'] = train['date'].dt.weekday
train['month']= train['date'].dt.month
train['year']= train['date'].dt.year
train.set_index('unique_id')

0        4
1        5
2        6
3        0
4        1
        ..
48143    1
48144    2
48145    3
48146    4
48147    0
Name: date, Length: 48148, dtype: int32


Unnamed: 0_level_0,date,ilce,bildirimsiz_sum,bildirimli_sum,days_in_week,month,year
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01-izmir-aliaga,2021-01-01,izmir-aliaga,5,0,4,1,2021
2021-01-02-izmir-aliaga,2021-01-02,izmir-aliaga,13,0,5,1,2021
2021-01-03-izmir-aliaga,2021-01-03,izmir-aliaga,4,0,6,1,2021
2021-01-04-izmir-aliaga,2021-01-04,izmir-aliaga,9,0,0,1,2021
2021-01-05-izmir-aliaga,2021-01-05,izmir-aliaga,2,0,1,1,2021
...,...,...,...,...,...,...,...
2024-01-23-manisa-ahmetli,2024-01-23,manisa-ahmetli,1,0,1,1,2024
2024-01-24-manisa-ahmetli,2024-01-24,manisa-ahmetli,2,0,2,1,2024
2024-01-25-manisa-ahmetli,2024-01-25,manisa-ahmetli,2,0,3,1,2024
2024-01-26-manisa-ahmetli,2024-01-26,manisa-ahmetli,2,0,4,1,2024


In [256]:
# Lagged features capture the values of the target variable at previous time steps, which can help model temporal dependencies.
lags = 10
for lag in range(1,10):
    train[f'bildirimsiz_lag{lag}'] = train.groupby('ilce')['bildirimsiz_sum'].shift(lag)
    train[f'bildirimli_lag{lag}'] = train.groupby('ilce')['bildirimli_sum'].shift(lag)

In [257]:
# Differencing captures the changes in the target variable (bildirimsiz_sum) between consecutive time steps, which can help model stationarity and remove trends.
periods = [1, 2, 3, 5, 7, 14, 30]
for period in periods:
    train[f'Target_Diff{period}'] = train.groupby('ilce')['bildirimsiz_sum'].diff(periods=period)

In [258]:
# Target encoding replaces categorical values with the mean of the target variable (bildirimsiz_sum) for each category, capturing the relationship between categories and the target.
def target_encoding(data, target, category):
    means = data.groupby(category)[target].mean()
    return data[category].map(means)

train['Target_Encoded_Ilce'] = target_encoding(train, 'bildirimsiz_sum', 'ilce')

In [259]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [260]:
# Autoregressive features model the target variable (bildirimsiz_sum) as a linear combination of its past values, capturing temporal dependencies.
from statsmodels.tsa.ar_model import AutoReg

ar_model = AutoReg(train['bildirimsiz_sum'], lags=[1, 7]).fit()
train['Target_AR_Prediction'] = ar_model.predict()

In [261]:
# Rolling window statistics summarize the recent behavior of the target variable (bildirimsiz_sum) over a specified window size, capturing short-term trends and patterns.
windows = [3, 7, 14]
for window in windows:
    train[f'Target_Rolling_Mean_{window}'] = train.groupby('ilce')['bildirimsiz_sum'].rolling(window=window).mean().reset_index(0, drop=True)
    train[f'Target_Rolling_Std_{window}'] = train.groupby('ilce')['bildirimsiz_sum'].rolling(window=window).std().reset_index(0, drop=True)

In [262]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [263]:
# Zaman serisi ayrıştırması, hedef değişkeni (bildirimsiz_sum) eğilim, mevsimsellik ve kalıntı bileşenlerine ayırır, bu bileşenler özellik olarak kullanılabilir.
from statsmodels.tsa.seasonal import seasonal_decompose

decomposed = seasonal_decompose(train['bildirimsiz_sum'], period=7)
train['Target_Trend'] = decomposed.trend
train['Target_Seasonal'] = decomposed.seasonal
train['Target_Residual'] = decomposed.resid

In [264]:
# Exponentially Weighted Moving Average (EWMA) assigns exponentially decreasing weights to past observations of the target variable (bildirimsiz_sum), capturing recent trends more strongly.
spans = [3, 7, 14]
for span in spans:
    train[f'Target_EWM_Mean_{span}'] = train.groupby('ilce')['bildirimsiz_sum'].ewm(span=span, adjust=False).mean().reset_index(0, drop=True)

In [265]:
# Differencing captures the changes in the target variable (bildirimsiz_sum) between consecutive time steps, which can help model stationarity and remove trends.
periods = [1, 7, 14, 30]
for period in periods:
    train[f'Target_Diff{period}'] = train.groupby('ilce')['bildirimsiz_sum'].diff(periods=period)

In [266]:
# Logarithmic transformation can help stabilize the variance of the target variable (bildirimsiz_sum) and make it more suitable for modeling.
train['Log_Target'] = np.log1p(train['bildirimsiz_sum'])


In [267]:
train = train.dropna()
train

Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,...,Target_Rolling_Std_7,Target_Rolling_Mean_14,Target_Rolling_Std_14,Target_Trend,Target_Seasonal,Target_Residual,Target_EWM_Mean_3,Target_EWM_Mean_7,Target_EWM_Mean_14,Log_Target
30,2021-01-31,izmir-aliaga,12,0,2021-01-31-izmir-aliaga,6,1,2021,24.0,0.0,...,6.792853,7.000000,6.348834,9.857143,0.045360,2.097497,13.669251,11.260940,8.898156,2.564949
31,2021-02-01,izmir-aliaga,3,0,2021-02-01-izmir-aliaga,0,2,2021,12.0,0.0,...,7.380799,7.071429,6.293639,8.714286,0.070319,-5.784605,8.334625,9.195705,8.111735,1.386294
32,2021-02-02,izmir-aliaga,12,0,2021-02-02-izmir-aliaga,1,2,2021,3.0,0.0,...,7.033931,7.857143,6.162631,8.571429,-0.006240,3.434811,10.167313,9.896779,8.630170,2.564949
33,2021-02-03,izmir-aliaga,5,0,2021-02-03-izmir-aliaga,2,2,2021,12.0,0.0,...,7.244045,8.000000,6.063764,5.428571,-0.063109,-0.365463,7.583656,8.672584,8.146148,1.791759
34,2021-02-04,izmir-aliaga,1,0,2021-02-04-izmir-aliaga,3,2,2021,5.0,0.0,...,7.994045,7.500000,6.345804,4.571429,-0.031605,-3.539824,4.291828,6.754438,7.193328,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48140,2024-01-20,manisa-ahmetli,2,0,2024-01-20-manisa-ahmetli,5,1,2024,2.0,0.0,...,0.786796,1.928571,1.141139,1.428571,0.021658,0.549771,1.824890,1.783654,1.833278,1.098612
48141,2024-01-21,manisa-ahmetli,2,0,2024-01-21-manisa-ahmetli,6,1,2024,2.0,0.0,...,0.755929,2.000000,1.109400,1.571429,0.045360,0.383212,1.912445,1.837741,1.855508,1.098612
48142,2024-01-22,manisa-ahmetli,1,0,2024-01-22-manisa-ahmetli,0,1,2024,2.0,0.0,...,0.755929,1.928571,1.141139,1.714286,0.070319,-0.784605,1.456222,1.628305,1.741440,0.693147
48143,2024-01-23,manisa-ahmetli,1,0,2024-01-23-manisa-ahmetli,1,1,2024,1.0,0.0,...,0.534522,1.928571,1.141139,1.714286,-0.006240,-0.708046,1.228111,1.471229,1.642582,0.693147


Creating Weather data and merging them into train data 

In [272]:
train_df = pd.merge(train, merged_data,on="unique_id")
train_df.fillna(0,inplace=True)
train_df.columns
train_df['day_in_year'] = train_df['date']
train_df['date'] = train_df['date_x']
train_df['ilce'] = train_df['ilce_x']
train_df['bildirimsiz_sum'] = train_df['bildirimsiz_sum_x']
train_df['bildirimli_sum'] = train_df['bildirimli_sum_x']
train_df['date'] = train_df['date_x']
train_df.drop(['date_x'	,'ilce_x'	,'bildirimsiz_sum_x'	,'bildirimli_sum_x','date_y'	,'ilce_y'	,'bildirimsiz_sum_y'	,'bildirimli_sum_y'] , axis=1 , inplace=True)
train_df

Unnamed: 0,unique_id,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,bildirimsiz_lag2,bildirimli_lag2,bildirimsiz_lag3,bildirimli_lag3,...,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday,date,ilce,bildirimsiz_sum,bildirimli_sum
0,2021-01-31-izmir-aliaga,6,1,2021,24.0,0.0,4.0,1.0,9.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-31,izmir-aliaga,12,0
1,2021-02-01-izmir-aliaga,0,2,2021,12.0,0.0,24.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-01,izmir-aliaga,3,0
2,2021-02-02-izmir-aliaga,1,2,2021,3.0,0.0,12.0,0.0,24.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-02,izmir-aliaga,12,0
3,2021-02-03-izmir-aliaga,2,2,2021,12.0,0.0,3.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-03,izmir-aliaga,5,0
4,2021-02-04-izmir-aliaga,3,2,2021,5.0,0.0,12.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-04,izmir-aliaga,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46730,2024-01-20-manisa-ahmetli,5,1,2024,2.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-20,manisa-ahmetli,2,0
46731,2024-01-21-manisa-ahmetli,6,1,2024,2.0,0.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-21,manisa-ahmetli,2,0
46732,2024-01-22-manisa-ahmetli,0,1,2024,2.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-22,manisa-ahmetli,1,0
46733,2024-01-23-manisa-ahmetli,1,1,2024,1.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-23,manisa-ahmetli,1,0


In [273]:
train_df.columns

Index(['unique_id', 'days_in_week', 'month', 'year', 'bildirimsiz_lag1',
       'bildirimli_lag1', 'bildirimsiz_lag2', 'bildirimli_lag2',
       'bildirimsiz_lag3', 'bildirimli_lag3', 'bildirimsiz_lag4',
       'bildirimli_lag4', 'bildirimsiz_lag5', 'bildirimli_lag5',
       'bildirimsiz_lag6', 'bildirimli_lag6', 'bildirimsiz_lag7',
       'bildirimli_lag7', 'bildirimsiz_lag8', 'bildirimli_lag8',
       'bildirimsiz_lag9', 'bildirimli_lag9', 'Target_Diff1', 'Target_Diff2',
       'Target_Diff3', 'Target_Diff5', 'Target_Diff7', 'Target_Diff14',
       'Target_Diff30', 'Target_Encoded_Ilce', 'Target_AR_Prediction',
       'Target_Rolling_Mean_3', 'Target_Rolling_Std_3',
       'Target_Rolling_Mean_7', 'Target_Rolling_Std_7',
       'Target_Rolling_Mean_14', 'Target_Rolling_Std_14', 'Target_Trend',
       'Target_Seasonal', 'Target_Residual', 'Target_EWM_Mean_3',
       'Target_EWM_Mean_7', 'Target_EWM_Mean_14', 'Log_Target',
       'New Year's Day', 'National Sovereignty and Children's D

In [280]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Hedef değişkeni ve özellikleri ayır
X = train_df.drop(columns=["bildirimsiz_sum"])
y = train_df["bildirimsiz_sum"]

# Eğitim ve test setlerini ayır
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,unique_id,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,bildirimsiz_lag2,bildirimli_lag2,bildirimsiz_lag3,bildirimli_lag3,...,Victory Day,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday,date,ilce,bildirimli_sum
15686,2021-03-01-izmir-kemalpasa,0,3,2021,11.0,3.0,5.0,5.0,7.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-03-01,izmir-kemalpasa,1
5183,2023-07-08-izmir-bornova,5,7,2023,6.0,5.0,10.0,6.0,11.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-07-08,izmir-bornova,0
44406,2021-06-30-izmir-selcuk,2,6,2021,5.0,0.0,3.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-30,izmir-selcuk,0
41851,2023-01-16-manisa-yunusemre,0,1,2023,7.0,6.0,8.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-16,manisa-yunusemre,0
38997,2021-02-08-manisa-soma,0,2,2021,9.0,0.0,2.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-08,manisa-soma,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,2022-11-20-izmir-gaziemir,6,11,2022,7.0,1.0,5.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-20,izmir-gaziemir,0
44732,2022-08-30-izmir-selcuk,1,8,2022,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-08-30,izmir-selcuk,0
38158,2021-07-03-manisa-selendi,5,7,2021,2.0,0.0,2.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-07-03,manisa-selendi,0
860,2023-06-28-izmir-aliaga,2,6,2023,2.0,0.0,7.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2023-06-28,izmir-aliaga,0
