In [233]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np



In [234]:

# Load the datasets
train = pd.read_csv("train.csv")
weather = pd.read_csv("weather.csv")
holidays = pd.read_csv("holidays.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [235]:
def aggregate_weather_data(daily_weather):
        
        daily_weather['date'] = pd.to_datetime(daily_weather['date'])
        daily_weather['day'] = daily_weather['date'].dt.date
        daily_weather['hour'] = daily_weather['date'].dt.hour


        daily_weather = daily_weather.groupby(['ilce', pd.Grouper(freq='D', key='date')])
        ## Apply the aggregations
        daily_weather= daily_weather.agg({
            't_2m:C': ['max', 'min', 'mean','std'],  # temperature
            'prob_precip_1h:p': ['sum', 'max' ,'mean'],  # precipitation
            'wind_speed_10m:ms': ['max', 'mean','std'],  # wind speed
            'wind_dir_10m:d': 'mean',  # wind direction
            'global_rad:W': 'sum',  # sunshine duration
            'effective_cloud_cover:p': ['mean','std'],  # cloud cover
            'relative_humidity_2m:p': ['max', 'min', 'mean']  # humidity
        })

        # Flatten the MultiIndex columns
        daily_weather.columns = ['_'.join(col).strip() for col in daily_weather.columns.values]
        daily_weather= daily_weather.reset_index() 
        return daily_weather
def create_unique_id(df):
    df['unique_id'] = df['date'].astype(str) +  '-' +df['ilce'].astype(str) 
    return df

In [236]:
train.columns = ['date' , 'ilce' , 'bildirimsiz_sum', 'bildirimli_sum']
train = create_unique_id(train)
train['date'] = pd.to_datetime(train['date'])
 #Process data and train model

In [237]:
weather.columns = ['date', 'lat', 'lon', 't_2m:C', 'effective_cloud_cover:p', 'global_rad:W', 'relative_humidity_2m:p', 'wind_dir_10m:d', 'wind_speed_10m:ms', 'prob_precip_1h:p', 't_apparent:C', 'ilce']
weather = aggregate_weather_data(weather)
weather = create_unique_id(weather)


In [238]:
holidays = pd.read_csv('holidays.csv')
holidays.columns =['year', 'month' ,'day' , 'holiday']
add_columns = holidays['holiday']
# Create a new column for each unique holiday name

holidays['date'] = pd.to_datetime(holidays[['year', 'month', 'day']])
for new in holidays['holiday'].unique():
    holidays[new] = (holidays['holiday'] == new).astype(int)
holidays.drop(['holiday', 'year', 'month', 'day'] , axis=1 , inplace=True)
holidays.columns
holidays.head()


Unnamed: 0,date,New Year's Day,National Sovereignty and Children's Day,Labour Day,"Commemoration of Ataturk, Youth and Sports Day",Democracy and National Unity Day,Victory Day,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday
0,2021-01-01,1,0,0,0,0,0,0,0,0,0,0,0
1,2021-04-23,0,1,0,0,0,0,0,0,0,0,0,0
2,2021-05-01,0,0,1,0,0,0,0,0,0,0,0,0
3,2021-05-19,0,0,0,1,0,0,0,0,0,0,0,0
4,2021-07-15,0,0,0,0,1,0,0,0,0,0,0,0


In [239]:

merged_data = pd.merge(train, holidays, on='date', how='left')

merged_data[merged_data['Victory Day'] == 1]


Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id,New Year's Day,National Sovereignty and Children's Day,Labour Day,"Commemoration of Ataturk, Youth and Sports Day",Democracy and National Unity Day,Victory Day,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday
239,2021-08-30,izmir-aliaga,3,0,2021-08-30-izmir-aliaga,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
591,2022-08-30,izmir-aliaga,1,0,2022-08-30-izmir-aliaga,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
952,2023-08-30,izmir-aliaga,4,0,2023-08-30-izmir-aliaga,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1347,2021-08-30,izmir-bayindir,3,0,2021-08-30-izmir-bayindir,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1709,2022-08-30,izmir-bayindir,3,3,2022-08-30-izmir-bayindir,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46718,2021-08-30,manisa-kula,3,1,2021-08-30-manisa-kula,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47046,2022-08-30,manisa-kula,2,0,2022-08-30-manisa-kula,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47380,2023-08-30,manisa-kula,4,0,2023-08-30-manisa-kula,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
47658,2021-08-30,manisa-ahmetli,2,0,2021-08-30-manisa-ahmetli,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [240]:
test.columns= ['date','ilce','bildirimli_sum']
test = create_unique_id(test)
test.head()


Unnamed: 0,date,ilce,bildirimli_sum,unique_id
0,2024-02-01,izmir-aliaga,0,2024-02-01-izmir-aliaga
1,2024-02-01,izmir-bayindir,1,2024-02-01-izmir-bayindir
2,2024-02-01,izmir-bayrakli,0,2024-02-01-izmir-bayrakli
3,2024-02-01,izmir-bergama,1,2024-02-01-izmir-bergama
4,2024-02-01,izmir-bornova,1,2024-02-01-izmir-bornova


In [241]:

sample_submission.head()

Unnamed: 0,unique_id,bildirimsiz_sum
0,2024-02-01-izmir-aliaga,
1,2024-02-01-izmir-bayindir,
2,2024-02-01-izmir-bayrakli,
3,2024-02-01-izmir-bergama,
4,2024-02-01-izmir-bornova,


In [258]:
train.sample()
train.head()

Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,...,Target_Rolling_Std_7,Target_Rolling_Mean_14,Target_Rolling_Std_14,Target_Trend,Target_Seasonal,Target_Residual,Target_EWM_Mean_3,Target_EWM_Mean_7,Target_EWM_Mean_14,Log_Target
30,2021-01-31,izmir-aliaga,12,0,2021-01-31-izmir-aliaga,6,1,2021,24.0,0.0,...,6.792853,7.0,6.348834,9.857143,0.04536,2.097497,13.669251,11.26094,8.898156,2.564949
31,2021-02-01,izmir-aliaga,3,0,2021-02-01-izmir-aliaga,0,2,2021,12.0,0.0,...,7.380799,7.071429,6.293639,8.714286,0.070319,-5.784605,8.334625,9.195705,8.111735,1.386294
32,2021-02-02,izmir-aliaga,12,0,2021-02-02-izmir-aliaga,1,2,2021,3.0,0.0,...,7.033931,7.857143,6.162631,8.571429,-0.00624,3.434811,10.167313,9.896779,8.63017,2.564949
33,2021-02-03,izmir-aliaga,5,0,2021-02-03-izmir-aliaga,2,2,2021,12.0,0.0,...,7.244045,8.0,6.063764,5.428571,-0.063109,-0.365463,7.583656,8.672584,8.146148,1.791759
34,2021-02-04,izmir-aliaga,1,0,2021-02-04-izmir-aliaga,3,2,2021,5.0,0.0,...,7.994045,7.5,6.345804,4.571429,-0.031605,-3.539824,4.291828,6.754438,7.193328,0.693147


In [243]:
##@@@ TRAIN Data


train['date'] = pd.to_datetime(train['date'])
print(train['date'].dt.weekday)
train['days_in_week'] = train['date'].dt.weekday
train['month']= train['date'].dt.month
train['year']= train['date'].dt.year
train.set_index('unique_id')

0        4
1        5
2        6
3        0
4        1
        ..
48143    1
48144    2
48145    3
48146    4
48147    0
Name: date, Length: 48148, dtype: int32


Unnamed: 0_level_0,date,ilce,bildirimsiz_sum,bildirimli_sum,days_in_week,month,year
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01-izmir-aliaga,2021-01-01,izmir-aliaga,5,0,4,1,2021
2021-01-02-izmir-aliaga,2021-01-02,izmir-aliaga,13,0,5,1,2021
2021-01-03-izmir-aliaga,2021-01-03,izmir-aliaga,4,0,6,1,2021
2021-01-04-izmir-aliaga,2021-01-04,izmir-aliaga,9,0,0,1,2021
2021-01-05-izmir-aliaga,2021-01-05,izmir-aliaga,2,0,1,1,2021
...,...,...,...,...,...,...,...
2024-01-23-manisa-ahmetli,2024-01-23,manisa-ahmetli,1,0,1,1,2024
2024-01-24-manisa-ahmetli,2024-01-24,manisa-ahmetli,2,0,2,1,2024
2024-01-25-manisa-ahmetli,2024-01-25,manisa-ahmetli,2,0,3,1,2024
2024-01-26-manisa-ahmetli,2024-01-26,manisa-ahmetli,2,0,4,1,2024


In [244]:
# Lagged features capture the values of the target variable at previous time steps, which can help model temporal dependencies.
lags = 10
for lag in range(1,10):
    train[f'bildirimsiz_lag{lag}'] = train.groupby('ilce')['bildirimsiz_sum'].shift(lag)
    train[f'bildirimli_lag{lag}'] = train.groupby('ilce')['bildirimli_sum'].shift(lag)

In [245]:
# Differencing captures the changes in the target variable (bildirimsiz_sum) between consecutive time steps, which can help model stationarity and remove trends.
periods = [1, 2, 3, 5, 7, 14, 30]
for period in periods:
    train[f'Target_Diff{period}'] = train.groupby('ilce')['bildirimsiz_sum'].diff(periods=period)

In [246]:
# Target encoding replaces categorical values with the mean of the target variable (bildirimsiz_sum) for each category, capturing the relationship between categories and the target.
def target_encoding(data, target, category):
    means = data.groupby(category)[target].mean()
    return data[category].map(means)

train['Target_Encoded_Ilce'] = target_encoding(train, 'bildirimsiz_sum', 'ilce')

In [247]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [248]:
# Autoregressive features model the target variable (bildirimsiz_sum) as a linear combination of its past values, capturing temporal dependencies.
from statsmodels.tsa.ar_model import AutoReg

ar_model = AutoReg(train['bildirimsiz_sum'], lags=[1, 7]).fit()
train['Target_AR_Prediction'] = ar_model.predict()

In [249]:
# Rolling window statistics summarize the recent behavior of the target variable (bildirimsiz_sum) over a specified window size, capturing short-term trends and patterns.
windows = [3, 7, 14]
for window in windows:
    train[f'Target_Rolling_Mean_{window}'] = train.groupby('ilce')['bildirimsiz_sum'].rolling(window=window).mean().reset_index(0, drop=True)
    train[f'Target_Rolling_Std_{window}'] = train.groupby('ilce')['bildirimsiz_sum'].rolling(window=window).std().reset_index(0, drop=True)

In [250]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [251]:
# Zaman serisi ayrıştırması, hedef değişkeni (bildirimsiz_sum) eğilim, mevsimsellik ve kalıntı bileşenlerine ayırır, bu bileşenler özellik olarak kullanılabilir.
from statsmodels.tsa.seasonal import seasonal_decompose

decomposed = seasonal_decompose(train['bildirimsiz_sum'], period=7)
train['Target_Trend'] = decomposed.trend
train['Target_Seasonal'] = decomposed.seasonal
train['Target_Residual'] = decomposed.resid

In [252]:
# Exponentially Weighted Moving Average (EWMA) assigns exponentially decreasing weights to past observations of the target variable (bildirimsiz_sum), capturing recent trends more strongly.
spans = [3, 7, 14]
for span in spans:
    train[f'Target_EWM_Mean_{span}'] = train.groupby('ilce')['bildirimsiz_sum'].ewm(span=span, adjust=False).mean().reset_index(0, drop=True)

In [253]:
# Differencing captures the changes in the target variable (bildirimsiz_sum) between consecutive time steps, which can help model stationarity and remove trends.
periods = [1, 7, 14, 30]
for period in periods:
    train[f'Target_Diff{period}'] = train.groupby('ilce')['bildirimsiz_sum'].diff(periods=period)

In [254]:
# Logarithmic transformation can help stabilize the variance of the target variable (bildirimsiz_sum) and make it more suitable for modeling.
train['Log_Target'] = np.log1p(train['bildirimsiz_sum'])


In [255]:
train = train.dropna()
train

Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,...,Target_Rolling_Std_7,Target_Rolling_Mean_14,Target_Rolling_Std_14,Target_Trend,Target_Seasonal,Target_Residual,Target_EWM_Mean_3,Target_EWM_Mean_7,Target_EWM_Mean_14,Log_Target
30,2021-01-31,izmir-aliaga,12,0,2021-01-31-izmir-aliaga,6,1,2021,24.0,0.0,...,6.792853,7.000000,6.348834,9.857143,0.045360,2.097497,13.669251,11.260940,8.898156,2.564949
31,2021-02-01,izmir-aliaga,3,0,2021-02-01-izmir-aliaga,0,2,2021,12.0,0.0,...,7.380799,7.071429,6.293639,8.714286,0.070319,-5.784605,8.334625,9.195705,8.111735,1.386294
32,2021-02-02,izmir-aliaga,12,0,2021-02-02-izmir-aliaga,1,2,2021,3.0,0.0,...,7.033931,7.857143,6.162631,8.571429,-0.006240,3.434811,10.167313,9.896779,8.630170,2.564949
33,2021-02-03,izmir-aliaga,5,0,2021-02-03-izmir-aliaga,2,2,2021,12.0,0.0,...,7.244045,8.000000,6.063764,5.428571,-0.063109,-0.365463,7.583656,8.672584,8.146148,1.791759
34,2021-02-04,izmir-aliaga,1,0,2021-02-04-izmir-aliaga,3,2,2021,5.0,0.0,...,7.994045,7.500000,6.345804,4.571429,-0.031605,-3.539824,4.291828,6.754438,7.193328,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48140,2024-01-20,manisa-ahmetli,2,0,2024-01-20-manisa-ahmetli,5,1,2024,2.0,0.0,...,0.786796,1.928571,1.141139,1.428571,0.021658,0.549771,1.824890,1.783654,1.833278,1.098612
48141,2024-01-21,manisa-ahmetli,2,0,2024-01-21-manisa-ahmetli,6,1,2024,2.0,0.0,...,0.755929,2.000000,1.109400,1.571429,0.045360,0.383212,1.912445,1.837741,1.855508,1.098612
48142,2024-01-22,manisa-ahmetli,1,0,2024-01-22-manisa-ahmetli,0,1,2024,2.0,0.0,...,0.755929,1.928571,1.141139,1.714286,0.070319,-0.784605,1.456222,1.628305,1.741440,0.693147
48143,2024-01-23,manisa-ahmetli,1,0,2024-01-23-manisa-ahmetli,1,1,2024,1.0,0.0,...,0.534522,1.928571,1.141139,1.714286,-0.006240,-0.708046,1.228111,1.471229,1.642582,0.693147


Creating Weather data and merging them into train data 

In [256]:
train_df = pd.merge(train, merged_data,on="unique_id")
train_df.fillna(0,inplace=True)
train_df.columns


Index(['date_x', 'ilce_x', 'bildirimsiz_sum_x', 'bildirimli_sum_x',
       'unique_id', 'days_in_week', 'month', 'year', 'bildirimsiz_lag1',
       'bildirimli_lag1', 'bildirimsiz_lag2', 'bildirimli_lag2',
       'bildirimsiz_lag3', 'bildirimli_lag3', 'bildirimsiz_lag4',
       'bildirimli_lag4', 'bildirimsiz_lag5', 'bildirimli_lag5',
       'bildirimsiz_lag6', 'bildirimli_lag6', 'bildirimsiz_lag7',
       'bildirimli_lag7', 'bildirimsiz_lag8', 'bildirimli_lag8',
       'bildirimsiz_lag9', 'bildirimli_lag9', 'Target_Diff1', 'Target_Diff2',
       'Target_Diff3', 'Target_Diff5', 'Target_Diff7', 'Target_Diff14',
       'Target_Diff30', 'Target_Encoded_Ilce', 'Target_AR_Prediction',
       'Target_Rolling_Mean_3', 'Target_Rolling_Std_3',
       'Target_Rolling_Mean_7', 'Target_Rolling_Std_7',
       'Target_Rolling_Mean_14', 'Target_Rolling_Std_14', 'Target_Trend',
       'Target_Seasonal', 'Target_Residual', 'Target_EWM_Mean_3',
       'Target_EWM_Mean_7', 'Target_EWM_Mean_14', 'Log_Targ

In [257]:

train_df['date'] = train_df['date_x']
train_df['ilce'] = train_df['ilce_x']
train_df['bildirimsiz_sum'] = train_df['bildirimsiz_sum_x']
train_df['bildirimli_sum'] = train_df['bildirimli_sum_x']
train_df['date'] = train_df['date_x']

train_df.drop(['date_x'	,'ilce_x'	,'bildirimsiz_sum_x'	,'bildirimli_sum_x','date_y'	,'ilce_y'	,'bildirimsiz_sum_y'	,'bildirimli_sum_y'] , axis=1 , inplace=True)
train_df


Unnamed: 0,unique_id,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,bildirimsiz_lag2,bildirimli_lag2,bildirimsiz_lag3,bildirimli_lag3,...,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday,date,ilce,bildirimsiz_sum,bildirimli_sum
0,2021-01-31-izmir-aliaga,6,1,2021,24.0,0.0,4.0,1.0,9.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-01-31,izmir-aliaga,12,0
1,2021-02-01-izmir-aliaga,0,2,2021,12.0,0.0,24.0,0.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-01,izmir-aliaga,3,0
2,2021-02-02-izmir-aliaga,1,2,2021,3.0,0.0,12.0,0.0,24.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-02,izmir-aliaga,12,0
3,2021-02-03-izmir-aliaga,2,2,2021,12.0,0.0,3.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-03,izmir-aliaga,5,0
4,2021-02-04-izmir-aliaga,3,2,2021,5.0,0.0,12.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2021-02-04,izmir-aliaga,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46730,2024-01-20-manisa-ahmetli,5,1,2024,2.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-20,manisa-ahmetli,2,0
46731,2024-01-21-manisa-ahmetli,6,1,2024,2.0,0.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-21,manisa-ahmetli,2,0
46732,2024-01-22-manisa-ahmetli,0,1,2024,2.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-22,manisa-ahmetli,1,0
46733,2024-01-23-manisa-ahmetli,1,1,2024,1.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-23,manisa-ahmetli,1,0


In [259]:
train_df.to_csv("all_merged.csv", index=False)


In [205]:
train_df.columns

Index(['unique_id', 'days_in_week', 'month', 'year', 'bildirimsiz_lag1',
       'bildirimli_lag1', 'bildirimsiz_lag2', 'bildirimli_lag2',
       'bildirimsiz_lag3', 'bildirimli_lag3', 'bildirimsiz_lag4',
       'bildirimli_lag4', 'bildirimsiz_lag5', 'bildirimli_lag5',
       'bildirimsiz_lag6', 'bildirimli_lag6', 'bildirimsiz_lag7',
       'bildirimli_lag7', 'bildirimsiz_lag8', 'bildirimli_lag8',
       'bildirimsiz_lag9', 'bildirimli_lag9', 'Target_Diff1', 'Target_Diff2',
       'Target_Diff3', 'Target_Diff5', 'Target_Diff7', 'Target_Diff14',
       'Target_Diff30', 'Target_Encoded_Ilce', 'Target_AR_Prediction',
       'Target_Rolling_Mean_3', 'Target_Rolling_Std_3',
       'Target_Rolling_Mean_7', 'Target_Rolling_Std_7',
       'Target_Rolling_Mean_14', 'Target_Rolling_Std_14', 'Target_Trend',
       'Target_Seasonal', 'Target_Residual', 'Target_EWM_Mean_3',
       'Target_EWM_Mean_7', 'Target_EWM_Mean_14', 'Log_Target',
       'New Year's Day', 'National Sovereignty and Children's D

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

# Hedef değişkeni ve özellikleri ayır
X = train_df.drop(columns=["bildirimsiz_sum"])
y = train_df["bildirimsiz_sum"]

# Eğitim ve test setlerini ayır
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical variables
label_encoder = LabelEncoder()
label_encoder.fit(X_test['ilce'])

X_train['ilce_encoded'] = label_encoder.fit_transform(X_train['ilce'])
X_test['ilce_encoded'] = label_encoder.transform(X_test['ilce'])  # Test setinde de aynı dönüşümü yapın

# Drop non-numeric columns
X_train_numeric = X_train.select_dtypes(include=['int', 'float', 'bool'])
X_test_numeric = X_test.select_dtypes(include=['int', 'float', 'bool'])

# Parametre aralıklarının belirlenmesi
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Model oluşturma
xgb = XGBRegressor()

# GridSearchCV ile en iyi parametre kombinasyonunun bulunması
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_numeric, y_train)

# En iyi parametrelerin bulunması
best_params = grid_search.best_params_

# En iyi parametrelerle modeli eğitme (GridSearchCV zaten en iyi modeli döndürüyor)
xgboost_model = grid_search.best_estimator_
xgboost_model.fit(X_train_numeric, y_train)
X_test_numeric


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.8s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.8s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.8s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   1.2s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   1.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.3s[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.3s

[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.3s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   1.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=5, n_es

Unnamed: 0,days_in_week,month,year,bildirimsiz_lag1,bildirimli_lag1,bildirimsiz_lag2,bildirimli_lag2,bildirimsiz_lag3,bildirimli_lag3,bildirimsiz_lag4,...,Democracy and National Unity Day,Victory Day,Republic Day,Ramadan Feast,Ramadan Feast Holiday,Sacrifice Feast,Sacrifice Feast Holiday,National Sovereignty and Children's Day; Ramadan Feast Holiday,bildirimli_sum,ilce_encoded
31604,2,9,2022,4.0,1.0,1.0,0.0,0.0,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,35
3243,0,3,2021,16.0,1.0,2.0,1.0,8.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4
22314,3,1,2022,3.0,0.0,21.0,3.0,7.0,1.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,24
19826,0,3,2021,13.0,1.0,12.0,0.0,8.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,21
3555,6,1,2022,7.0,0.0,13.0,0.0,30.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20201,5,3,2022,14.0,0.0,3.0,1.0,5.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,21
32541,3,8,2022,1.0,1.0,3.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36
24567,0,3,2022,3.0,0.0,7.0,1.0,5.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,27
22012,2,3,2021,21.0,9.0,27.0,0.0,11.0,3.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,24


In [61]:

# Test seti üzerinde tahmin yapma
xgboost_preds_test = xgboost_model.predict(X_test_numeric)

print("XGBoost Regressor modeli ile test seti üzerinde tahminler:", X_test_numeric)

# Test seti üzerinde ortalama mutlak hata hesaplama
mae = mean_absolute_error(y_test, xgboost_preds_test)
print("Test seti üzerinde Ortalama Mutlak Hata (MAE):", mae)


XGBoost Regressor modeli ile test seti üzerinde tahminler:        days_in_week  month  year  bildirimsiz_lag1  bildirimli_lag1  \
31604             2      9  2022               4.0              1.0   
3243              0      3  2021              16.0              1.0   
22314             3      1  2022               3.0              0.0   
19826             0      3  2021              13.0              1.0   
3555              6      1  2022               7.0              0.0   
...             ...    ...   ...               ...              ...   
20201             5      3  2022              14.0              0.0   
32541             3      8  2022               1.0              1.0   
24567             0      3  2022               3.0              0.0   
22012             2      3  2021              21.0              9.0   
43779             0      2  2022               2.0              0.0   

       bildirimsiz_lag2  bildirimli_lag2  bildirimsiz_lag3  bildirimli_lag3  \
31604    

In [64]:
import pandas as pd

# Test veri setini yükle
test_data = pd.read_csv("test.csv")

# 'bildirimli_sum' sütununu kaldır
test_data.drop(columns=['bildirimli_sum'], inplace=True)


# Tahmin yap
# Tarih sütununu parçalara ayırma
test_data['tarih'] = pd.to_datetime(test_data['tarih'])
test_data['days_in_week'] = test_data['tarih'].dt.dayofweek
test_data['month'] = test_data['tarih'].dt.month
test_data['year'] = test_data['tarih'].dt.year

# İlçe bilgisini encode etme
test_data['ilce_encoded'] = label_encoder.transform(test_data['ilce'])

# Kategorik olmayan sütunları seçme
test_data_numeric = test_data[['days_in_week', 'month', 'year', 'ilce_encoded']]


print( test_data_numeric)


      days_in_week  month  year  ilce_encoded
0                3      2  2024             0
1                3      2  2024             2
2                3      2  2024             3
3                3      2  2024             4
4                3      2  2024             6
...            ...    ...   ...           ...
1358             3      2  2024             5
1359             3      2  2024            23
1360             3      2  2024            26
1361             3      2  2024            38
1362             3      2  2024            30

[1363 rows x 4 columns]


In [66]:



# Tahmin yapma
test_preds = xgboost_model.predict(test_data_numeric)

# Tahminleri DataFrame'e dönüştürme
output = pd.DataFrame({'unique_id': test_data['unique_id'], 'bildirimsiz_sum': test_preds})

# Sonuçları 'submission.csv' dosyasına kaydetme
#output.to_csv('submission.csv', index=False)output


ValueError: feature_names mismatch: ['days_in_week', 'month', 'year', 'bildirimsiz_lag1', 'bildirimli_lag1', 'bildirimsiz_lag2', 'bildirimli_lag2', 'bildirimsiz_lag3', 'bildirimli_lag3', 'bildirimsiz_lag4', 'bildirimli_lag4', 'bildirimsiz_lag5', 'bildirimli_lag5', 'bildirimsiz_lag6', 'bildirimli_lag6', 'bildirimsiz_lag7', 'bildirimli_lag7', 'bildirimsiz_lag8', 'bildirimli_lag8', 'bildirimsiz_lag9', 'bildirimli_lag9', 'Target_Diff1', 'Target_Diff2', 'Target_Diff3', 'Target_Diff5', 'Target_Diff7', 'Target_Diff14', 'Target_Diff30', 'Target_Encoded_Ilce', 'Target_AR_Prediction', 'Target_Rolling_Mean_3', 'Target_Rolling_Std_3', 'Target_Rolling_Mean_7', 'Target_Rolling_Std_7', 'Target_Rolling_Mean_14', 'Target_Rolling_Std_14', 'Target_Trend', 'Target_Seasonal', 'Target_Residual', 'Target_EWM_Mean_3', 'Target_EWM_Mean_7', 'Target_EWM_Mean_14', 'Log_Target', "New Year's Day", "National Sovereignty and Children's Day", 'Labour Day', 'Commemoration of Ataturk, Youth and Sports Day', 'Democracy and National Unity Day', 'Victory Day', 'Republic Day', 'Ramadan Feast', 'Ramadan Feast Holiday', 'Sacrifice Feast', 'Sacrifice Feast Holiday', "National Sovereignty and Children's Day; Ramadan Feast Holiday", 'bildirimli_sum', 'ilce_encoded'] ['days_in_week', 'month', 'year', 'ilce_encoded']
expected Ramadan Feast Holiday, Commemoration of Ataturk, Youth and Sports Day, Target_Diff5, bildirimsiz_lag6, bildirimli_lag4, Log_Target, Target_Diff30, Target_EWM_Mean_14, Sacrifice Feast, Target_EWM_Mean_7, bildirimli_lag9, Victory Day, bildirimsiz_lag3, Republic Day, bildirimli_lag1, bildirimsiz_lag2, bildirimsiz_lag9, bildirimli_lag6, Labour Day, bildirimli_sum, bildirimli_lag7, National Sovereignty and Children's Day, National Sovereignty and Children's Day; Ramadan Feast Holiday, Ramadan Feast, Target_Diff2, Sacrifice Feast Holiday, Target_Diff14, bildirimli_lag8, Target_Rolling_Mean_14, bildirimli_lag2, Target_Encoded_Ilce, bildirimsiz_lag5, Target_Diff3, Target_Seasonal, bildirimsiz_lag8, bildirimli_lag3, Target_Diff1, Target_Rolling_Std_7, Target_AR_Prediction, Target_Rolling_Std_3, bildirimsiz_lag4, New Year's Day, Target_Rolling_Std_14, Target_Rolling_Mean_7, bildirimsiz_lag1, Target_EWM_Mean_3, bildirimli_lag5, Target_Trend, Democracy and National Unity Day, Target_Residual, bildirimsiz_lag7, Target_Rolling_Mean_3, Target_Diff7 in input data