In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error




In [48]:

# Load the datasets
train = pd.read_csv("train.csv")
weather = pd.read_csv("weather.csv")
holidays = pd.read_csv("holidays.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [47]:
def aggregate_weather_data(daily_weather):
        
        daily_weather['date'] = pd.to_datetime(daily_weather['date'])
        daily_weather['day'] = daily_weather['date'].dt.date
        daily_weather['hour'] = daily_weather['date'].dt.hour


        daily_weather = daily_weather.groupby(['ilce', pd.Grouper(freq='D', key='date')])
        ## Apply the aggregations
        daily_weather= daily_weather.agg({
            't_2m:C': ['max', 'min', 'mean','std'],  # temperature
            'prob_precip_1h:p': ['sum', 'max' ,'mean'],  # precipitation
            'wind_speed_10m:ms': ['max', 'mean','std'],  # wind speed
            'wind_dir_10m:d': 'mean',  # wind direction
            'global_rad:W': 'sum',  # sunshine duration
            'effective_cloud_cover:p': ['mean','std'],  # cloud cover
            'relative_humidity_2m:p': ['max', 'min', 'mean']  # humidity
        })

        # Flatten the MultiIndex columns
        daily_weather.columns = ['_'.join(col).strip() for col in daily_weather.columns.values]
        daily_weather= daily_weather.reset_index() 
        return daily_weather
def create_unique_id(df):
    df['unique_id'] = df['date'].astype(str) +  '-' +df['ilce'].astype(str) 
    return df

In [49]:
# Rename columns in holidays_df and convert date format
holidays.columns = ['year', 'month', 'day', 'holiday_name']
holidays['date'] = pd.to_datetime(holidays[['year', 'month', 'day']])
holidays.drop(['year', 'month', 'day'], axis=1, inplace=True)
holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   holiday_name  55 non-null     object        
 1   date          55 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 1012.0+ bytes


In [50]:
train.columns = ['date' , 'ilce' , 'bildirimsiz_sum', 'bildirimli_sum']
train = create_unique_id(train)
train.head()
 #Process data and train model

Unnamed: 0,date,ilce,bildirimsiz_sum,bildirimli_sum,unique_id
0,2021-01-01,izmir-aliaga,5,0,2021-01-01-izmir-aliaga
1,2021-01-02,izmir-aliaga,13,0,2021-01-02-izmir-aliaga
2,2021-01-03,izmir-aliaga,4,0,2021-01-03-izmir-aliaga
3,2021-01-04,izmir-aliaga,9,0,2021-01-04-izmir-aliaga
4,2021-01-05,izmir-aliaga,2,0,2021-01-05-izmir-aliaga


In [51]:
weather.columns = ['date', 'lat', 'lon', 't_2m:C', 'effective_cloud_cover:p', 'global_rad:W', 'relative_humidity_2m:p', 'wind_dir_10m:d', 'wind_speed_10m:ms', 'prob_precip_1h:p', 't_apparent:C', 'ilce']
weather = aggregate_weather_data(weather)
weather = create_unique_id(weather)
weather.head()

Unnamed: 0,ilce,date,t_2m:C_max,t_2m:C_min,t_2m:C_mean,t_2m:C_std,prob_precip_1h:p_sum,prob_precip_1h:p_max,prob_precip_1h:p_mean,wind_speed_10m:ms_max,wind_speed_10m:ms_mean,wind_speed_10m:ms_std,wind_dir_10m:d_mean,global_rad:W_sum,effective_cloud_cover:p_mean,effective_cloud_cover:p_std,relative_humidity_2m:p_max,relative_humidity_2m:p_min,relative_humidity_2m:p_mean,unique_id
0,Izmir-Aliaga,2021-01-01,15.8,11.3,13.275,1.305923,24.0,1.0,1.0,6.4,4.266667,1.524201,167.204167,1752.5,55.045833,21.153455,91.8,75.7,85.991667,2021-01-01-Izmir-Aliaga
1,Izmir-Aliaga,2021-01-02,17.2,10.6,12.870833,2.377883,24.0,1.0,1.0,4.0,2.891667,0.474494,106.65,2228.6,36.125,14.109548,93.7,63.6,82.1375,2021-01-02-Izmir-Aliaga
2,Izmir-Aliaga,2021-01-03,15.9,11.1,12.633333,1.351864,202.3,50.6,8.429167,4.1,3.4375,0.368088,122.520833,942.4,64.870833,29.785311,84.6,73.0,79.120833,2021-01-03-Izmir-Aliaga
3,Izmir-Aliaga,2021-01-04,17.0,11.0,13.733333,2.388317,30.6,3.3,1.275,6.6,4.608333,0.917424,123.408333,1894.7,47.783333,15.557206,84.1,59.3,70.3125,2021-01-04-Izmir-Aliaga
4,Izmir-Aliaga,2021-01-05,16.8,12.2,14.375,1.522655,257.3,94.9,10.720833,7.8,3.991667,1.643939,195.720833,2297.2,30.379167,27.886337,95.2,55.6,75.220833,2021-01-05-Izmir-Aliaga


In [52]:
holidays.head()


Unnamed: 0,holiday_name,date
0,New Year's Day,2021-01-01
1,National Sovereignty and Children's Day,2021-04-23
2,Labour Day,2021-05-01
3,"Commemoration of Ataturk, Youth and Sports Day",2021-05-19
4,Democracy and National Unity Day,2021-07-15


In [53]:
test.columns= ['date','ilce','bildirimli_sum']
test = create_unique_id(test)
test.head()


Unnamed: 0,date,ilce,bildirimli_sum,unique_id
0,2024-02-01,izmir-aliaga,0,2024-02-01-izmir-aliaga
1,2024-02-01,izmir-bayindir,1,2024-02-01-izmir-bayindir
2,2024-02-01,izmir-bayrakli,0,2024-02-01-izmir-bayrakli
3,2024-02-01,izmir-bergama,1,2024-02-01-izmir-bergama
4,2024-02-01,izmir-bornova,1,2024-02-01-izmir-bornova


In [54]:

sample_submission.head()

Unnamed: 0,unique_id,bildirimsiz_sum
0,2024-02-01-izmir-aliaga,
1,2024-02-01-izmir-bayindir,
2,2024-02-01-izmir-bayrakli,
3,2024-02-01-izmir-bergama,
4,2024-02-01-izmir-bornova,


In [56]:
##@@@ TRAIN Data
train = pd.merge(train, weather, on='unique_id')

#$holiday applicationplayoffs = pd.DataFram#e({
  #'holiday': 'playoff',
  #'ds': pd.to_datetime(['2008-01-13', '2009-01-03', '2010-01-16',
                        #'2010-01-24', '2010-02-07', '2011-01-08',
                        #'2013-01-12', '2014-01-12', '2014-01-19',
                        #'2014-02-02', '2015-01-11', '2016-01-17',
                        #'2016-01-24', '2016-02-07']),
  #'lower_window': 0,
  #'upper_window': 1,
#})
#superbowls = pd.DataFrame({
  #'holiday': 'superbowl',
  #'ds': pd.to_datetime(['2010-02-07', '2014-02-02', '2016-02-07']),
  #'lower_window': 0,
  #'upper_window': 1,
#})
#holidays = pd.concat((playoffs, superbowls))

