In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np



In [None]:
def aggregate_weather_data(daily_weather):
    """Collapse weather observations to one row per district and calendar day.

    The input frame is not modified: the function works on a copy so the
    caller's ``tarih`` column keeps its original dtype and no helper columns
    are added to it.

    Parameters
    ----------
    daily_weather : pandas.DataFrame
        Must contain the key columns ``ilce`` and ``tarih`` (anything
        ``pd.to_datetime`` can parse) plus the measurement columns listed in
        the aggregation mapping below.

    Returns
    -------
    pandas.DataFrame
        One row per (``ilce``, day). ``ilce`` and ``tarih`` are regular
        columns; every aggregated column is named ``<source column>_<stat>``,
        e.g. ``t_2m:C_max``.
    """
    observations = daily_weather.copy()
    observations['tarih'] = pd.to_datetime(observations['tarih'])

    # Statistics computed per district and day for each measurement column.
    aggregations = {
        't_2m:C': ['max', 'min', 'mean', 'std'],  # temperature
        'prob_precip_1h:p': ['sum', 'max', 'mean'],  # precipitation
        'wind_speed_10m:ms': ['max', 'mean', 'std'],  # wind speed
        'wind_dir_10m:d': 'mean',  # wind direction
        'global_rad:W': 'sum',  # global radiation, summed over the day
        'effective_cloud_cover:p': ['mean', 'std'],  # cloud cover
        'relative_humidity_2m:p': ['max', 'min', 'mean'],  # humidity
    }

    # pd.Grouper(freq='D') buckets the timestamps into calendar days.
    grouped = observations.groupby(['ilce', pd.Grouper(freq='D', key='tarih')])
    aggregated = grouped.agg(aggregations)

    # Flatten the (column, statistic) MultiIndex into "column_statistic" names.
    aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]
    return aggregated.reset_index()
def create_unique_id(df):
    """Add a ``unique_id`` column of the form ``"<tarih>-<ilce>"`` and return ``df``.

    The column is written onto the frame that was passed in; the same object
    is returned for convenience.
    """
    date_text = df['tarih'].astype(str)
    district_text = df['ilce'].astype(str)
    df['unique_id'] = date_text + '-' + district_text
    return df

In [None]:
from datetime import datetime

# Load the input tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")
weather = pd.read_csv("weather.csv")
holidays = pd.read_csv("holidays.csv")

# Rename the weather keys ("date" -> "tarih", "name" -> "ilce") so they match
# the key columns used by train/test.
weather = weather.rename(columns={"date": "tarih", "name": "ilce"})

# Call keys() so the column labels are printed, not the bound-method repr.
print(weather.keys())

# Lower-case every district name so the three tables agree on the key values.
weather["ilce"] = weather["ilce"].str.lower()
train["ilce"] = train["ilce"].str.lower()
test["ilce"] = test["ilce"].str.lower()

print(len(test))
print(len(train))


In [None]:
# Reload the holiday table and give its four columns fixed positional names.
holidays = pd.read_csv('holidays.csv')
holidays.columns = ['year', 'month', 'day', 'holiday']

# Assemble a datetime key from the year/month/day parts.
holidays['tarih'] = pd.to_datetime(holidays[['year', 'month', 'day']])

# One 0/1 indicator column per distinct holiday name.
for holiday_name in holidays['holiday'].unique():
    holidays[holiday_name] = (holidays['holiday'] == holiday_name).astype(int)

# Keep only the date key and the indicator columns.
holidays = holidays.drop(columns=['holiday', 'year', 'month', 'day'])
holidays.tail()


In [None]:
# Ensure the key columns are named 'tarih' and 'ilce', then collapse the
# weather table to one row per district and day.
weather = aggregate_weather_data(
    weather.rename(columns={'date': 'tarih', 'name': 'ilce'})
)


In [None]:

# Cut-off date separating the training period from the test period.
after_date = datetime(2024, 2, 1)

# Training period: everything strictly before 2024-02-01.
weather_train = weather.loc[weather['tarih'] < after_date]
holidays_train = holidays.loc[holidays['tarih'] < after_date]

# Test period: 2024-02-01 and later.
weather_test = weather.loc[weather['tarih'] >= after_date]
holidays_test = holidays.loc[holidays['tarih'] >= after_date]

len(weather_test)


In [None]:
# There are gap days in between; they are filled with the column mean.
# This may cause problems, but it is accepted here.
from sklearn.impute import SimpleImputer

columns_to_impute = ['bildirimsiz_sum', 'bildirimli_sum']

# Mean imputer: learn the per-column means and fill the missing values in one step.
imputer = SimpleImputer(strategy='mean')
train[columns_to_impute] = imputer.fit_transform(train[columns_to_impute])

train.tail()

In [None]:
# Give the training columns fixed positional names, then parse the date column.
train.columns = ['tarih', 'ilce', 'bildirimsiz_sum', 'bildirimli_sum']
train = train.assign(tarih=pd.to_datetime(train['tarih']))
train

In [None]:
# Give the test columns fixed positional names and report the row count.
test = test.set_axis(['tarih', 'ilce', 'bildirimli_sum'], axis="columns")
print(len(test))

In [None]:

# Show the last rows of the sample submission frame.
sample_submission.tail()

In [None]:
# Show one randomly chosen row of the training frame.
train.sample()

In [None]:
def add_calendar_features(frame):
    """Parse ``tarih`` as datetime and add calendar columns to ``frame`` in place.

    Added columns (the names are kept because the model feature list refers
    to them):

    - ``days_in_month``: number of days in the date's month
    - ``days_in_year``: day of the year of the date
    - ``days_in_week``: weekday number of the date
    - ``month`` / ``year``: calendar month and year

    Returns the same frame object.
    """
    frame['tarih'] = pd.to_datetime(frame['tarih'])
    dates = frame['tarih'].dt
    frame['days_in_month'] = dates.days_in_month
    frame['days_in_year'] = dates.day_of_year
    frame['days_in_week'] = dates.weekday
    frame['month'] = dates.month
    frame['year'] = dates.year
    return frame


# Apply identical feature engineering to both splits.
test = add_calendar_features(test)
train = add_calendar_features(train)

# Make sure the weather date column is datetime as well.
weather["tarih"] = pd.to_datetime(weather["tarih"])

len(test)

In [None]:
# LabelEncoder's public home is sklearn.preprocessing; importing it from
# sklearn.calibration only works because that module imports it internally.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training districts only, then apply the same mapping
# to both splits so a district gets the same integer code everywhere.
label_encoder = LabelEncoder()
label_encoder.fit(train['ilce'])
train['ilce-value'] = label_encoder.transform(train['ilce'])
# transform() raises if test contains a district that was not seen in train.
test['ilce-value'] = label_encoder.transform(test['ilce'])


In [None]:
# Index every frame by (ilce, tarih) so the splits can be joined on that key.
test.set_index(['ilce', 'tarih'], inplace=True)
weather_test.set_index(['ilce', 'tarih'], inplace=True)
train.set_index(['ilce', 'tarih'], inplace=True)
weather_train.set_index(['ilce', 'tarih'], inplace=True)

# Attach the daily weather aggregates to each split. pd.merge defaults to an
# inner join, so rows without a matching weather record are dropped.
test_df = pd.merge(test, weather_test, on=['ilce', 'tarih'])
# train_df is built once; the earlier duplicate merge was immediately overwritten.
train_df = pd.merge(train, weather_train, left_index=True, right_index=True)

test_df

Creating the weather data and merging it into the train data

In [None]:
# Integer-code the training dates (codes follow order of first appearance).
train_index_values = train_df.index.get_level_values('tarih')
train_df['tarih-value'] = pd.factorize(train_index_values)[0]
max_train_tarih = train_df['tarih-value'].max()

# Code the test dates the same way, continuing right after the last training code.
test_index_values = test_df.index.get_level_values('tarih')
test_codes = pd.factorize(test_index_values)[0]
test_df['tarih-value'] = test_codes + max_train_tarih + 1

test_df

In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Separate the target variable from the predictors.
y = train_df["bildirimsiz_sum"]
X = train_df.drop(columns=["bildirimsiz_sum"])

# Absolute correlation of every column with the target.
corr = train_df.corr()
target_corr = corr["bildirimsiz_sum"].abs()
corr_threshold = 0.02
high_corr_features = target_corr[target_corr > corr_threshold]

# Names of the columns above the threshold, without the target itself.
hcf_names = high_corr_features.index.tolist()
hcf_names.remove("bildirimsiz_sum")

# Columns fed to the models, in the order they are passed for training and prediction.
features = [
    'bildirimli_sum', 'days_in_month', 'days_in_year',
    'days_in_week', 'month', 'year',
    't_2m:C_max', 't_2m:C_min', 't_2m:C_mean', 't_2m:C_std',
    'prob_precip_1h:p_sum', 'prob_precip_1h:p_max', 'prob_precip_1h:p_mean',
    'wind_speed_10m:ms_max', 'wind_speed_10m:ms_mean', 'wind_speed_10m:ms_std',
    'wind_dir_10m:d_mean', 'global_rad:W_sum',
    'effective_cloud_cover:p_mean', 'effective_cloud_cover:p_std',
    'relative_humidity_2m:p_max', 'relative_humidity_2m:p_min',
    'relative_humidity_2m:p_mean',
    'tarih-value', 'ilce-value',
]

In [None]:
# Show the column labels of the merged test frame.
test_df.columns




In [None]:
# Select the model feature columns from the merged test frame and display them.
X_test = test_df[features]
X_test

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each XGBoost hyper-parameter.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [50, 100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}

# Base estimator. It is deliberately not called `xgb`, so the module alias
# from `import xgboost as xgb` is not overwritten.
xgb_estimator = XGBRegressor()

# Exhaustive search over param_grid with 3-fold cross-validation.
# n_jobs should be -1 where parallel execution works; it is kept at 1 because
# the author's machine cannot run in parallel.
grid_search = GridSearchCV(estimator=xgb_estimator, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)
grid_search.fit(X, y)

# Report the best parameter combination found.
best_params = grid_search.best_params_
print("En iyi parametreler:", best_params)


In [None]:
# Read the best parameter combination from the fitted grid search.
best_params = grid_search.best_params_

In [None]:

# Train the XGBoost model with the tuned hyper-parameters.
xgboost_model = XGBRegressor(**best_params)
xgboost_model.fit(train_df[features], y)

# Train the CatBoost model with its default settings.
catboost_model = CatBoostRegressor()
catboost_model.fit(train_df[features], y)

# Build the test matrix with exactly the columns the models were trained on.
# The district is already encoded as 'ilce-value'; 'ilce' is an index level of
# test_df, not a column, so it must not be looked up or re-encoded here.
X_test = test_df[features].copy()
X_test


In [None]:
# Print the number of rows in the merged test frame.
print(len(test_df))

In [None]:


# Predict with both fitted models on the same test matrix.
xgboost_preds = xgboost_model.predict(X_test)
catboost_preds = catboost_model.predict(X_test)

# Separator line, number of test rows, then the CatBoost predictions.
print("----------------------------------", len(X_test), catboost_preds, sep="\n")

In [None]:

# Average the two models' predictions.
ensemble_preds = (catboost_preds + xgboost_preds) / 2
# Round the averaged predictions (not the CatBoost ones alone) to integers.
# int64 is used because an int8 cast overflows for values above 127.
ensemble_preds = np.round(ensemble_preds).astype(np.int64)

# Write the predictions into a copy of the sample submission and save it.
submission = sample_submission.copy()
print(ensemble_preds)
submission["bildirimsiz_sum"] = ensemble_preds
submission.to_csv("ensemble3_submission.csv", index=False)
#model2.py