In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error




In [18]:
class MLModel:
    def __init__(self, model_type='catboost'):
        self.model_type = model_type
        self.scaler = StandardScaler

    def preprocess_data(self, train_df, weather_df, holidays_df=None):
        # Merge train_df with weather_df based on 'date' and 'ilce' columns
        # train_df = train_df.merge(weather_df, on=['date', 'ilce'], how='left')
    
        # If holidays_df is provided, merge train_df with holidays_df based on 'date' and 'ilce' columns
        if holidays_df is not None:
            train_df = train_df.merge(holidays_df, on=['date', 'ilce'], how='left')
    
        # Fill missing values in the resulting DataFrame using the ffill method
        train_df.fillna(method='ffill', inplace=True)
    
        return train_df

    def aggregate_weather_data(self, daily_weather):
        daily_weather = daily_weather.agg({
            't_2m:C': ['max', 'min', 'mean', 'std'],  
            'prob_precip_1h:p': ['sum', 'max', 'mean'],  
            'wind_speed_10m:ms': ['max', 'mean', 'std'],  
            'wind_dir_10m:d': 'mean',  
            'global_rad:W': 'sum',  
            'effective_cloud_cover:p': ['mean', 'std'],  
            'relative_humidity_2m:p': ['max', 'min', 'mean']  
        })
        daily_weather.columns = ['_'.join(col).strip() for col in daily_weather.columns.values]
        daily_weather = daily_weather.reset_index()
        daily_weather = daily_weather.rename(columns={'date': 'tarih', 'name': 'ilce'})
        return daily_weather

    def split_data(self, train_df):
        features = [col for col in train_df.columns if col not in ['tarih', 'ilce', 'bildirimsiz_sum', 'bildirimli_sum']]
        X = train_df[features]
        y = train_df["bildirimsiz_sum"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        return X_train, X_test, y_train, y_test

    def train_model(self, X_train, X_test, y_train, y_test):
        model = CatBoostRegressor()
        params = {'learning_rate': [0.01, 0.1, 1], 'depth': [3, 5, 7], 'l2_leaf_reg': [0.1, 1, 10]}
        grid = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error')
        grid.fit(X_train, y_train)
        model = CatBoostRegressor(**grid.best_params_)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return preds

In [19]:

# Load the datasets
train = pd.read_csv("train.csv")
weather = pd.read_csv("weather.csv")
holidays = pd.read_csv("holidays.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [20]:
# Rename columns in holidays_df and convert date format
holidays.columns = ['year', 'month', 'day', 'holiday_name']
holidays['date'] = pd.to_datetime(holidays[['year', 'month', 'day']])
holidays.drop(['year', 'month', 'day'], axis=1, inplace=True)
holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   holiday_name  55 non-null     object        
 1   date          55 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 1012.0+ bytes


In [21]:
# Instantiate the model class
model = MLModel(model_type='catboost')

In [22]:
train.head()
 #Process data and train model

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum
0,2021-01-01,izmir-aliaga,5,0
1,2021-01-02,izmir-aliaga,13,0
2,2021-01-03,izmir-aliaga,4,0
3,2021-01-04,izmir-aliaga,9,0
4,2021-01-05,izmir-aliaga,2,0


In [23]:
# Rename columns in weather_df to English
weather.columns = ['date', 'lat', 'lon', 't_2m:C', 'effective_cloud_cover:p', 'global_rad:W', 'relative_humidity_2m:p', 'wind_dir_10m:ms', 'wind_speed_10m:ms', 'prob_precip_1h:p', 't_apparent:C', 'ilce']
location_data = weather[['lat', 'lon', 'ilce']]
weather.drop(['lat', 'lon'], axis=1, inplace=True)
weather.loc[:, 'ilce'] = weather['ilce'].apply(lambda x: x.lower())
weather.head()

Unnamed: 0,date,t_2m:C,effective_cloud_cover:p,global_rad:W,relative_humidity_2m:p,wind_dir_10m:ms,wind_speed_10m:ms,prob_precip_1h:p,t_apparent:C,ilce
0,2021-01-01 00:00:00,7.6,10.5,0.0,86.1,173.3,2.6,1.0,5.9,manisa-ahmetli
1,2021-01-01 01:00:00,7.4,6.8,0.0,88.0,174.5,2.7,1.0,5.7,manisa-ahmetli
2,2021-01-01 02:00:00,7.3,4.3,0.0,90.0,177.5,2.8,1.0,5.4,manisa-ahmetli
3,2021-01-01 03:00:00,7.2,8.9,0.0,90.8,175.3,2.7,1.0,5.3,manisa-ahmetli
4,2021-01-01 04:00:00,8.0,22.1,0.0,89.7,172.1,2.5,1.0,6.5,manisa-ahmetli


In [24]:
holidays.head()


Unnamed: 0,holiday_name,date
0,New Year's Day,2021-01-01
1,National Sovereignty and Children's Day,2021-04-23
2,Labour Day,2021-05-01
3,"Commemoration of Ataturk, Youth and Sports Day",2021-05-19
4,Democracy and National Unity Day,2021-07-15


In [25]:
test.columns= ['date','ilce','bildirimli_sum']
test.head()


Unnamed: 0,date,ilce,bildirimli_sum
0,2024-02-01,izmir-aliaga,0
1,2024-02-01,izmir-bayindir,1
2,2024-02-01,izmir-bayrakli,0
3,2024-02-01,izmir-bergama,1
4,2024-02-01,izmir-bornova,1


In [26]:

sample_submission.head()

Unnamed: 0,unique_id,bildirimsiz_sum
0,2024-02-01-izmir-aliaga,
1,2024-02-01-izmir-bayindir,
2,2024-02-01-izmir-bayrakli,
3,2024-02-01-izmir-bergama,
4,2024-02-01-izmir-bornova,


In [27]:

model.preprocess_data(train, weather, holidays)

KeyError: 'date'

In [None]:

# Evaluate the model
model.evaluate_model(model.X_val_scaled, model.y_val)



In [None]:
# Make predictions on the test data
test_predictions = model.model.predict(test.drop(['bildirimli_sum'], axis=1))



In [None]:
# Make predictions on the sample submission data
sample_submission_predictions = model.model.predict(sample_submission.drop(['bildirimsiz_sum'], axis=1))



In [None]:
# Save the predictions to CSV files
pd.DataFrame({'bildirimli_sum': test_predictions}).to_csv('test_predictions.csv', index=False)
pd.DataFrame({'bildirimsiz_sum': sample_submission_predictions}).to_csv('sample_submission_predictions.csv', index=False)

In [None]:
 Create an instance of the MLModel class
model = MLModel(model_type='catboost')

# Process and train the model
model.process_data_and_train(train, weather, holidays, test, sample_submission)

# Evaluate the model
model.evaluate_model(model.X_val_scaled, model.y_val)

# Make predictions on the test data
test_predictions = model.model.predict(test.drop(['bildirimli_sum'], axis=1))

# Make predictions on the sample submission data
sample_submission_predictions = model.model.predict(sample_submission.drop(['bildirimsiz_sum'], axis=1))

# Save the predictions to a CSV file (optional)
pd.DataFrame({'bildirimli_sum': test_predictions}).to_csv('test_predictions.csv', index=False)
pd.DataFrame({'bildirimsiz_sum': sample_submission_predictions}).to_csv('sample_submission_predictions.csv', index=False)