In [180]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error




In [181]:
class MLModel:
    def __init__(self, model_type='catboost'):
        self.model_type = model_type
        self.scaler = StandardScaler

    def preprocess_data(self, train_df, weather_df, holidays_df=None):
    #######burasi cok sikinti cok######
        train_df = pd.merge(train_df,weather_df,['date','ilce'])
    
        # Fill missing values in the resulting DataFrame using the ffill method
    
        return train_df

    def aggregate_weather_data(self, daily_weather):
        
        daily_weather['date'] = pd.to_datetime(daily_weather['date'])
        daily_weather['day'] = daily_weather['date'].dt.date
        daily_weather['hour'] = daily_weather['date'].dt.hour


        daily_weather = daily_weather.groupby(['ilce', pd.Grouper(freq='D', key='date')])
        ## Apply the aggregations
        daily_weather= daily_weather.agg({
            't_2m:C': ['max', 'min', 'mean','std'],  # temperature
            'prob_precip_1h:p': ['sum', 'max' ,'mean'],  # precipitation
            'wind_speed_10m:ms': ['max', 'mean','std'],  # wind speed
            'wind_dir_10m:d': 'mean',  # wind direction
            'global_rad:W': 'sum',  # sunshine duration
            'effective_cloud_cover:p': ['mean','std'],  # cloud cover
            'relative_humidity_2m:p': ['max', 'min', 'mean']  # humidity
        })

        # Flatten the MultiIndex columns
        daily_weather.columns = ['_'.join(col).strip() for col in daily_weather.columns.values]
        daily_weather= daily_weather.reset_index() 
        return daily_weather

    def split_data(self, train_df):
        features = [col for col in train_df.columns if col not in ['tarih', 'ilce', 'bildirimsiz_sum', 'bildirimli_sum']]
        X = train_df[features]
        y = train_df["bildirimsiz_sum"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        return X_train, X_test, y_train, y_test

    def train_model(self, X_train, X_test, y_train, y_test):
        model = CatBoostRegressor()
        params = {'learning_rate': [0.01, 0.1, 1], 'depth': [3, 5, 7], 'l2_leaf_reg': [0.1, 1, 10]}
        grid = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error')
        grid.fit(X_train, y_train)
        model = CatBoostRegressor(**grid.best_params_)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return preds

In [182]:

# Load the datasets
train = pd.read_csv("train.csv")
weather = pd.read_csv("weather.csv")
holidays = pd.read_csv("holidays.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [183]:
# Rename columns in holidays_df and convert date format
holidays.columns = ['year', 'month', 'day', 'holiday_name']
holidays['date'] = pd.to_datetime(holidays[['year', 'month', 'day']])
holidays.drop(['year', 'month', 'day'], axis=1, inplace=True)
holidays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   holiday_name  55 non-null     object        
 1   date          55 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 1012.0+ bytes


In [184]:
# Instantiate the model class
model = MLModel(model_type='catboost')

In [185]:
train.head()
 #Process data and train model

Unnamed: 0,tarih,ilce,bildirimsiz_sum,bildirimli_sum
0,2021-01-01,izmir-aliaga,5,0
1,2021-01-02,izmir-aliaga,13,0
2,2021-01-03,izmir-aliaga,4,0
3,2021-01-04,izmir-aliaga,9,0
4,2021-01-05,izmir-aliaga,2,0


In [186]:
# Rename columns in weather_df to English
weather.columns = ['date', 'lat', 'lon', 't_2m:C', 'effective_cloud_cover:p', 'global_rad:W', 'relative_humidity_2m:p', 'wind_dir_10m:d', 'wind_speed_10m:ms', 'prob_precip_1h:p', 't_apparent:C', 'ilce']
#location_data = weather[['lat', 'lon', 'ilce']]
#weather.drop(['lat', 'lon'], axis=1, inplace=True)
#weather.loc[:, 'ilce'] = weather['ilce'].apply(lambda x: x.lower())
#weather.head()

In [187]:
holidays.head()


Unnamed: 0,holiday_name,date
0,New Year's Day,2021-01-01
1,National Sovereignty and Children's Day,2021-04-23
2,Labour Day,2021-05-01
3,"Commemoration of Ataturk, Youth and Sports Day",2021-05-19
4,Democracy and National Unity Day,2021-07-15


In [188]:
test.columns= ['date','ilce','bildirimli_sum']
test.head()


Unnamed: 0,date,ilce,bildirimli_sum
0,2024-02-01,izmir-aliaga,0
1,2024-02-01,izmir-bayindir,1
2,2024-02-01,izmir-bayrakli,0
3,2024-02-01,izmir-bergama,1
4,2024-02-01,izmir-bornova,1


In [189]:

sample_submission.head()

Unnamed: 0,unique_id,bildirimsiz_sum
0,2024-02-01-izmir-aliaga,
1,2024-02-01-izmir-bayindir,
2,2024-02-01-izmir-bayrakli,
3,2024-02-01-izmir-bergama,
4,2024-02-01-izmir-bornova,


In [190]:
dailyw= model.aggregate_weather_data(daily_weather=weather)
dailyw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54285 entries, 0 to 54284
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   ilce                          54285 non-null  object        
 1   date                          54285 non-null  datetime64[ns]
 2   t_2m:C_max                    54285 non-null  float64       
 3   t_2m:C_min                    54285 non-null  float64       
 4   t_2m:C_mean                   54285 non-null  float64       
 5   t_2m:C_std                    54285 non-null  float64       
 6   prob_precip_1h:p_sum          54285 non-null  float64       
 7   prob_precip_1h:p_max          54285 non-null  float64       
 8   prob_precip_1h:p_mean         54285 non-null  float64       
 9   wind_speed_10m:ms_max         54285 non-null  float64       
 10  wind_speed_10m:ms_mean        54285 non-null  float64       
 11  wind_speed_10m:ms_std       

In [192]:

model.preprocess_data(train, weather, holidays)

UnboundLocalError: cannot access local variable 'lidx' where it is not associated with a value

In [None]:

# Evaluate the model
model.evaluate_model(model.X_val_scaled, model.y_val)



In [None]:
# Make predictions on the test data
test_predictions = model.model.predict(test.drop(['bildirimli_sum'], axis=1))



In [None]:
# Make predictions on the sample submission data
sample_submission_predictions = model.model.predict(sample_submission.drop(['bildirimsiz_sum'], axis=1))



In [None]:
# Save the predictions to CSV files
pd.DataFrame({'bildirimli_sum': test_predictions}).to_csv('test_predictions.csv', index=False)
pd.DataFrame({'bildirimsiz_sum': sample_submission_predictions}).to_csv('sample_submission_predictions.csv', index=False)

In [None]:
 Create an instance of the MLModel class
model = MLModel(model_type='catboost')

# Process and train the model
model.process_data_and_train(train, weather, holidays, test, sample_submission)

# Evaluate the model
model.evaluate_model(model.X_val_scaled, model.y_val)

# Make predictions on the test data
test_predictions = model.model.predict(test.drop(['bildirimli_sum'], axis=1))

# Make predictions on the sample submission data
sample_submission_predictions = model.model.predict(sample_submission.drop(['bildirimsiz_sum'], axis=1))

# Save the predictions to a CSV file (optional)
pd.DataFrame({'bildirimli_sum': test_predictions}).to_csv('test_predictions.csv', index=False)
pd.DataFrame({'bildirimsiz_sum': sample_submission_predictions}).to_csv('sample_submission_predictions.csv', index=False)