In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the training data
# The path is relative, so the CSV is looked up from the notebook's working directory
train_data = pd.read_csv("train_set_dirty.csv")

In [4]:
# Preprocess the training data
# Keep only the first row seen for each 'date_time' value, then renumber the index from 0
train_data = (
    train_data
    .drop_duplicates(subset=['date_time'], keep='first')
    .reset_index(drop=True)
)

In [None]:
# Convert 'holiday' column to binary: 1 where a value is present, 0 where it is missing.
# The dtype guard makes this cell safe to re-run: once the column holds 0/1 integers no
# value is NaN any more, so repeating the mapping would silently turn every row into 1.
# NOTE(review): assumes non-holiday rows are read from the CSV as NaN - confirm against the data.
if not pd.api.types.is_integer_dtype(train_data['holiday']):
    train_data['holiday'] = train_data['holiday'].notna().astype('int64')

In [5]:
# Replace the text labels in 'weather_main' with integer codes.
# The fitted encoder is kept in `label_encoder` so the same label-to-code mapping can be reused later.
label_encoder = LabelEncoder()
train_data = train_data.assign(
    weather_main=label_encoder.fit_transform(train_data['weather_main'])
)

In [None]:
# Discard every row whose 'traffic_volume' (the target) is missing
train_data = train_data.dropna(axis='index', subset=['traffic_volume'])

In [36]:
# Separate the value to predict from the model inputs.
# 'date_time' and 'weather_description' are left out of the features, along with the target itself.
y_train = train_data.loc[:, 'traffic_volume']
X_train = train_data.drop(
    ['date_time', 'traffic_volume', 'weather_description'],
    axis='columns',
)

In [37]:
# Initialize the model
# Only the random seed is set here; every other hyperparameter is left at its default
regr = HistGradientBoostingRegressor(random_state=32)

In [38]:
# Candidate values for each hyperparameter explored during tuning
# (3 values for each of 4 parameters -> 81 combinations)
param_grid = dict(
    max_iter=[100, 200, 300],
    max_depth=[None, 5, 10],
    learning_rate=[0.1, 0.05, 0.01],
    min_samples_leaf=[1, 3, 5],
)

In [39]:
# Search over param_grid with 3-fold cross-validation, scoring candidates by negative MSE
grid_search = GridSearchCV(
    regr,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

In [40]:
# Get the best model from the grid search
# `best_estimator_` is read from the fitted search object and reused below for prediction
best_regr = grid_search.best_estimator_

In [41]:
# Load the test data
# The path is relative, so the CSV is looked up from the notebook's working directory
test_data = pd.read_csv('test_set_nogt.csv')

In [42]:
# Preprocess test data with the same steps that were applied to the training features:
#   - 'holiday' becomes 1 where a value is present and 0 where it is missing
#   - 'weather_main' is encoded with the encoder that was fitted on the training data
#   - 'date_time' and 'weather_description' are removed
# The result is built in a single expression and bound to `test_data` only once, so if any
# step raises, `test_data` is left untouched. Mutating column by column would overwrite
# 'holiday' before the later steps had a chance to fail, leaving a half-converted frame.
test_data = test_data.assign(
    holiday=test_data['holiday'].notna().astype('int64'),
    weather_main=label_encoder.transform(test_data['weather_main']),
).drop(columns=['date_time', 'weather_description'])

In [43]:
# Predict on the test data
# `test_data` has already been reduced to its feature columns at this point
y_pred = best_regr.predict(test_data)

In [45]:
# Print the predictions
# Long arrays are abbreviated with '...' in the printed output
print(y_pred)

[3389.18402758 3429.10632771 3605.5125776  ... 3648.31794474 3591.67761807
 3387.86459577]


In [46]:
# Build the submission table: the test frame's index serves as the ID, paired with each prediction
submission = pd.DataFrame(
    {
        'ID': test_data.index,
        'traffic_volume': y_pred,
    }
)

In [None]:
# Write the submission to disk, leaving out the DataFrame index column
submission.to_csv(
    'Third_submission.csv',
    index=False,
)