In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [2]:
# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv')
)

In [3]:
# Parse the `date` column on both splits and expand it into calendar parts
# (year / month / day) that are later used as numeric model inputs.
for frame in (train_data, test_data):
    parsed_dates = pd.to_datetime(frame['date'])
    frame['date'] = parsed_dates
    frame['year'] = parsed_dates.dt.year
    frame['month'] = parsed_dates.dt.month
    frame['day'] = parsed_dates.dt.day

In [4]:
# Columns removed from the frames before they are handed to the pipeline.
drop_columns = [
    'id',
    'site_id',
    'date',
]

# Columns routed to the scaler ('num') and the one-hot encoder ('cat')
# branches of the preprocessor, respectively.
numerical_features = ['year', 'month', 'day']
categorical_features = ['city', 'country']

In [5]:
# Fill missing values in every numeric column with that column's median,
# computed separately for each split.
#
# The filled column is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary Series
# selected from the frame, which raises a FutureWarning on pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.
#
# NOTE(review): if `pm2_5` is numeric and has gaps in the training split, the
# target is imputed here as well -- confirm that is intended.
# NOTE(review): the test split is filled with its own medians rather than the
# training medians -- confirm that is intended.
for data in [train_data, test_data]:
    for col in data.select_dtypes(include=np.number).columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].median())

In [6]:
# Split the training frame into the target series and the feature frame
# (everything except the bookkeeping columns and the target itself).
y_train = train_data['pm2_5']
X_train = train_data.drop(columns=[*drop_columns, 'pm2_5'])

In [7]:
# Keep the test ids for the output file, then build the test feature frame by
# removing the same bookkeeping columns that were dropped from training.
ids_test = test_data['id']
X_test = test_data.drop(labels=drop_columns, axis='columns')

In [8]:
# Per-column-type preprocessing steps.
# Scaler for the numeric columns.
numerical_transformer = StandardScaler()
# Encoder for the categorical columns; handle_unknown='ignore' lets transform
# accept categories that were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [9]:
# Route each feature group to its transformer.
# NOTE(review): ColumnTransformer's documented default is remainder='drop', so
# any other columns still present in X_train would not reach the model --
# confirm that only the categorical and calendar features are meant to be used.
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [10]:
# Base regressor for the pipeline; random_state is fixed so repeated runs
# build the same forest. Other hyperparameters are left at their defaults
# here and tuned later through the search's `model__*` parameters.
rf_model = RandomForestRegressor(random_state=42)

In [11]:
# Chain preprocessing and the regressor so both are fitted together on each
# cross-validation fold. The step names ('preprocessor', 'model') are the
# prefixes used by the hyperparameter search keys.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model),
])

In [12]:
# Candidate values for each tuned hyperparameter of the pipeline's 'model'
# step (3 x 3 x 3 = 27 combinations in total).
n_estimators_options = [100, 200, 500]
max_depth_options = [None, 10, 20]
max_features_options = ['sqrt', 'log2', None]

param_grid = {
    'model__n_estimators': n_estimators_options,
    'model__max_depth': max_depth_options,
    'model__max_features': max_features_options,
}

In [13]:
# Hyperparameter search over `param_grid` with 5-fold cross-validation, scored
# by negative RMSE, using all available cores.
#
# `param_grid` holds only finite lists, so the number of distinct candidates is
# the product of the list lengths (27 here). Asking for n_iter=100 exceeds that
# space: scikit-learn warns on every run and silently clamps to the grid size.
# Capping n_iter up front removes the warning and makes the real budget
# explicit.
# NOTE: the len()-based count assumes every entry in `param_grid` is a list;
# revisit this if a continuous distribution is added.
n_candidates = int(np.prod([len(options) for options in param_grid.values()]))

randomized_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=min(100, n_candidates),
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Run the cross-validated search on the training features and target.
randomized_search.fit(X=X_train, y=y_train)



Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [15]:
# Take the pipeline (preprocessor + model) selected by the finished search.
best_estimator = randomized_search.best_estimator_

In [16]:
# Predict the target for the test features with the selected pipeline.
predictions = best_estimator.predict(X=X_test)

In [17]:
# Pair each test id with its predicted pm2_5 value for the output file.
submission_columns = {'id': ids_test, 'pm2_5': predictions}
predictions_df = pd.DataFrame(submission_columns)

In [18]:
# Write the predictions to CSV without the DataFrame index column.
predictions_df.to_csv(
    path_or_buf='test_predictions_rf_optimized_claude1.csv',
    index=False,
)