In [94]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import  mean_absolute_error
import xgboost as xgb
from random import randint
from sklearn.model_selection import train_test_split


## Data Cleaning

In [95]:
# Import data
data = pd.read_csv(r"C:\Users\w1381\桌面\python-projects\5261\train.csv")

In [96]:
# Extract useful features
relevant_features = ['imbalance_size', 'reference_price']


In [97]:
# Check for missing values in the dataset
missing_values = data.isnull().sum()
missing_values

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 220
imbalance_buy_sell_flag          0
reference_price                220
matched_size                   220
far_price                  2894342
near_price                 2857180
bid_price                      220
bid_size                         0
ask_price                      220
ask_size                         0
wap                            220
target                          88
time_id                          0
row_id                           0
dtype: int64

In [98]:
# Create a ColumnTransformer for preprocessing of features
preprocessor = ColumnTransformer(
    transformers=[
        ('impute_scale', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), relevant_features)
    ]
)


In [99]:
# Filling NA in the "target" with median
data["target"] = SimpleImputer(strategy='median').fit_transform(data["target"].values.reshape(-1,1))

In [100]:
data["target"].isna().sum()

0

In [101]:
# resample 1% of the data (original dataset is too large over 6m rows)
sampled_data = data.sample(frac=0.0001, random_state=83)  # Randomly sample 10% of the data

In [102]:
# Extract X and y
X = sampled_data[relevant_features]
y = sampled_data["target"]

In [103]:
# Split the data for train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1012)

## Random Forests Regression

In [104]:
# Create the pipeline with the preprocessor and the RandomForestRegressor
RF_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_jobs= - 1))
])

In [105]:
# Fit the RF_pipeline model
RF_pipeline.fit(X_train, y_train)

In [106]:
y_pred = RF_pipeline.predict(X_test)
     
print("Model MAE:", mean_absolute_error(y_test, y_pred))

Model MAE: 7.3703983901834285


In [107]:
# Get the feature importances
feature_importances = RF_pipeline.named_steps['regressor'].feature_importances_
print(feature_importances)

# Retrieve feature importances and map them to the corresponding feature names
feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame to hold feature importances
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by feature importances
importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display the feature importances
importance_df

[0.46730994 0.53269006]


Unnamed: 0,Feature,Importance
1,impute_scale__reference_price,0.53269
0,impute_scale__imbalance_size,0.46731


In [108]:
# # Define the parameter distribution for RandomForestRegressor with a reduced range and simpler models
# param_dist = {
#     'regressor__n_estimators': randint(20, 50),  # fewer trees
#     'regressor__max_depth': [5, 10],  # fewer options for max depth
#     'regressor__min_samples_split': randint(6, 10), # fewer split
#     'regressor__min_samples_leaf': randint(4, 6) # fewer leaves
# }


# # Instantiate the RandomizedSearchCV object with the pipeline and parameter distribution
# random_search = RandomizedSearchCV(RF_pipeline, param_distributions=param_dist,
#                                    n_iter=5,  # fewer iterations
#                                    cv=3,
#                                    scoring='neg_mean_squared_error',
#                                    n_jobs=-1)

# # Fit the RandomizedSearchCV object to the data
# random_search.fit(X_train_sample, y_train_sample)

# # View the best parameters from the hyperparameter tuning
# print("Best hyperparameters:", random_search.best_params_)

# # View the best score found during the search
# print("Best MSE score from Random Search:", -random_search.best_score_)

In [109]:
# Extract the best model from randomized search
RF_best_model = random_search.best_estimator_



In [110]:
# Make prediction using RF_best_model
y_pred = RF_best_model.predict(X_test)
