In [29]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [30]:
# Rows that carry the target column; used below for fitting and validation.
train_data = pd.read_csv(filepath_or_buffer="MiNDAT.csv")
# Rows that are only ever passed to predict() at the end of the notebook.
test_data = pd.read_csv(filepath_or_buffer="MiNDAT_UNK.csv")


In [31]:
# Name of the column the regressors are trained to predict; it is removed from
# the feature matrix and reused as the prediction column in the submission.
target_column = "CORRUCYSTIC_DENSITY"



In [32]:
# Keep only the rows whose target value is present; a regressor cannot be
# fitted against a missing label.
train_data = train_data.loc[train_data[target_column].notna()]


In [33]:
# Target vector first, then the feature matrix (every other column).
y = train_data[target_column]
X = train_data.drop(target_column, axis="columns")

In [34]:
# Defensive re-check: rows with a missing target were already dropped from
# train_data above, so this filter is expected to keep every row.
mask = ~y.isna()
X, y = X.loc[mask], y.loc[mask]

In [35]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# Column groups are chosen purely by dtype. A column whose dtype matches
# neither list (e.g. int32, bool, category) ends up in no group.
numeric_features = (
    X_train
    .select_dtypes(include=["int64", "float64"])
    .columns
)
categorical_features = (
    X_train
    .select_dtypes(include=["object"])
    .columns
)


In [40]:
# First-draft preprocessor: standardise numeric columns and one-hot encode
# categorical ones (categories unseen during fit are encoded as all zeros).
# NOTE: this version does no imputation; a later cell rebinds `preprocessor`
# to a variant that fills missing values before scaling/encoding.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])


In [41]:
# Candidate regressors, keyed by the label printed in the comparison loop.
# Every model uses the same seed so reruns are comparable.
models = {}
models["RandomForest"] = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)
models["GradientBoosting"] = GradientBoostingRegressor(
    n_estimators=200,
    random_state=42,
)
models["XGBoost"] = xgb.XGBRegressor(
    n_estimators=300,
    random_state=42,
    tree_method="hist",
)
models["LightGBM"] = lgb.LGBMRegressor(
    n_estimators=300,
    random_state=42,
)


In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Column groups, selected by dtype from the training split.
numeric_features = (
    X_train.select_dtypes(include=["int64", "float64"]).columns
)
categorical_features = (
    X_train.select_dtypes(include=["object"]).columns
)

# Numeric branch: replace NaN with the column mean, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: replace NaN with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch. Columns in neither group are
# dropped, since ColumnTransformer's `remainder` is left at its default.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Fit every candidate on the training split and report its validation RMSE.
# `np` comes from the notebook's top import cell; without that import this
# cell only runs when numpy happens to be left over in the kernel namespace.
# All pipelines share the single `preprocessor` object, so each fit() call
# refits it on X_train.
val_rmse = {}  # label -> validation RMSE, kept so later cells can compare models
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    val_rmse[name] = rmse
    print(f"{name} RMSE: {rmse:.4f}")


RandomForest RMSE: 172.2157
GradientBoosting RMSE: 183.9197
XGBoost RMSE: 181.9850
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11266
[LightGBM] [Info] Number of data points in the train set: 8820, number of used features: 67
[LightGBM] [Info] Start training from score 448.512124
LightGBM RMSE: 178.3105




In [45]:
# Refit on every labelled row (training + validation) before predicting.
# NOTE(review): in the recorded comparison output RandomForest had the lowest
# validation RMSE, yet LightGBM with 500 trees is refitted here, a setting
# that was not itself validated. Confirm this choice is intentional.
final_model = lgb.LGBMRegressor(random_state=42, n_estimators=500)
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", final_model),
])
# Left as the cell's last expression so the fitted pipeline is displayed.
pipe.fit(X, y)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11266
[LightGBM] [Info] Number of data points in the train set: 11026, number of used features: 67
[LightGBM] [Info] Start training from score 449.481411


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [49]:
# Take the same feature columns, in the same order, that the pipeline was
# fitted on; a missing column raises KeyError here instead of inside predict().
X_test = test_data.loc[:, X_train.columns]

# Score the test rows with the fitted pipeline.
preds = pipe.predict(X_test)

# Submission layout: the identifier column followed by the predicted target.
submission = pd.DataFrame(
    {
        "LOCAL_IDENTIFIER": test_data["LOCAL_IDENTIFIER"],
        target_column: preds,
    }
)

# Write without the pandas index so the file holds only the two columns.
submission.to_csv("submission1.csv", index=False)

print("✅ submission1.csv created successfully!")




✅ submission1.csv created successfully!


✅ submission.csv created successfully
