In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

# 1. Load data
# The input locations are named once so they are easy to spot and change.
# NOTE(review): '/content' looks like a Colab session directory — adjust the
# paths when running elsewhere.
TRAIN_CSV_PATH = '/content/training_dataset (1).csv'
VALIDATION_CSV_PATH = '/content/validation_set (1).csv'

df = pd.read_csv(TRAIN_CSV_PATH)
val_df = pd.read_csv(VALIDATION_CSV_PATH)

# 2. Define features and target
target = 'berlangganan_deposito'
# 'customer_number' is assumed to be a row identifier, so it is excluded from
# the feature set together with the target itself. Column order is preserved.
non_feature_columns = (target, 'customer_number')
features = [column for column in df.columns if column not in non_feature_columns]

X, y = df[features], df[target]

# 3. Split training data for model evaluation
# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Preprocessing: numeric vs categorical
# Columns are routed to a transformer by dtype. Only the dtypes listed here are
# selected; with ColumnTransformer's default remainder='drop', a column that
# matches neither list is not passed on to the model.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

# Numeric columns are standardised.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded into a dense array; a category that
# was not seen during fit is ignored rather than raising an error.
# `sparse_output` is the current name of the former `sparse` argument.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 5. Define regression models to try
# Candidates are keyed by a display name. Insertion order is kept because it
# determines the row labels of the results table built later. Every estimator
# constructed with a seed uses the same one.
MODEL_SEED = 42
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(random_state=MODEL_SEED),
    Lasso=Lasso(random_state=MODEL_SEED),
    ElasticNet=ElasticNet(random_state=MODEL_SEED),
    SVR=SVR(),
    RandomForest=RandomForestRegressor(random_state=MODEL_SEED),
    GradientBoosting=GradientBoostingRegressor(random_state=MODEL_SEED),
)

# 6. Evaluate each model with cross-validation
# cross_validate scores every fold with both metrics from a single fit, so each
# candidate is trained 5 times rather than the 10 fits that two separate
# cross_val_score calls require. The folds themselves are unchanged (cv=5).
cv_scoring = {'neg_mse': 'neg_mean_squared_error', 'r2': 'r2'}

results = []
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    cv_scores = cross_validate(pipeline, X_train, y_train, scoring=cv_scoring, cv=5)
    # Per-fold score arrays are exposed under 'test_<scorer name>'.
    neg_mse = cv_scores['test_neg_mse']
    r2 = cv_scores['test_r2']
    # scikit-learn reports MSE negated (higher is better); flip the sign and
    # take the root once to get the per-fold RMSE.
    fold_rmse = np.sqrt(-neg_mse)
    results.append({
        'model': name,
        'RMSE_mean': fold_rmse.mean(),
        'RMSE_std': fold_rmse.std(),
        'R2_mean': r2.mean(),
        'R2_std': r2.std()
    })

# Best (lowest mean RMSE) candidate ends up in the first row.
results_df = pd.DataFrame(results).sort_values('RMSE_mean')
print("Cross-validation results:\n", results_df)

# 7. Select best model (lowest RMSE)
# results_df is sorted by ascending mean RMSE, so row 0 is the winner.
best_name = results_df.iloc[0]['model']
best_model = models[best_name]
print(f"\nBest model based on CV RMSE: {best_name}")

best_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', best_model)])

# 7b. Hold-out check
# X_test / y_test were set aside before cross-validation and took no part in
# choosing the model, so scoring on them gives an estimate that is not biased
# by the selection step. The pipeline is fitted on the training split only.
best_pipeline.fit(X_train, y_train)
holdout_pred = best_pipeline.predict(X_test)
holdout_rmse = np.sqrt(mean_squared_error(y_test, holdout_pred))
holdout_r2 = r2_score(y_test, holdout_pred)
print(f"Hold-out RMSE: {holdout_rmse:.6f}, R2: {holdout_r2:.6f}")

# 8. Train best model on full training data
# Refit on every labelled row (train + hold-out) before scoring the
# validation set; this replaces the fit from the hold-out check above.
best_pipeline.fit(X, y)

# 9. Predict on validation set
# The validation frame is scored on the same feature columns used for
# training. Predictions are written onto val_df under the target column name.
X_val = val_df[features]
val_predictions = best_pipeline.predict(X_val)
val_df['berlangganan_deposito'] = val_predictions

# 10. Prepare submission
# Only the identifier and the predicted value are exported, without the index.
submission_columns = ['customer_number', 'berlangganan_deposito']
submission = val_df[submission_columns]
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")

Cross-validation results:
               model  RMSE_mean  RMSE_std   R2_mean    R2_std
6  GradientBoosting   0.281483  0.007395  0.215922  0.025805
1             Ridge   0.283621  0.007361  0.203998  0.024767
0  LinearRegression   0.283635  0.007376  0.203918  0.024784
5      RandomForest   0.293113  0.004239  0.149629  0.018173
4               SVR   0.293912  0.006927  0.145186  0.022931
2             Lasso   0.317984  0.006129 -0.000407  0.000561
3        ElasticNet   0.317984  0.006129 -0.000407  0.000561

Best model based on CV RMSE: GradientBoosting
Submission file 'submission.csv' created.
