In [1]:
!pip install vecstack
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score #works
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter #for Smote,
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")



In [4]:
# Load the train and test CSVs from the working directory.
train_data = pd.read_csv('RevisedHomesiteTrain1.csv')
# GeographicField64 is removed from the test frame immediately after reading it.
test_data = pd.read_csv('RevisedHomesiteTest1.csv').drop(columns=['GeographicField64'])

# Report (rows, columns) for train first, then test.
for frame in (train_data, test_data):
    print(frame.shape)

(65000, 596)
(173836, 595)


In [5]:
# Features are every column except the target; the target is QuoteConversion_Flag.
X_train = train_data.drop(columns=['QuoteConversion_Flag'])
y_train = train_data.loc[:, 'QuoteConversion_Flag']
(X_train.shape, y_train.shape)

((65000, 595), (65000,))

In [6]:
# Give features and labels a fresh positional index so the two stay row-aligned.
X_train, y_train = (obj.reset_index(drop=True) for obj in (X_train, y_train))

# Both shapes should report the same number of rows.
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of X_train: (65000, 595)
Shape of y_train: (65000,)


In [7]:
# Learn the per-feature min/max from the training features only.
scaler = MinMaxScaler().fit(X_train)

# Scale the training features with the fitted scaler.
X_train_scaled = scaler.transform(X_train)
print(X_train_scaled.shape)

# Apply the same (training-derived) scaling to the test features.
X_test_scaled = scaler.transform(test_data)
print(X_test_scaled.shape)

(65000, 595)
(173836, 595)


In [8]:
# Splitting the training set: hold out 20% of the rows for validation.
#
# The labels are re-derived from train_data rather than read from the current
# y_train. This cell rebinds y_train to the 80% training labels, so splitting
# "X_train vs. y_train" a second time would pair the full feature frame with the
# already-shrunk label series and fail. Taking the labels from train_data keeps
# the cell safe to re-run and yields the same split on the first run.
#
# NOTE(review): the split is taken from the unscaled X_train; X_train_scaled and
# X_test_scaled are not consumed here. Switching to the scaled features would
# also require passing X_test_scaled (instead of test_data) to the stacking step.
y_full = train_data['QuoteConversion_Flag'].reset_index(drop=True)
X_train1, X_val, y_train, y_val = train_test_split(X_train, y_full, test_size=0.2, random_state=42)

In [9]:
# Decision tree

# Baseline: a tree with default hyperparameters, fitted on the training split.
decisiont = DecisionTreeClassifier().fit(X_train1, y_train)

print("Model Parameters:", decisiont.get_params())

# Score the baseline on the held-out validation rows.
Y_val_pred = decisiont.predict(X_val)
accuracy = accuracy_score(y_val, Y_val_pred)
print(f"Accuracy Score (validation) for Decision Tree: {accuracy:.6f}")

Model Parameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
Accuracy Score (validation) for Decision Tree: 0.889538


In [58]:
# Search grid shared by the tree-based hyperparameter searches
parameters = dict(
    min_samples_split=[2, 10, 20],
    max_depth=[10, 20, 30],
)

In [57]:
# Hyperparameter search for the decision tree over the `parameters` grid.
# NOTE(review): that grid holds 3 x 3 = 9 combinations, fewer than n_iter=15.
decisiont_random = RandomizedSearchCV(
    estimator=decisiont,
    param_distributions=parameters,
    n_iter=15,
)
decisiont_random.fit(X_train1, y_train)
grid_parm = decisiont_random.best_params_
print(grid_parm)

# Fit a fresh tree with the winning parameters and predict the validation split.
dt = DecisionTreeClassifier(**grid_parm).fit(X_train1, y_train)
dt_predict = dt.predict(X_val)

{'min_samples_split': 2, 'max_depth': 10}


In [59]:
# Validation accuracy of the tuned decision tree
accuracy = accuracy_score(y_val, dt_predict)
print(f"Accuracy Score (validation) for Decision Tree after Hyperparameter Tuning: {accuracy:.6f}")

# Confusion matrix (rows: true class, columns: predicted class)
dt_confusion = confusion_matrix(y_val, dt_predict)
print("Confusion Matrix after Hyperparameter Tuning for Decision Tree:")
print(dt_confusion)

# Per-class precision / recall / F1
dt_report = classification_report(y_val, dt_predict)
print("=== Classification Report ===")
print(dt_report)

# 10-fold cross-validated ROC AUC on the training split
dt_cv_score = cross_val_score(dt, X_train1, y_train, scoring="roc_auc", cv=10)
print("=== All AUC Scores ===")
print(dt_cv_score)

Accuracy Score (validation) for Decision Tree after Hyperparameter Tuning: 0.917846
Confusion Matrix after Hyperparameter Tuning for Decision Tree:
[[10286   269]
 [  799  1646]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     10555
           1       0.86      0.67      0.76      2445

    accuracy                           0.92     13000
   macro avg       0.89      0.82      0.85     13000
weighted avg       0.92      0.92      0.91     13000

=== All AUC Scores ===
[0.94355506 0.93315493 0.93040935 0.94200384 0.93927236 0.93967723
 0.93172132 0.92823599 0.9342832  0.94222052]


In [10]:
# Random forest
# Baseline: a forest with default hyperparameters, fitted on the training split.
randomforest = RandomForestClassifier().fit(X_train1, y_train)

# Score the baseline on the held-out validation rows.
randomf_predict = randomforest.predict(X_val)
accuracy = accuracy_score(y_val, randomf_predict)
print(f"Accuracy Score (validation) for Random Forest: {accuracy:.6f}")

# Confusion matrix (rows: true class, columns: predicted class)
randomf_confusion = confusion_matrix(y_val, randomf_predict)
print("Confusion Matrix for Random Forest:")
print(randomf_confusion)

Accuracy Score (validation) for Random Forest: 0.908000
Confusion Matrix for Random Forest:
[[10356   199]
 [  997  1448]]


In [61]:
# Hyperparameter search for the random forest over the `parameters` grid.
rf_random = RandomizedSearchCV(
    estimator=randomforest,
    param_distributions=parameters,
    n_iter=15,
    n_jobs=-1,
    random_state=42,
)
rf_random.fit(X_train1, y_train)

# Winning parameter combination
grid_parm_rf = rf_random.best_params_
print("Best Parameters from Hyperparameter Tuning:", grid_parm_rf)

# Fit a fresh forest with the winning parameters and predict the validation split.
rf = RandomForestClassifier(**grid_parm_rf).fit(X_train1, y_train)
rf_predict = rf.predict(X_val)

# Validation accuracy
accuracy = accuracy_score(y_val, rf_predict)
print(f"Accuracy Score (validation) after Hyperparameter Tuning for Random Forest: {accuracy:.6f}")

# Confusion matrix (rows: true class, columns: predicted class)
rf_confusion = confusion_matrix(y_val, rf_predict)
print("Confusion Matrix after Hyperparameter Tuning for Random Forest:")
print(rf_confusion)

# Per-class precision / recall / F1
rf_report = classification_report(y_val, rf_predict)
print("=== Classification Report ===")
print(rf_report)

# 10-fold cross-validated ROC AUC on the training split
rf_cv_score = cross_val_score(rf, X_train1, y_train, scoring="roc_auc", cv=10)
print("=== All AUC Scores ===")
print(rf_cv_score)

print("\n")
print("=== Mean AUC Score ===")
print(f"Mean AUC Score - Random Forest: {rf_cv_score.mean():.6f}")

Best Parameters from Hyperparameter Tuning: {'min_samples_split': 20, 'max_depth': 30}
Accuracy Score (validation) after Hyperparameter Tuning for Random Forest: 0.908692
Confusion Matrix after Hyperparameter Tuning for Random Forest:
[[10375   180]
 [ 1007  1438]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.95     10555
           1       0.89      0.59      0.71      2445

    accuracy                           0.91     13000
   macro avg       0.90      0.79      0.83     13000
weighted avg       0.91      0.91      0.90     13000

=== All AUC Scores ===
[0.94726081 0.94464341 0.94588313 0.95070419 0.94559226 0.94586048
 0.94352928 0.94545054 0.94960088 0.9484599 ]


=== Mean AUC Score ===
Mean AUC Score - Random Forest: 0.946698


In [None]:
# MultiLayer Perceptron Model

# Baseline MLP, capped at 20 training iterations, fitted on the training split.
mlp = MLPClassifier(max_iter=20)
mlp.fit(X_train1, y_train)

# Predict the held-out validation rows (not the competition test set).
mlp_predict = mlp.predict(X_val)

# Validation accuracy. The label says "validation" to match what is actually
# measured here and the wording used for the other models.
accuracy = accuracy_score(y_val, mlp_predict)
print(f"Accuracy Score (validation) for MultiLayer Perceptron: {accuracy:.6f}")

# Confusion matrix (rows: true class, columns: predicted class)
print("Confusion Matrix for MultiLayer Perceptron:")
print(confusion_matrix(y_val, mlp_predict))

In [63]:
# Hyperparameter grid for MLP
parameters_mlp = {'hidden_layer_sizes': [(50,), (30, 10)],
                  'activation': ['relu'],
                  'learning_rate': ['adaptive']
                  }

# Start from a default MLPClassifier
mlp = MLPClassifier()

# Randomized search with 2-fold cross-validation.
# NOTE(review): the grid above only contains 2 combinations, fewer than n_iter=5.
mlp_random = RandomizedSearchCV(mlp, parameters_mlp, n_iter=5, cv=2, random_state=42, n_jobs=-1)
mlp_random.fit(X_train1, y_train)

# Best hyperparameters after tuning
grid_parm = mlp_random.best_params_
print("Best Hyperparameters from RandomizedSearchCV:", grid_parm)

# Train a fresh model with the best hyperparameters
mlp = MLPClassifier(**grid_parm)
mlp.fit(X_train1, y_train)

# Predict the held-out validation rows (not the competition test set).
mlp_predict = mlp.predict(X_val)

# Validation accuracy. The label says "validation" to match what is actually
# measured here and the wording used for the other models.
accuracy = accuracy_score(y_val, mlp_predict)
print(f"Accuracy Score (validation) after Hyperparameter Tuning for MultiLayer Perceptron: {accuracy:.6f}")

# Confusion Matrix (rows: true class, columns: predicted class)
print("Confusion Matrix after Hyperparameter Tuning for MultiLayer Perceptron:")
print(confusion_matrix(y_val, mlp_predict))

# Classification Report
print("=== Classification Report ===")
print(classification_report(y_val, mlp_predict))

# 5-fold cross-validated ROC AUC on the training split
mlp_cv_score = cross_val_score(mlp, X_train1, y_train, cv=5, scoring="roc_auc")
print("=== All AUC Scores ===")
print(mlp_cv_score)

print("\n")
print("=== Mean AUC Score ===")
print(f"Mean AUC Score - MultiLayer Perceptron: {mlp_cv_score.mean():.6f}")

Best Hyperparameters from RandomizedSearchCV: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (50,), 'activation': 'relu'}
Accuracy Score (test) after Hyperparameter Tuning for MultiLayer Perceptron: 0.609538
Confusion Matrix after Hyperparameter Tuning for MultiLayer Perceptron:
[[5883 4672]
 [ 404 2041]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.94      0.56      0.70     10555
           1       0.30      0.83      0.45      2445

    accuracy                           0.61     13000
   macro avg       0.62      0.70      0.57     13000
weighted avg       0.82      0.61      0.65     13000

=== All AUC Scores ===
[0.84001733 0.6812623  0.84425955 0.87626999 0.83369988]


=== Mean AUC Score ===
Mean AUC Score - MultiLayer Perceptron: 0.815102


In [12]:
# Linear Support Vector Machine Model

# Linear SVM capped at 100 solver iterations, fitted on the training split.
linsvm = LinearSVC(max_iter=100).fit(X_train1, y_train)

# Score on the held-out validation rows.
linsvm_predict = linsvm.predict(X_val)
accuracy = accuracy_score(y_val, linsvm_predict)
print(f"Accuracy Score (validation) for Linear SVM Classifier: {accuracy:.6f}")

# Confusion matrix (rows: true class, columns: predicted class)
linsvm_confusion = confusion_matrix(y_val, linsvm_predict)
print("Confusion Matrix for Linear SVM Classifier:")
print(linsvm_confusion)


Accuracy Score (validation) for Linear SVM Classifier: 0.706538
Confusion Matrix for Linear SVM Classifier:
[[8104 2451]
 [1364 1081]]


In [14]:
# K-Nearest Neighbor Model

# Baseline: 3-nearest-neighbours classifier fitted on the training split.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train1, y_train)

# Score on the held-out validation rows.
neigh_predict = neigh.predict(X_val)
accuracy = accuracy_score(y_val, neigh_predict)
print(f"Accuracy Score (validation) for KNeighborsClassifier: {accuracy:.6f}")

# Confusion matrix (rows: true class, columns: predicted class)
neigh_confusion = confusion_matrix(y_val, neigh_predict)
print("Confusion Matrix for KNeighborsClassifier:")
print(neigh_confusion)

Accuracy Score (validation) for KNeighborsClassifier: 0.759692
Confusion Matrix for KNeighborsClassifier:
[[9606  949]
 [2175  270]]


In [64]:
# Hyperparameter tuning for K-Nearest Neighbor classifier

# The KNN grid gets its own name (like parameters_mlp) so it does not overwrite
# the global `parameters` grid that the decision-tree and random-forest search
# cells read; rebinding it would break those cells when they are re-run.
parameters_knn = {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance'], 'p': [2]}

# Randomized search over the KNN grid.
# NOTE(review): the grid holds 2 x 2 x 1 = 4 combinations, fewer than n_iter=5.
neigh = KNeighborsClassifier()
neigh_random = RandomizedSearchCV(neigh, parameters_knn, n_iter=5, random_state=42, n_jobs=-1)
neigh_random.fit(X_train1, y_train)

# Best hyperparameters
grid_parm = neigh_random.best_params_
print("Best Hyperparameters:", grid_parm)

# Train a fresh KNeighborsClassifier with the best parameters
neigh = KNeighborsClassifier(**grid_parm)
neigh.fit(X_train1, y_train)

# Predict the held-out validation rows
neigh_predict = neigh.predict(X_val)

# Validation accuracy
accuracy = accuracy_score(y_val, neigh_predict)
print(f"Accuracy Score (validation) after hyperparameter tuning: {accuracy:.6f}")

# Confusion Matrix (rows: true class, columns: predicted class)
print("Confusion Matrix after hyperparameter tuning for KNeighborsClassifier:")
print(confusion_matrix(y_val, neigh_predict))

# Classification Report
print("=== Classification Report ===")
print(classification_report(y_val, neigh_predict))

# 5-fold cross-validated ROC AUC on the training split
neigh_cv_score = cross_val_score(neigh, X_train1, y_train, cv=5, scoring="roc_auc")
print("=== All AUC Scores ===")
print(neigh_cv_score)

# Mean AUC Score
print("=== Mean AUC Score ===")
print(f"Mean AUC Score - KNeighborsClassifier: {neigh_cv_score.mean():.6f}")

Best Hyperparameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 5}
Accuracy Score (validation) after hyperparameter tuning: 0.782462
Confusion Matrix after hyperparameter tuning for KNeighborsClassifier:
[[10037   518]
 [ 2310   135]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.81      0.95      0.88     10555
           1       0.21      0.06      0.09      2445

    accuracy                           0.78     13000
   macro avg       0.51      0.50      0.48     13000
weighted avg       0.70      0.78      0.73     13000

=== All AUC Scores ===
[0.52077107 0.52705416 0.53566399 0.5315863  0.53492147]
=== Mean AUC Score ===
Mean AUC Score - KNeighborsClassifier: 0.529999


In [76]:
from collections import Counter

print("\nSMOTE")
class_counts_before = Counter(y_train)
print('Original dataset shape %s' % class_counts_before)

# Oversample the training split with SMOTE at sampling_strategy=0.4
# (0.3 and 0.5 were also tried and are left out here).
sm = SMOTE(random_state=42, sampling_strategy=0.4)
X_res, y_res = sm.fit_resample(X_train1, y_train)

class_counts_after = Counter(y_res)
print('Resampled dataset shape %s' % class_counts_after)


SMOTE
Original dataset shape Counter({0: 42183, 1: 9817})
Resampled dataset shape Counter({0: 42183, 1: 16873})


In [77]:
print("\nEnsemble Methods Predictions\n")

# Base models handed to the stacking step.
models = [dt, rf, mlp, linsvm, neigh]

# Validation-set predictions from each already-fitted base model.
predictions = []
for model in models:
    predictions.append(model.predict(X_val))

# Stacking configuration: classification with class-label outputs,
# 2 stratified shuffled folds, accuracy reported per fold.
stacking_options = dict(
    regression=False,
    mode='oof_pred_bag',
    needs_proba=False,
    save_dir=None,
    metric=accuracy_score,
    n_folds=2,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=2,
)

# Build the stacked feature matrices from the resampled training data and the test frame.
S_Train, S_Test = stacking(models, X_res, y_res, test_data, **stacking_options)


Ensemble Methods Predictions

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [DecisionTreeClassifier]
    fold  0:  [0.90893389]
    fold  1:  [0.90930642]
    ----
    MEAN:     [0.90912016] + [0.00018626]
    FULL:     [0.90912016]

model  1:     [RandomForestClassifier]
    fold  0:  [0.91296397]
    fold  1:  [0.91072880]
    ----
    MEAN:     [0.91184638] + [0.00111758]
    FULL:     [0.91184638]

model  2:     [MLPClassifier]
    fold  0:  [0.80107017]
    fold  1:  [0.77851531]
    ----
    MEAN:     [0.78979274] + [0.01127743]
    FULL:     [0.78979274]

model  3:     [LinearSVC]
    fold  0:  [0.71335681]
    fold  1:  [0.49722297]
    ----
    MEAN:     [0.60528989] + [0.10806692]
    FULL:     [0.60528989]

model  4:     [KNeighborsClassifier]
    fold  0:  [0.69361284]
    fold  1:  [0.69097128]
    ----
    MEAN:     [0.69229206] + [0.00132078]
    FULL:     [0.69229206]



In [78]:
# Meta-learner: default gradient boosting fitted on the stacked training features.
meta_learner = GradientBoostingClassifier().fit(S_Train, y_res)

# Class predictions for the stacked test features.
final_predictions = meta_learner.predict(S_Test)

In [79]:
# Search space for the gradient-boosting meta-model: 2 x 2 x 2 = 8 combinations.
param_dist = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}

# Initialize the GradientBoostingClassifier
model = GradientBoostingClassifier()

# Randomized search: 5 sampled combinations, 3-fold cross-validation each.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=5, cv=3, random_state=42, verbose=2)

# Fit the search on the stacked training features
random_search.fit(S_Train, y_res)

# Best hyperparameters found during the search
print("Best hyperparameters for Gradient Boosting meta-model:", random_search.best_params_)

# With the default refit=True the search has already re-trained the best
# configuration on all of (S_Train, y_res), so best_estimator_ is used as-is
# instead of being fitted a second time.
best_model = random_search.best_estimator_

# Class predictions for the stacked test features
y_pred = best_model.predict(S_Test)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=   2.1s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=   2.1s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=   2.3s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   2.7s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   2.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   2.5s
[CV] END ...learning_rate=0.05, max_depth=3, n_estimators=50; total time=   1.2s
[CV] END ...learning_rate=0.05, max_depth=3, n_estimators=50; total time=   1.3s
[CV] END ...learning_rate=0.05, max_depth=3, n_estimators=50; total time=   1.1s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time=   3.4s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time=   3.4s
[CV] END ...learning_rate=0.1, max_depth=5, n_est

In [80]:
# Per-class probabilities from the tuned meta-model for the stacked test
# features, wrapped in a DataFrame with one column per class.
test_probabilities = best_model.predict_proba(S_Test)
pred_Probability = pd.DataFrame(test_probabilities)

pred_Probability.head()

Unnamed: 0,0,1
0,0.947232,0.052768
1,0.947232,0.052768
2,0.947232,0.052768
3,0.947232,0.052768
4,0.947232,0.052768


In [81]:
# Confirm the probabilities are held in a pandas DataFrame.
print(pred_Probability.__class__)

<class 'pandas.core.frame.DataFrame'>


In [82]:
# Quote identifiers taken from the test frame.
test_ids = test_data['QuoteNumber']

# Second column of the probability frame (position 1).
quote_conversion_prob = pred_Probability.iloc[:, 1]

# Submission frame: one identifier and one predicted probability per row.
submission = pd.DataFrame(dict(
    QuoteNumber=test_ids,
    QuoteConversion_Flag=quote_conversion_prob,
))

In [83]:
from pathlib import Path

# Write to the current user's Downloads folder rather than a path hard-coded to
# one account, so the cell also runs on other machines. The folder is created
# if it does not exist yet. `path` stays a forward-slash string.
downloads_dir = Path.home() / "Downloads"
downloads_dir.mkdir(parents=True, exist_ok=True)
path = (downloads_dir / "submission.csv").as_posix()

# Save the submission file
submission.to_csv(path, index=False)

print(f"Submission file saved to: {path}")

Submission file saved to: C:/Users/kriti/Downloads/submission.csv
