# Kalpesh Patil ___ Machine Learning ___ Assignemnt 2

### Import necessary libraries

In [73]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from skopt import BayesSearchCV
from scipy.stats import randint, uniform

### Load the datasets

In [76]:
train_data = pd.read_csv('Insurance Fraud - TRAIN-3000.csv')
test_data = pd.read_csv('Insurance Fraud -TEST-12900.csv')

In [80]:
print(f"Test data shape: {test_data.shape}")
print(f"Number of rows in test data: {test_data.shape[0]}")
print(f"Number of columns in test data: {test_data.shape[1]}")

Test data shape: (12918, 32)
Number of rows in test data: 12918
Number of columns in test data: 32


### Prepare the data

In [46]:
X_train = train_data.drop(columns=['FRAUDFOUND'])
y_train = train_data['FRAUDFOUND']
X_test = test_data.drop(columns=['FRAUDFOUND'])
y_test = test_data['FRAUDFOUND']

### Define the preprocessor

In [49]:
numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Preprocessing for numerical data: impute missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Scale numerical features
])

# Preprocessing for categorical data: impute missing values and encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine the numerical and categorical transformers into one
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### Define the model pipeline

In [52]:
pipeline_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


### Hyperparameter tuning

In [55]:
# Define hyperparameter grids
dt_param_grid = {
    'classifier__max_depth': [3, 5, 10, None],  # Tuning max depth
    'classifier__min_samples_split': [2, 5, 10],  # Tuning min_samples_split
    'classifier__min_samples_leaf': [1, 2, 5]  # Tuning min_samples_leaf
}

rf_param_grid = {
    'classifier__n_estimators': [50, 100, 200],  # Number of trees in the forest
    'classifier__max_depth': [10, 20, None],  # Maximum depth of trees
    'classifier__min_samples_split': [2, 5, 10],  # Split size for nodes
}

### Tuning using GridSearchCV, RandomizedSearchCV, and BayesSearchCV

In [58]:
# Grid Search for Decision Tree
grid_search_dt = GridSearchCV(pipeline_dt, dt_param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_dt.fit(X_train, y_train)
print(f"Grid Search - Best parameters (Decision Tree): {grid_search_dt.best_params_}")

# Random Search for Decision Tree
dt_random_search = RandomizedSearchCV(pipeline_dt, dt_param_grid, n_iter=10, cv=5, scoring='f1_weighted', n_jobs=-1)
dt_random_search.fit(X_train, y_train)
print(f"Random Search - Best parameters (Decision Tree): {dt_random_search.best_params_}")

# Bayesian Search for Decision Tree
dt_bayes_search = BayesSearchCV(pipeline_dt, dt_param_grid, n_iter=10, cv=5, scoring='f1_weighted', n_jobs=-1)
dt_bayes_search.fit(X_train, y_train)
print(f"Bayes Search - Best parameters (Decision Tree): {dt_bayes_search.best_params_}")

# Grid Search for Random Forest
grid_search_rf = GridSearchCV(pipeline_rf, rf_param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
print(f"Grid Search - Best parameters (Random Forest): {grid_search_rf.best_params_}")

# Random Search for Random Forest
rf_random_search = RandomizedSearchCV(pipeline_rf, rf_param_grid, n_iter=10, cv=5, scoring='f1_weighted', n_jobs=-1)
rf_random_search.fit(X_train, y_train)
print(f"Random Search - Best parameters (Random Forest): {rf_random_search.best_params_}")

# Bayesian Search for Random Forest
rf_bayes_search = BayesSearchCV(pipeline_rf, rf_param_grid, n_iter=10, cv=5, scoring='f1_weighted', n_jobs=-1)
rf_bayes_search.fit(X_train, y_train)
print(f"Bayes Search - Best parameters (Random Forest): {rf_bayes_search.best_params_}")


Grid Search - Best parameters (Decision Tree): {'classifier__max_depth': 3, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 2}
Random Search - Best parameters (Decision Tree): {'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 5, 'classifier__max_depth': 3}
Bayes Search - Best parameters (Decision Tree): OrderedDict({'classifier__max_depth': 3, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 5})
Grid Search - Best parameters (Random Forest): {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Random Search - Best parameters (Random Forest): {'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__max_depth': None}
Bayes Search - Best parameters (Random Forest): OrderedDict({'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100})


### Build new models based on the best parameters

In [60]:
# Decision Tree models from best parameters
best_dt_model_grid = grid_search_dt.best_estimator_
best_dt_model_random = dt_random_search.best_estimator_
best_dt_model_bayes = dt_bayes_search.best_estimator_

# Random Forest models from best parameters
best_rf_model_grid = grid_search_rf.best_estimator_
best_rf_model_random = rf_random_search.best_estimator_
best_rf_model_bayes = rf_bayes_search.best_estimator_

### Evaluate models on the test set

In [62]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1

# Evaluate all models
# Decision Tree evaluations
dt_grid_scores = evaluate_model(best_dt_model_grid, X_test, y_test)
dt_random_scores = evaluate_model(best_dt_model_random, X_test, y_test)
dt_bayes_scores = evaluate_model(best_dt_model_bayes, X_test, y_test)

# Random Forest evaluations
rf_grid_scores = evaluate_model(best_rf_model_grid, X_test, y_test)
rf_random_scores = evaluate_model(best_rf_model_random, X_test, y_test)
rf_bayes_scores = evaluate_model(best_rf_model_bayes, X_test, y_test)


# Print scores for comparison in a more readable format
def print_comparison(model_name, scores):
    accuracy, precision, recall, f1 = scores
    print(f"{model_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Print Decision Tree model scores
print_comparison("Decision Tree - Grid Search", dt_grid_scores)
print_comparison("Decision Tree - Random Search", dt_random_scores)
print_comparison("Decision Tree - Bayes Search", dt_bayes_scores)

# Print Random Forest model scores
print_comparison("Random Forest - Grid Search", rf_grid_scores)
print_comparison("Random Forest - Random Search", rf_random_scores)
print_comparison("Random Forest - Bayes Search", rf_bayes_scores)

Decision Tree - Grid Search - Accuracy: 0.8511, Precision: 0.9392, Recall: 0.8511, F1: 0.8893
Decision Tree - Random Search - Accuracy: 0.8511, Precision: 0.9392, Recall: 0.8511, F1: 0.8893
Decision Tree - Bayes Search - Accuracy: 0.8511, Precision: 0.9392, Recall: 0.8511, F1: 0.8893
Random Forest - Grid Search - Accuracy: 0.9578, Precision: 0.9683, Recall: 0.9578, F1: 0.9620
Random Forest - Random Search - Accuracy: 0.9508, Precision: 0.9552, Recall: 0.9508, F1: 0.9528
Random Forest - Bayes Search - Accuracy: 0.9578, Precision: 0.9683, Recall: 0.9578, F1: 0.9620


In [63]:
# Print scores for comparison in a more readable format
def print_comparison(model_name, scores):
    accuracy, precision, recall, f1 = scores
    print(f"{model_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Print Decision Tree model scores
print_comparison("Decision Tree - Grid Search", dt_grid_scores)
print_comparison("Decision Tree - Random Search", dt_random_scores)
print_comparison("Decision Tree - Bayes Search", dt_bayes_scores)

# Print Random Forest model scores
print_comparison("Random Forest - Grid Search", rf_grid_scores)
print_comparison("Random Forest - Random Search", rf_random_scores)
print_comparison("Random Forest - Bayes Search", rf_bayes_scores)


Decision Tree - Grid Search - Accuracy: 0.8511, Precision: 0.9392, Recall: 0.8511, F1: 0.8893
Decision Tree - Random Search - Accuracy: 0.8511, Precision: 0.9392, Recall: 0.8511, F1: 0.8893
Decision Tree - Bayes Search - Accuracy: 0.8511, Precision: 0.9392, Recall: 0.8511, F1: 0.8893
Random Forest - Grid Search - Accuracy: 0.9578, Precision: 0.9683, Recall: 0.9578, F1: 0.9620
Random Forest - Random Search - Accuracy: 0.9508, Precision: 0.9552, Recall: 0.9508, F1: 0.9528
Random Forest - Bayes Search - Accuracy: 0.9578, Precision: 0.9683, Recall: 0.9578, F1: 0.9620


### Further Analysis and Conclusion

In [65]:
best_dt_accuracy = max(dt_grid_scores[0], dt_random_scores[0], dt_bayes_scores[0])
best_rf_accuracy = max(rf_grid_scores[0], rf_random_scores[0], rf_bayes_scores[0])
print(f"Best Decision Tree Model by Accuracy: {best_dt_accuracy}")
print(f"Best Random Forest Model by Accuracy: {best_rf_accuracy}")

Best Decision Tree Model by Accuracy: 0.8511379470506271
Best Random Forest Model by Accuracy: 0.9578108066264127
