# 4.0 Model Experimentation

[![Static Badge](https://img.shields.io/badge/Back_to_README.md-red?style=for-the-badge&logo=github&labelColor=black)](https://github.com/izzad2413/sustainable_ota)

### Table of Contents

- 4.1 Dataset Preparation
- 4.2 Model Development
- 4.3 Model Experimentation

In [1]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, auc
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
import datetime
import pandas as pd
import os

In [2]:
# Path of the modelling dataset (CSV), relative to the notebook's working directory.
model_data_dir = './../data/modeling/modeling_data.csv'
# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=model_data_dir)

### 4.1 Dataset Preparation

Based on the 3.0 feature selection experiment, it was decided that the selected 22 features will be used for model building.

In [3]:
# Keep only the 22 selected predictors plus the target column (listed last).
df = df.loc[:, [
    'quality_rating',
    'preferred_partner',
    'count_room_types',
    'overall_reviews',
    'count_reviews',
    'count_topattractions',
    'count_closestairports',
    'count_groups of friends',
    'lan_eng_proportion',
    'lan_malay_proportion',
    'average_all_closestairports_distances',
    'bathroom',
    'business_facilities',
    'common_areas',
    'languages_spoken',
    'living_area',
    'miscellaneous',
    'outdoors',
    'reception_services',
    'safety_&_security',
    'services_&_extras',
    'swimming_pool',
    'sustainable_label',
]]

### 4.2 Model Development

The classifiers used in the experiment, together with their respective hyperparameters and candidate values.

In [4]:
# Ordered list of the 22 predictor columns; `generate_feature_sets` takes
# prefixes of this list, so the order determines which columns each subset holds.
# NOTE(review): 'count_groups of friends' was missing from this list although it
# is among the 22 columns selected in section 4.1; it is restored here in the
# same position it has in that selection -- confirm the omission was not deliberate.
all_features = [
    'quality_rating', 'preferred_partner', 'count_room_types', 'overall_reviews',
    'count_reviews', 'count_topattractions', 'count_closestairports',
    'count_groups of friends', 'lan_eng_proportion', 'lan_malay_proportion',
    'average_all_closestairports_distances', 'bathroom', 'business_facilities',
    'common_areas', 'languages_spoken', 'living_area', 'miscellaneous',
    'outdoors', 'reception_services', 'safety_&_security', 'services_&_extras',
    'swimming_pool',
]

# Function to generate feature sets
def generate_feature_sets(all_features, min_features=11, max_features=22, data=None):
    """Build nested feature subsets from the leading columns of ``all_features``.

    For every size ``n`` from ``min_features`` up to ``max_features`` the
    subset named ``f_<n>`` contains the first ``n`` columns of ``all_features``.

    Parameters
    ----------
    all_features : list of str
        Column names, in the order in which they should be added.
    min_features : int, default 11
        Size of the smallest subset.
    max_features : int, default 22
        Size of the largest subset. Capped at ``len(all_features)``: slicing
        past the end of the list would otherwise silently yield several
        identically-filled subsets under different names.
    data : pandas.DataFrame, optional
        Frame to take the columns from. Defaults to the module-level ``df``.

    Returns
    -------
    dict
        Maps ``'f_<n>'`` to the DataFrame holding the first ``n`` columns.
    """
    source = df if data is None else data
    # Never ask for more columns than exist, to avoid duplicate subsets.
    largest = min(max_features, len(all_features))
    return {
        f'f_{num_features}': source[all_features[:num_features]]
        for num_features in range(min_features, largest + 1)
    }

# Build the named feature subsets (keys 'f_<n>') consumed by the experiment loop
feature_sets = generate_feature_sets(all_features=all_features)

# Define classifiers and their parameter grids.
# Each entry maps a short classifier name to:
#   'model'      - an unfitted estimator instance
#   'param_grid' - hyperparameter candidates; keys carry the 'classifier__'
#                  prefix because the estimator is the pipeline step named
#                  'classifier' in `classifier_grid_search`.
# The tree-based models are seeded so repeated runs give the same results,
# consistent with the fixed seed (42) used for SMOTE, the CV splitter, the
# train/test split and the MLP.
classifiers_param_grid = {
    'SVM': {
        'model': SVC(),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            # gamma is ignored by the linear kernel
            'classifier__gamma': [0.1, 0.01, 0.001],
            'classifier__kernel': ['linear', 'rbf', 'poly']
        }
    },
    'RF': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__bootstrap': [True, False]
        }
    },
    'DT': {
        'model': DecisionTreeClassifier(random_state=42),
        'param_grid': {
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__criterion': ['gini', 'entropy']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'param_grid': {
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'classifier__p': [1, 2]
        }
    },
    'MLP': {
        'model': MLPClassifier(),
        'param_grid': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['relu', 'tanh', 'logistic'],
            'classifier__solver': ['adam', 'sgd'],
            'classifier__alpha': [0.0001, 0.001, 0.01],
            'classifier__learning_rate': ['constant', 'adaptive'],
            'classifier__max_iter': [200, 500, 1000],
            # Single-value entries below are fixed settings, not searched
            'classifier__early_stopping': [True],
            'classifier__validation_fraction': [0.1],
            'classifier__n_iter_no_change': [5],
            'classifier__random_state': [42]
        }
    }
}

def classifier_grid_search(X_train, y_train, X_test, y_test, model, param_grid, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'), cv=5, refit='precision'):
    """Tune ``model`` with a cross-validated grid search and score it on held-out data.

    The estimator is wrapped in a pipeline (standardisation -> SMOTE ->
    classifier) so that scaling and oversampling are re-fitted inside each
    training fold, which prevents data leakage into the validation folds.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the grid search.
    X_test, y_test
        Held-out features and labels used for the final evaluation.
    model
        Unfitted classifier; becomes the pipeline step named ``'classifier'``.
    param_grid
        Hyperparameter grid; keys must use the ``'classifier__'`` prefix.
    scoring : str or sequence of str
        Scorer name(s) evaluated during cross-validation.
    cv : int, default 5
        Number of folds; the stratified k-fold is repeated twice.
    refit : str, default 'precision'
        Scorer (must be one of ``scoring``) used to pick the best candidate.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, mean_scores, test_scores)`` where
        ``mean_scores`` holds the mean cross-validation score of the best
        candidate for each scorer, and ``test_scores`` holds accuracy,
        precision, recall, F1 and PR-AUC on the held-out data.
    """
    # Accept a single scorer name as well as a sequence; a bare string would
    # otherwise be iterated character by character below.
    scoring = [scoring] if isinstance(scoring, str) else list(scoring)

    # to prevent data leakage
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    cv_strategy = RepeatedStratifiedKFold(n_splits=cv, n_repeats=2, random_state=42)

    grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, refit=refit, cv=cv_strategy)
    grid_search.fit(X_train, y_train)

    best_estimator = grid_search.best_estimator_
    best_params = grid_search.best_params_

    best_index = grid_search.best_index_
    mean_scores = {scorer: grid_search.cv_results_[f'mean_test_{scorer}'][best_index] for scorer in scoring}

    y_pred = best_estimator.predict(X_test)

    # zero_division=0 yields the same value as the default (0.0) but without
    # emitting an UndefinedMetricWarning when a class is never predicted.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # A precision-recall curve needs continuous scores; hard 0/1 predictions
    # collapse it to a single operating point and give a meaningless area.
    # NOTE(review): column 1 of predict_proba is taken as the positive class,
    # which assumes binary labels whose larger value is the positive one.
    if hasattr(best_estimator, 'predict_proba'):
        y_score = best_estimator.predict_proba(X_test)[:, 1]
    elif hasattr(best_estimator, 'decision_function'):
        y_score = best_estimator.decision_function(X_test)
    else:
        # Last resort for estimators exposing neither scoring method.
        y_score = y_pred

    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_score)
    pr_auc = auc(recall_curve, precision_curve)

    test_scores = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'pr_auc': pr_auc
    }

    return best_estimator, best_params, mean_scores, test_scores

### 4.3 Model Experimentation

In [6]:
# Experiment driver: grid-search every classifier on every feature subset.
y = df['sustainable_label']  # target column

# One dict per (feature set, classifier) pair
results = []

for feature_set_name, X in feature_sets.items():
    # Stratified 80/20 hold-out split with a fixed seed for every subset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    for classifier_name, classifier_info in classifiers_param_grid.items():
        model = classifier_info['model']
        param_grid = classifier_info['param_grid']

        best_estimator, best_params, mean_scores, test_scores = classifier_grid_search(
            X_train, y_train, X_test, y_test, model, param_grid
        )

        # Identification columns first, then validation scores, then test scores
        result = {
            'feature_set': feature_set_name,
            'classifier': classifier_name,
            'best_params': best_params,
        }
        result.update(
            (f'val_{score}', score_value) for score, score_value in mean_scores.items()
        )
        result.update(
            (f'test_{score}', score_value) for score, score_value in test_scores.items()
        )
        results.append(result)

# Tabulate the collected rows for easier analysis
results_df = pd.DataFrame(results)
print(results_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize