# 4.0 Model Experimentation

[![Static Badge](https://img.shields.io/badge/Back_to_README.md-red?style=for-the-badge&logo=github&labelColor=black)](https://github.com/izzad2413/sustainable_ota)

### Table of Contents

- 4.1 Dataset Preparation
- 4.2 Model Development

In [1]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, auc
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import datetime
import pandas as pd
import os

In [2]:
# Relative path to the modeling dataset CSV file.
model_data_dir = './../data/modeling/modeling_data.csv' # dataset directory
# Load the whole CSV into a DataFrame; the feature subset is selected from it later.
df = pd.read_csv(model_data_dir)

### 4.1 Dataset Preparation

Based on the 3.0 feature selection experiment, the 22 selected features are used for model building. The `sustainable_label` column is kept alongside them, so the resulting table has 23 columns.

In [3]:
# The 22 selected feature columns, followed by the 'sustainable_label' column
# (presumably the classification target - confirm against the modeling cells).
selected_columns = [
    'quality_rating',
    'preferred_partner',
    'count_room_types',
    'overall_reviews',
    'count_reviews',
    'count_topattractions',
    'count_closestairports',
    'count_groups of friends',
    'lan_eng_proportion',
    'lan_malay_proportion',
    'average_all_closestairports_distances',
    'bathroom',
    'business_facilities',
    'common_areas',
    'languages_spoken',
    'living_area',
    'miscellaneous',
    'outdoors',
    'reception_services',
    'safety_&_security',
    'services_&_extras',
    'swimming_pool',
    'sustainable_label',
]

# Keep only the selected columns, in the order listed above.
df_22 = df[selected_columns]

### 4.2 Model Development

In [None]:
def classifier_grid_search(X_train, y_train, X_test, y_test, classifier_name, param_grid, scoring=None, cv=5):
    """Grid-search one classifier and evaluate the best model on a held-out set.

    The classifier is wrapped in a pipeline (standard scaling -> SMOTE ->
    classifier) and tuned with ``GridSearchCV`` over a repeated stratified
    k-fold split. The configuration with the best mean cross-validated
    precision is refit on the full training data and scored on the test data.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features used only for the final evaluation.
        y_test: Held-out labels. The test metrics use scikit-learn's default
            binary settings, so a binary target is assumed - TODO confirm.
        classifier_name: One of ``'SVM'``, ``'RF'``, ``'DT'``, ``'KNN'``,
            ``'MLP'``.
        param_grid: Parameter grid passed to ``GridSearchCV``. Because the
            estimator is a pipeline, keys must address pipeline steps
            (e.g. ``'classifier__<param>'``).
        scoring: List of scorer names evaluated during cross-validation.
            Defaults to ``['accuracy', 'precision', 'recall', 'f1',
            'roc_auc']``. It must contain ``'precision'``, which is the
            metric used to pick and refit the best configuration.
        cv: Number of folds per repetition of the stratified k-fold split
            (the split is repeated twice).

    Returns:
        A 4-tuple ``(best_estimator, best_params, mean_scores, test_scores)``:
        the refit pipeline, its parameters, the mean cross-validated score
        per scorer for the best configuration, and a dict of held-out
        ``accuracy``, ``precision``, ``recall``, ``f1`` and ``pr_auc``.

    Raises:
        KeyError: If ``classifier_name`` is not a supported classifier.
    """
    # Avoid a mutable default argument; build the default list per call.
    if scoring is None:
        scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    # Map names to classes so only the requested classifier is instantiated.
    classifiers = {
        'SVM': SVC,
        'RF': RandomForestClassifier,
        'DT': DecisionTreeClassifier,
        'KNN': KNeighborsClassifier,
        'MLP': MLPClassifier,
    }
    if classifier_name not in classifiers:
        raise KeyError(
            f"Unknown classifier_name {classifier_name!r}; "
            f"expected one of {sorted(classifiers)}"
        )

    # To prevent data leakage: scaling and oversampling live inside the
    # pipeline, so they are fit on the training part of each CV split only.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifiers[classifier_name]()),
    ])

    cv_strategy = RepeatedStratifiedKFold(n_splits=cv, n_repeats=2, random_state=42)

    # Multi-metric search; 'precision' decides which configuration is refit.
    grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, refit='precision', cv=cv_strategy)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_

    # Mean cross-validated score of the selected configuration, per scorer.
    best_index = grid_search.best_index_
    mean_scores = {
        scorer: grid_search.cv_results_[f'mean_test_{scorer}'][best_index]
        for scorer in scoring
    }

    y_pred = grid_search.predict(X_test)

    # A precision-recall curve needs continuous scores: with hard 0/1
    # predictions it collapses to a single threshold and the area is
    # meaningless. Prefer probabilities, then decision values, and fall back
    # to the labels only if the estimator exposes neither.
    if hasattr(grid_search, 'predict_proba'):
        # Column 1 is the second entry of classes_, i.e. the positive class
        # for a 0/1-encoded target - TODO confirm the label encoding.
        y_score = grid_search.predict_proba(X_test)[:, 1]
    elif hasattr(grid_search, 'decision_function'):
        y_score = grid_search.decision_function(X_test)
    else:
        y_score = y_pred

    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_score)
    pr_auc = auc(recall_curve, precision_curve)

    test_scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'pr_auc': pr_auc,
    }

    return grid_search.best_estimator_, best_params, mean_scores, test_scores

def save_results_to_csv(results, save_dir, file_name):
    """Write ``results`` as a CSV file named ``file_name`` inside ``save_dir``.

    Args:
        results: Data accepted by the ``pd.DataFrame`` constructor
            (e.g. a list of dicts or a dict of columns).
        save_dir: Directory to write into. It is created, including missing
            parents, if it does not exist. An empty string writes relative
            to the current working directory.
        file_name: Name of the CSV file to create or overwrite.
    """
    # Create the target directory first so the write does not fail on a
    # fresh checkout; skip the empty string, which os.makedirs rejects.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Use a distinct local name so the notebook-level `df` is not shadowed.
    results_df = pd.DataFrame(results)
    file_path = os.path.join(save_dir, file_name)
    # The row index carries no information here, so it is left out.
    results_df.to_csv(file_path, index=False)

# Current local date formatted as DD-MM-YYYY. It is not used in the visible
# cells; presumably it tags result file names later - verify.
date_today = datetime.datetime.now().strftime("%d-%m-%Y")
# Directory to save the results
model_result_dir = './../data/result/'
# Create the results directory if it does not already exist.
os.makedirs(model_result_dir, exist_ok=True)