In [1]:
import sys
# Put the parent directory on the import path so the `src.*` imports below resolve
# (assumes the notebook is run from a direct subdirectory of the project root — confirm).
sys.path.append("..")

import pandas as pd

# Candidate classifiers for the comparison
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

# Project-local helpers (made importable by the sys.path tweak above)
from src.features.preprocessing import preprocess_data, isolationforest_detect
from src.models.train_model import evaluate_model

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas / scikit-learn warnings
# raised by the cells below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv("../data/city_dataframe.csv")

# Flag outliers over three columns. isolationforest_detect is called only for its
# side effect: the next statement reads the 'combined_outliers' column it is expected
# to add to df (0.05 is presumably the contamination rate — confirm in src.features).
isolationforest_detect(df, ['adr', 'days_in_waiting_list', 'total_booked_nights'], 0.05, 'combined_outliers')

# Keep the rows not flagged as outliers (-1) and drop the helper column in the same
# chain, so a new frame is produced instead of editing a filtered slice in place
# (the in-place drop on a slice is a chained-assignment hazard).
df = df[df['combined_outliers'] != -1].drop(columns='combined_outliers')

# Target is the first column; the remaining columns are candidate features
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Selecting features chosen in Feature Engineering.ipynb
selected_features = [
    'lead_time',
    'country',
    'adr',
    'deposit_type',
    'total_of_special_requests',
    'market_segment',
    'customer_type',
    'total_booked_nights',
    'distribution_channel',
    'previous_cancellations',
]

X = X[selected_features]

# Running general preprocessing pipeline
X = preprocess_data(X)

# (model name, score) tuples appended by the evaluation cells below
scores = []

 1    75364
-1     3966
Name: combined_outliers, dtype: int64


#### Algorithms to be compared:
1. Logistic Regression
2. k-Nearest Neighbor
3. Decision Tree
4. Random Forest
5. Naive Bayes
6. LinearSVC
7. AdaBoostClassifier
8. XGBoost

The algorithms with the best accuracy scores will then be tuned.

In [None]:
# Logistic regression with default hyper-parameters, evaluated with kfolds=10;
# the returned value is recorded next to the model's label.
scores.append(
    (
        'Logistic Regression',
        evaluate_model(X, y, LogisticRegression(), kfolds=10),
    )
)

In [None]:
# k-nearest neighbours with default hyper-parameters, evaluated with kfolds=10.
# The entry is labelled after the model actually evaluated: it was previously stored
# as 'Logistic Regression', which gave the results table two rows with that name
# and no row for this classifier.
scores.append(('k-Nearest Neighbor', evaluate_model(X, y, KNeighborsClassifier(), kfolds=10)))

In [None]:
# Decision tree limited to max_depth=12, evaluated with kfolds=10;
# the returned value is recorded next to the model's label.
scores.append(
    (
        'Decision Tree',
        evaluate_model(X, y, DecisionTreeClassifier(max_depth=12), kfolds=10),
    )
)

In [None]:
# Random forest with default hyper-parameters, evaluated with kfolds=10.
# The seed is pinned because the forest's bootstrap sampling and per-split feature
# sampling are random: unseeded, the score (and possibly the model's rank in the
# comparison) changes on every run. Any randomness inside evaluate_model itself is
# outside this cell's control.
scores.append(('Random Forest', evaluate_model(X, y, RandomForestClassifier(random_state=42), kfolds=10)))

In [None]:
# Gaussian naive Bayes with default settings, evaluated with kfolds=10;
# the returned value is recorded next to the model's label.
scores.append(
    (
        'Gaussian Naive Bayes',
        evaluate_model(X, y, GaussianNB(), kfolds=10),
    )
)

In [None]:
# Linear SVM with default hyper-parameters, evaluated with kfolds=10.
# roc=False is passed only for this model — presumably because LinearSVC exposes
# no predict_proba for a ROC computation inside evaluate_model (confirm).
scores.append(
    (
        'LinearSVC',
        evaluate_model(X, y, LinearSVC(), kfolds=10, roc=False),
    )
)

In [None]:
# AdaBoost with default hyper-parameters, evaluated with kfolds=10;
# the returned value is recorded next to the model's label.
scores.append(
    (
        'AdaBoostClassifier',
        evaluate_model(X, y, AdaBoostClassifier(), kfolds=10),
    )
)

In [None]:
# XGBoost classifier with default hyper-parameters, evaluated with kfolds=10;
# the returned value is recorded next to the model's label.
scores.append(
    (
        'XGBoost',
        evaluate_model(X, y, XGBClassifier(), kfolds=10),
    )
)

In [None]:
# Tabulate the collected (model, score) pairs, highest score first, indexed by model name.
scores_df = (
    pd.DataFrame(scores, columns=['Model', 'Score'])
    .sort_values('Score', ascending=False)
    .set_index('Model')
)
scores_df