In [25]:
# Voting
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons

# Toy two-class "moons" dataset; the fixed seed makes the sample reproducible.
X, y = make_moons(n_samples = 300, noise = 0.25, random_state = 42)

# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Base learners combined by the voting ensemble.
lr = LogisticRegression()
knn = KNeighborsClassifier()
estimator = [('LR', lr), ('KNN', knn)]

# voting = 'soft' combines the base learners' predicted class probabilities
# rather than their hard votes.
clf = VotingClassifier(estimators = estimator, voting = 'soft')

clf.fit(X_train, y_train)

pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred); keep that order so the call stays
# correct if it is ever swapped for an asymmetric metric such as precision.
print(accuracy_score(y_test, pred))

0.9444444444444444


In [26]:
# Load the raw hotel booking satisfaction table from the data directory.
df = pd.read_csv(
    filepath_or_buffer = '../data/Europe Hotel Booking Satisfaction Score.csv'
)

# Wrap the preprocessing steps in a reusable function
from sklearn.preprocessing import LabelEncoder

def preprocess(df, label_column, drop_columns = None, onehot_columns = None):
    """Split a raw table into an encoded feature frame and an encoded label.

    The input frame is left untouched: columns are dropped and the label is
    encoded on a copy, so the function can be called repeatedly on the same
    DataFrame (the previous in-place version removed `drop_columns` from the
    caller's frame, which made a second call fail on the missing columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table containing `label_column`.
    label_column : str
        Name of the target column; its values are integer-encoded with
        LabelEncoder.
    drop_columns : list of str, optional
        Columns removed before encoding. Skipped when None or empty.
    onehot_columns : list of str, optional
        Passed to pandas.get_dummies as `columns`.

    Returns
    -------
    X : pandas.DataFrame
        All columns except `label_column`, after one-hot encoding.
    y : pandas.Series
        The integer-encoded `label_column`.
    """
    if drop_columns:
        # Non in-place drop returns a new frame, leaving the caller's intact.
        df = df.drop(columns = drop_columns)
    else:
        # Still copy, because the label column is overwritten below.
        df = df.copy()

    df[label_column] = LabelEncoder().fit_transform(df[label_column])

    df = pd.get_dummies(df, columns = onehot_columns)

    X = df.drop(columns = label_column)
    y = df[label_column]

    return X, y

# Drop the row identifier, encode the target, and one-hot encode the
# categorical columns.
X, y = preprocess(
    df,
    label_column = 'satisfaction',
    drop_columns = ['id'],
    onehot_columns = ['Gender', 'purpose_of_travel', 'Type of Travel', 'Type Of Booking'],
)

In [27]:
# Standardization or normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Fit the scaler on the training rows only, so the min/max of the held-out
# rows never influence the transform (avoids test-set leakage). The split
# arguments intentionally mirror the train_test_split call in the modelling
# cell below; with the same y and random_state the same rows are selected.
X_fit, _ = train_test_split(X, test_size = .3, stratify = y, random_state = 121)

scaler = MinMaxScaler()
scaler.fit(X_fit)

# Transform every row with the train-fitted scaler; keep the original index so
# X stays aligned with y.
scaled_X = scaler.transform(X)
X = pd.DataFrame(scaled_X, columns = X.columns, index = X.index)

X

Unnamed: 0,Age,Hotel wifi service,Departure/Arrival convenience,Ease of Online booking,Hotel location,Food and drink,Stay comfort,Common Room entertainment,Checkin/Checkout service,Other service,...,purpose_of_travel_academic,purpose_of_travel_aviation,purpose_of_travel_business,purpose_of_travel_personal,purpose_of_travel_tourism,Type of Travel_Group Travel,Type of Travel_Personal Travel,Type Of Booking_Group bookings,Type Of Booking_Individual/Couple,Type Of Booking_Not defined
0,0.076923,0.6,0.8,0.6,0.2,1.0,1.0,1.0,0.8,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.230769,0.6,0.4,0.6,0.6,0.2,0.2,0.2,0.2,0.8,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,0.243590,0.4,0.4,0.4,0.4,1.0,1.0,1.0,0.8,0.8,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,0.230769,0.4,1.0,1.0,1.0,0.4,0.4,0.4,0.2,0.8,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,0.692308,0.6,0.6,0.6,0.6,0.8,1.0,0.6,0.6,0.6,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,0.205128,0.4,0.2,0.4,0.6,0.4,0.4,0.4,0.4,0.6,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
103900,0.538462,0.8,0.8,0.8,0.8,0.4,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
103901,0.294872,0.2,0.2,0.2,0.6,0.8,1.0,0.8,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
103902,0.192308,0.2,0.2,0.2,1.0,0.2,0.2,0.2,1.0,0.8,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Stratified 70/30 hold-out split; stratify = y keeps the class ratio equal in
# both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, stratify = y, random_state = 121)

# Seed the forest too: the split was already seeded, but an unseeded forest
# still gives different scores on every run.
clf = RandomForestClassifier(random_state = 121)

clf.fit(X_train, y_train)

In [43]:
from sklearn.metrics import accuracy_score

# Hard class predictions for the held-out rows.
pred = clf.predict(X_test)

# Accuracy on the test split, printed to four decimals.
print(f'score: {accuracy_score(y_test, pred):.4f}')

score: 0.9491


In [44]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true = y_test, y_pred = pred)

array([[17001,   663],
       [  924, 12584]])

In [45]:
# Precision
from sklearn.metrics import precision_score

print(f'{precision_score(y_test, pred):.4f}')

0.9500


In [46]:
# Recall
from sklearn.metrics import recall_score

print(f'{recall_score(y_test, pred):.4f}')

0.9316


In [47]:
# AUC
from sklearn.metrics import roc_auc_score

# Store the class probabilities under their own name. `pred` keeps holding the
# hard labels, so the accuracy / precision / recall cells can still be re-run
# after this cell without receiving a 2-D probability array.
pred_proba = clf.predict_proba(X_test)

# The last column holds the probability of the last class in clf.classes_,
# which is the score roc_auc_score needs for a binary target.
print('{:.4f}'.format(roc_auc_score(y_test, pred_proba[:, -1])))

pd.DataFrame(pred_proba)

0.9906


Unnamed: 0,0,1
0,0.999389,0.000611
1,0.069510,0.930490
2,0.923731,0.076269
3,0.999620,0.000380
4,0.015323,0.984677
...,...,...
31167,0.999991,0.000009
31168,0.999227,0.000773
31169,0.999421,0.000579
31170,0.000002,0.999998


In [48]:
# XGBoost: gradient-boosted trees trained on the same split as the forest.

from xgboost import XGBClassifier

clf = XGBClassifier()

clf.fit(X = X_train, y = y_train)

In [49]:
from sklearn.metrics import accuracy_score

# Hard class predictions from the current classifier for the held-out rows.
pred = clf.predict(X_test)

# Accuracy on the test split, printed to four decimals.
print(f'score: {accuracy_score(y_test, pred):.4f}')

score: 0.9491


In [50]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true = y_test, y_pred = pred)

array([[17001,   663],
       [  924, 12584]])

In [51]:
# Precision
# NOTE(review): the saved output of this cell lacks the label prefix printed
# below, so it appears to predate this code - re-run the cell to refresh it.
from sklearn.metrics import precision_score

print(f'정밀도: {precision_score(y_test, pred):.4f}')

0.9500


In [52]:
# Recall
from sklearn.metrics import recall_score

print(f'재현율: {recall_score(y_test, pred):.4f}')

0.9316


In [53]:
# AUC
from sklearn.metrics import roc_auc_score

# Store the class probabilities under their own name. `pred` keeps holding the
# hard labels, so the accuracy / precision / recall cells can still be re-run
# after this cell without receiving a 2-D probability array.
pred_proba = clf.predict_proba(X_test)

# The last column holds the probability of the last class in clf.classes_,
# which is the score roc_auc_score needs for a binary target.
print('AUC: {:.4f}'.format(roc_auc_score(y_test, pred_proba[:, -1])))

pd.DataFrame(pred_proba)

0.9906


Unnamed: 0,0,1
0,0.999389,0.000611
1,0.069510,0.930490
2,0.923731,0.076269
3,0.999620,0.000380
4,0.015323,0.984677
...,...,...
31167,0.999991,0.000009
31168,0.999227,0.000773
31169,0.999421,0.000579
31170,0.000002,0.999998
