## Taking only in imporatant features as selected from the previous file 
## Main purpose is to implement pipleline

In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import confusion_matrix,classification_report
from imblearn.metrics import sensitivity_specificity_support
from imblearn.metrics import sensitivity_score ,specificity_score
from sklearn.metrics import recall_score , accuracy_score ,precision_score,f1_score,roc_auc_score

In [160]:
data=pd.read_csv('hotel_bookings.csv',parse_dates=['reservation_status_date'])

In [161]:
## Country , Company ,Ageny ,children has null values
## Company has more then 90% null values so Company can be removed easily let us remove and check

data.drop('company',axis=1,inplace=True)

## after removing Company still children country and Ageny is having null values we will check them later

In [162]:
data['is_canceled'].value_counts()

0    75166
1    44224
Name: is_canceled, dtype: int64

In [163]:
### Data is imbalanced

In [164]:
##  0 is max count so let us fill null value of children with 0
data.children.fillna(0,inplace=True)

### Let us combine Adults, Children and Babies to one family


In [165]:
def family(df):
    if((df.adults>0) & (df.children>0)):
        fam= 1
    elif((df.adults>0) & (df.babies>0)):
        fam= 1
    else:
        fam= 0
    return fam

        

In [166]:
data['family']=data.apply(family,axis=1)

In [167]:
data['family'].value_counts()

0    110281
1      9109
Name: family, dtype: int64

In [168]:
data['total_customers']=data.adults+data.children+data.babies

In [169]:
data.drop(['adults','children','babies'], axis=1, inplace=True)

In [170]:

data.country.fillna('PRT', inplace=True)

In [171]:
## we can convert previous_cancellations as yes or no
data['previous_cancellations']=np.where(data['previous_cancellations']>0,'Yes','No')

In [172]:
data['previous_cancellations'].value_counts()

No     112906
Yes      6484
Name: previous_cancellations, dtype: int64

In [173]:
data.drop('reservation_status', axis=1 , inplace=True)

In [174]:
data.drop('arrival_date_week_number', axis=1 , inplace=True)

In [175]:
data.drop('reservation_status_date',axis=1, inplace=True)

In [176]:
data.drop('agent', axis=1 , inplace=True)

### Now our data set is cleaned with 0 null values

### arrival_date_day_of_month  and  stays_in_weekend_nights  is very less correlated to is_cancelled so we can remove and try once

In [177]:
data.drop(['arrival_date_day_of_month', 'stays_in_weekend_nights'],axis=1, inplace=True)

### Setting the list of numeric and categorial values saperatly

In [178]:
df=data.copy()

In [179]:
df['arrival_date_month'].unique()

array(['July', 'August', 'September', 'October', 'November', 'December',
       'January', 'February', 'March', 'April', 'May', 'June'],
      dtype=object)

In [180]:
months={'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,
        'August':8,'September':9,'October':10,'November':11,'December':12}
df['arrival_date_month'].replace(months,inplace=True)

In [181]:
df['arrival_date_month'].dtypes

dtype('int64')

In [182]:
category_col=df.select_dtypes(include=['object']).columns
numeric_col=df.select_dtypes(include=['number']).columns

### For features with greater than 5 option we will use label encoding and for features less than 5 we will use on hot encoding and for arrival_date_month we will do manually as using label encoder it migth set diff labels

In [183]:
cat_ser=pd.Series(index=category_col)
for i in category_col:
    cat_ser[i]=df[i].nunique()

  """Entry point for launching an IPython kernel.


In [184]:
cat_ser

hotel                       2.0
meal                        5.0
country                   177.0
market_segment              8.0
distribution_channel        5.0
previous_cancellations      2.0
reserved_room_type         10.0
assigned_room_type         12.0
deposit_type                3.0
customer_type               4.0
dtype: float64

In [185]:
lab_var=[]
for i in df:
    if (df[i].dtype=='O' and (df[i].nunique()==2 or df[i].nunique()>5 )):
        lab_var.append(i)
        

In [186]:
lab_var

['hotel',
 'country',
 'market_segment',
 'previous_cancellations',
 'reserved_room_type',
 'assigned_room_type']

In [187]:
lab= LabelEncoder()
col_count=0
for i in df:
    if (df[i].dtype=='O' and (df[i].nunique()==2 or df[i].nunique()>5 )):
        if df[i].isnull().any()==False:
            lab.fit(df[i])
            df[i]=lab.transform(df[i])
            print(i)
            col_count+=1
            
print("total_columns updated",col_count)

hotel
country
market_segment
previous_cancellations
reserved_room_type
assigned_room_type
total_columns updated 6


In [188]:
df.dtypes.value_counts()

int64      12
int32       6
object      4
float64     2
dtype: int64

### Saperating the data set in test and train

In [189]:
X=df.drop('is_canceled',axis=1)
y=df['is_canceled']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33, random_state=42)
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((79991, 23), (39399, 23), (79991,), (39399,))

In [190]:
numeric_features = X_train.select_dtypes(include=['int64','int32', 'float64']).drop(lab_var,axis=1).columns
categorical_features = X_train.select_dtypes(include=['object']).columns
#lab_features=df.select_dtypes(include=['object']).drop(categorical_features,axis=1).columns

In [191]:
y_train.value_counts()

0    50412
1    29579
Name: is_canceled, dtype: int64

In [192]:
numeric_transform=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                                 ('scaler',StandardScaler())])

#lab_transformer=Pipeline(steps=[('label',LabelEncoder())])
onhot_transformer=Pipeline(steps=[('onhot',OneHotEncoder(handle_unknown='ignore'))])

preprocessor=ColumnTransformer(transformers=[('num',numeric_transform,numeric_features),
                                            ('onhot',onhot_transformer,categorical_features)])

classifiers =[LogisticRegression(solver = "liblinear"),
              LogisticRegression(penalty='l1',solver = "liblinear"),
              KNeighborsClassifier(),
              DecisionTreeClassifier(criterion = 'entropy'),
              RandomForestClassifier(n_estimators=58,max_depth=28,max_features=10,min_samples_split=4,min_samples_leaf=1,criterion='gini')
              ]

In [193]:
for classifier in classifiers:
    pipe=Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))
    algo_predict=pipe.predict(X_test)
    cm_algo = confusion_matrix(y_test, algo_predict)
    algo_acc = accuracy_score(y_test, algo_predict)
    print('Validation Results')
    print("Recall score ",recall_score(y_test, algo_predict))
    print("F1 score",f1_score(y_test, algo_predict))
    print("roc auc score",roc_auc_score(y_test, algo_predict))
    print("Precision Score",precision_score(y_test, algo_predict))
    print("Accuracy score : ",algo_acc)
    



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
model score: 0.780
Validation Results
Recall score  0.5104131102765449
F1 score 0.6329381879762913
roc auc score 0.7249084215033044
Precision Score 0.8328690807799443
Accuracy score :  0.779943653392218
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
model score: 0.780
Validation Results
Recall score  0.5097985660635029
F1 score 0.6324438797119866
roc auc score 0.7246213481525401
Preci