In [None]:
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
import warnings
# Replace the standard warning hook with the no-op above, so code that calls
# ``warnings.warn`` after this point emits nothing for the rest of the session.
warnings.warn = warn

import numpy as np
import pandas as pd 
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from keras.models import Sequential
from keras.layers import Dense

In [None]:
# English month name -> zero-padded two-digit month number (as a string).
month = {
    'January' : "01",
    'February' : "02",
    'March' : "03",
    'April' : "04",
    'May' : "05",
    'June' : "06",
    'July' : "07",
    'August' : "08",
    'September' : "09",
    'October' : "10",
    'November' : "11",
    'December' : "12"
}

def mergeDate(df):
    """Add an ``arrival_date`` column ("YYYY-MM-DD") to ``df`` in place.

    ``arrival_date_month`` is rewritten from month names to two-digit strings
    using ``month``; ``arrival_date`` is then assembled from
    ``arrival_date_year``, the converted month and the zero-padded
    ``arrival_date_day_of_month``.

    The function is idempotent: values that are not keys of ``month`` (for
    example months already converted by an earlier call) are kept unchanged
    instead of being turned into NaN, so re-running a cell does not produce
    dates such as "2017-nan-05".

    Returns None; ``df`` is modified in place.
    """
    month_col = df['arrival_date_month']
    # map() yields NaN for anything that is not a month name; fall back to the
    # existing value in that case.
    df['arrival_date_month'] = month_col.map(month).fillna(month_col)
    df['arrival_date'] = (
        df['arrival_date_year'].astype(str)
        + "-" + df['arrival_date_month'].astype(str)
        + "-" + df['arrival_date_day_of_month'].astype(str).str.zfill(2)
    )

def predictRevenue(df):
    """Aggregate per-booking revenue into a daily revenue label.

    Rows with ``is_canceled`` >= 0.5 are discarded. For the remaining rows the
    revenue is ``(weekend nights + week nights) * adr``; revenues are summed
    per ``arrival_date`` (built by ``mergeDate``) and the daily total is
    divided by 10000 and floored.

    Returns a DataFrame indexed by ``arrival_date`` with one column ``label``.
    The caller's frame is not modified.
    """
    # Work on an explicit copy: the filtered frame is mutated below, and the
    # notebook globally silences the warning pandas would raise for that.
    kept = df[df.is_canceled < 0.5].copy()
    mergeDate(kept)
    kept['label'] = (kept['stays_in_weekend_nights'] + kept['stays_in_week_nights']) * kept['adr']
    # Group by column name so the key becomes the index and is never summed.
    daily = kept[['label', 'arrival_date']].groupby('arrival_date').agg('sum')
    daily['label'] = np.floor(daily['label'] / 10000)
    return daily

In [None]:
def cleanData(df,removed=True, model='canceled'):
    """Prepare the training frame and extract the target column.

    Parameters
    ----------
    df : DataFrame
        Training data. Two helper columns (``duration``, ``booking_size``) are
        added to this object in place, so pass a copy to keep the original.
    removed : bool
        When true, drop outlier / implausible rows (see the filters below).
    model : {'canceled', 'adr'}
        Which target to return: ``is_canceled`` or ``adr``.

    Returns
    -------
    (features, y) : features has the ID, reservation-status, ``is_canceled``
    and ``adr`` columns removed, missing ``country`` replaced by '' and every
    other missing value replaced by 0; ``y`` is the target column of the rows
    that survived filtering.

    Raises
    ------
    ValueError
        If ``model`` is not 'canceled' or 'adr' (previously this surfaced as
        an unbound-variable error at the return statement).
    """
    target_columns = {'canceled': 'is_canceled', 'adr': 'adr'}
    if model not in target_columns:
        raise ValueError("model must be 'canceled' or 'adr', got %r" % (model,))

    # NOTE: the original called df.children.dropna() here and discarded the
    # result, which had no effect. Rows with a missing `children` value are
    # therefore still present: they fail the `children <= 3` filter when
    # `removed` is true, otherwise their NaN is zero-filled at the end.
    df['duration']=df['stays_in_week_nights']+df['stays_in_weekend_nights']
    df['booking_size']=df['adults']+df['children']

    if removed:
        # Upper ADR bound: mean + 3 standard deviations of the unfiltered column.
        upper_lim = df['adr'].mean() + df['adr'].std() * 3
        df=df[df.adr < upper_lim]
        df=df[df.adr >= -5]
        df=df[df.stays_in_weekend_nights <= 6]
        df=df[df.previous_cancellations <= 10]
        df=df[df.booking_size != 0]
        df=df[df.duration != 0]
        df=df[df.adults  <= 6]
        df=df[df.children <= 3]
        df=df[df.babies <= 2]

    # Take the target after filtering so it stays aligned with the features.
    y=df[target_columns[model]]

    # Identifier, both possible targets, and the reservation-status columns are
    # excluded from the feature matrix.
    drop_list = ['ID','reservation_status','reservation_status_date','is_canceled', 'adr']
    df=df.drop(drop_list,axis=1)
    df['country']=df['country'].fillna('')
    df=df.fillna(0)
    return df,y

def cleanTestData(df):
    """Prepare the test frame so that it has the same derived columns as the training features.

    Adds ``duration`` (week nights + weekend nights) and ``booking_size``
    (adults + children) to ``df`` in place, then returns a new frame without
    ``ID`` in which missing ``country`` values are '' and all other missing
    values are 0.
    """
    df['duration'] = df['stays_in_week_nights'].add(df['stays_in_weekend_nights'])
    df['booking_size'] = df['adults'].add(df['children'])

    cleaned = df.drop(columns=['ID'])
    # Country is categorical, so it gets an empty string rather than 0.
    cleaned = cleaned.assign(country=cleaned['country'].fillna(''))
    return cleaned.fillna(0)

In [None]:
def labelEncoding(df):
    """Replace every object-dtype column of ``df`` with integer labels, in place.

    A single ``LabelEncoder`` instance is re-fitted on each column in turn, so
    the learned classes of earlier columns are not retained. Returns None.
    """
    encoder = LabelEncoder()
    object_columns = list(df.select_dtypes('object').columns)
    for name in object_columns:
        df[name] = encoder.fit_transform(df[name])
    return None

def oneHotEncoding(df):
    """Return a one-hot encoded version of ``df``.

    The date-part, ``company``, ``agent`` and ``country`` columns are first
    cast to strings (in place on ``df``) so that ``pd.get_dummies`` treats
    them as categories along with the columns that were already non-numeric.
    """
    categorical_columns = ('arrival_date_year', 'arrival_date_week_number',
                           'arrival_date_day_of_month', 'company', 'agent', 'country')
    as_text = {name: df[name].astype(str) for name in categorical_columns}
    for name in categorical_columns:
        df[name] = as_text[name]
    return pd.get_dummies(df)

def maxAbsScale(df):
    """Return a new DataFrame with every column of ``df`` scaled by ``MaxAbsScaler``.

    The original version bound the scaled array to a local name and returned
    None, so the caller's data was never scaled. ``df`` itself is left
    untouched; index and column labels are carried over to the result.
    """
    scaled = MaxAbsScaler().fit_transform(df)
    return pd.DataFrame(np.asarray(scaled), index=df.index, columns=df.columns)

def standardScale(df):
    """Return a new DataFrame with every column of ``df`` standardised by ``StandardScaler``.

    Same contract as ``maxAbsScale``: the input is not modified and the scaled
    copy is returned (the original version discarded the result).
    """
    scaled = StandardScaler().fit_transform(df)
    return pd.DataFrame(np.asarray(scaled), index=df.index, columns=df.columns)

def preprocessing(x_raw, x_test_raw, model='canceled', removed=True, encoding='label', scaling='max-abs'):
    """Clean, encode and scale the train and test tables together.

    Parameters
    ----------
    x_raw, x_test_raw : DataFrame
        Raw train / test tables; both are copied before cleaning.
    model : {'canceled', 'adr'}
        Target passed through to ``cleanData``.
    removed : bool
        Whether ``cleanData`` applies its row filters.
    encoding : {'label', 'one-hot'}
        Categorical encoding; any other value skips encoding.
    scaling : {'standard', 'max-abs'}
        Feature scaling; any other value skips scaling.

    Returns
    -------
    (x, y, x_test) : encoded and scaled train features, train target, and
    encoded and scaled test features.

    Train and test rows are stacked before encoding and scaling so that both
    share the same label / dummy columns; note this also means the scaler
    statistics are computed over train and test rows together.
    """
    #Clean data
    x, y=cleanData(x_raw.copy(), removed=removed, model=model)
    x_len=x.shape[0]

    x_test=cleanTestData(x_test_raw.copy())

    x=pd.concat([x,x_test],axis=0)

    if encoding == 'label':
        labelEncoding(x)  # encodes in place
    elif encoding == 'one-hot':
        x=oneHotEncoding(x)

    # The scalers return a scaled copy; it must be assigned back, otherwise the
    # models receive unscaled features.
    if scaling == 'standard':
        x=standardScale(x)
    elif scaling == 'max-abs':
        x=maxAbsScale(x)

    # Split the stacked frame back by position: first x_len rows are train.
    x_test=x.iloc[x_len:]
    x=x.iloc[:x_len]
    return x, y, x_test

def nnAdrModel(dim,layers):
    """Build and compile a dense Keras network for ADR regression.

    ``dim`` is the number of input features and ``layers`` the width of each
    hidden ReLU layer, in order. The output is a single linear unit; the model
    is compiled with mean-squared-error loss and the Adam optimizer.
    """
    net = Sequential()
    for position, units in enumerate(layers):
        # Only the first hidden layer declares the input width.
        first_layer_kwargs = {'input_dim': dim} if position == 0 else {}
        net.add(Dense(units, activation='relu', **first_layer_kwargs))
    net.add(Dense(1))
    net.compile(loss='mean_squared_error', optimizer='adam')
    return net

def nnCanceledModel(dim,layers):
    """Build and compile a dense Keras network for cancellation classification.

    ``dim`` is the number of input features and ``layers`` the width of each
    hidden ReLU layer, in order. The output is a single sigmoid unit; the
    model is compiled with binary cross-entropy loss and the Adam optimizer.
    """
    net = Sequential()
    for position, units in enumerate(layers):
        # Only the first hidden layer declares the input width.
        first_layer_kwargs = {'input_dim': dim} if position == 0 else {}
        net.add(Dense(units, activation='relu', **first_layer_kwargs))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam')
    return net

## Random Forest
### Compare different n_estimators

In [None]:
# Load the raw train / test tables.
x_raw = pd.read_csv('./raw/train.csv')
x_test_raw = pd.read_csv('./raw/test.csv')

# One feature matrix per task; x_test from the second call replaces the first.
x_canceled, y_canceled, x_test = preprocessing(
    x_raw, x_test_raw, model='canceled', removed=True, encoding='label', scaling='max-abs')
x_adr, y_adr, x_test = preprocessing(
    x_raw, x_test_raw, model='adr', removed=True, encoding='label', scaling='max-abs')

# Three regressors (adr) followed by three classifiers (canceled); only
# n_estimators varies.
models = [RandomForestRegressor(n_estimators=n, min_samples_split=2) for n in (100, 300, 500)]
models += [RandomForestClassifier(n_estimators=n, min_samples_split=2) for n in (100, 300, 500)]

# Index of the first model of each task -> header printed before it.
section_headers = {0: 'adr:', 3: 'canceled:'}

for i, model in enumerate(models):
    if i in section_headers:
        print(section_headers[i])

    cur_type = 'adr' if i < 3 else 'canceled'
    # Dataset and metric for this task: MSE for adr, accuracy for canceled.
    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    # Three random 80/20 splits; train and validation scores are averaged.
    val_scores, train_scores = [], []
    for j in range(3):
        x_train, x_val, y_train, y_val = train_test_split(
            features, target, test_size=0.2, shuffle=True)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_val)
        val_score = metric(y_pred, y_val)
        val_scores.append(val_score)

        y_pred = model.predict(x_train)
        train_score = metric(y_pred, y_train)
        train_scores.append(train_score)

    print('e_in=', sum(train_scores) / len(train_scores))
    print('e_val(random 20%)=', sum(val_scores) / len(val_scores))

    # Unshuffled split: the final 20% of the rows form the validation set.
    x_train, x_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, shuffle=False)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    val_score = metric(y_pred, y_val)

    print('e_val(last 20%)=', val_score)
    print('======')

### Compare different min_samples_split

In [None]:
# Load the raw train / test tables.
x_raw = pd.read_csv('./raw/train.csv')
x_test_raw = pd.read_csv('./raw/test.csv')

# One feature matrix per task; x_test from the second call replaces the first.
x_canceled, y_canceled, x_test = preprocessing(
    x_raw, x_test_raw, model='canceled', removed=True, encoding='label', scaling='max-abs')
x_adr, y_adr, x_test = preprocessing(
    x_raw, x_test_raw, model='adr', removed=True, encoding='label', scaling='max-abs')

# Three regressors (adr) followed by three classifiers (canceled); only
# min_samples_split varies.
models = [RandomForestRegressor(n_estimators=100, min_samples_split=s) for s in (2, 10, 25)]
models += [RandomForestClassifier(n_estimators=100, min_samples_split=s) for s in (2, 10, 25)]

# Index of the first model of each task -> header printed before it.
section_headers = {0: 'adr:', 3: 'canceled:'}

for i, model in enumerate(models):
    if i in section_headers:
        print(section_headers[i])

    cur_type = 'adr' if i < 3 else 'canceled'
    # Dataset and metric for this task: MSE for adr, accuracy for canceled.
    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    # Three random 80/20 splits; train and validation scores are averaged.
    val_scores, train_scores = [], []
    for j in range(3):
        x_train, x_val, y_train, y_val = train_test_split(
            features, target, test_size=0.2, shuffle=True)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_val)
        val_score = metric(y_pred, y_val)
        val_scores.append(val_score)

        y_pred = model.predict(x_train)
        train_score = metric(y_pred, y_train)
        train_scores.append(train_score)

    print('e_in=', sum(train_scores) / len(train_scores))
    print('e_val(random 20%)=', sum(val_scores) / len(val_scores))

    # Unshuffled split: the final 20% of the rows form the validation set.
    x_train, x_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, shuffle=False)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    val_score = metric(y_pred, y_val)

    print('e_val(last 20%)=', val_score)
    print('======')

## Neural Network
### Compare different layers

In [None]:
#Read csv
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')

#Preprocessing data
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=True,
                                           encoding='label', scaling='max-abs')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=True,
                                 encoding='label', scaling='max-abs')

# (task, hidden-layer widths). Networks are built from these specs right before
# each fit: Keras `fit` continues from the current weights, so reusing one
# instance for a second split would evaluate on rows it was already trained on.
model_specs=[('adr', [15,15]),
             ('adr', [15,15,15,15]),
             ('adr', [30,30]),
             ('adr', [30,30,30,30]),
             ('canceled', [15,15]),
             ('canceled', [15,15,15,15]),
             ('canceled', [30,30]),
             ('canceled', [30,30,30,30])]

epochs=50
batch_size=50
n_random_splits=1  # number of random 80/20 splits averaged per configuration
prev_type=None
for cur_type,layers in model_specs:
    if cur_type != prev_type:
        print(cur_type + ':')
        prev_type=cur_type

    # Dataset, model factory and metric for this task.
    if cur_type == 'adr':
        features, target, build, metric = x_adr, y_adr, nnAdrModel, mean_squared_error
    else:
        features, target, build, metric = x_canceled, y_canceled, nnCanceledModel, accuracy_score

    #Random 20% for validation, mean over n_random_splits runs
    val_scores=[]
    train_scores=[]
    for j in range(n_random_splits):
        x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=True)

        model=build(dim=features.shape[1], layers=layers)
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        scores={}
        for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
            y_pred=model.predict(x_part, verbose=0).ravel()
            if cur_type == 'canceled':
                # Equivalent of the removed Sequential.predict_classes for a
                # single sigmoid output: threshold the probability at 0.5.
                y_pred=(y_pred > 0.5).astype('int32')
            scores[split_name]=metric(y_part, y_pred)
        val_scores.append(scores['val'])
        train_scores.append(scores['train'])

    print('e_in=', sum(train_scores) / len(train_scores))
    print('e_val(random 20%)=', sum(val_scores) / len(val_scores))

    #Last 20% for validation, using a freshly initialised network
    x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

    model=build(dim=features.shape[1], layers=layers)
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    y_pred=model.predict(x_val, verbose=0).ravel()
    if cur_type == 'canceled':
        y_pred=(y_pred > 0.5).astype('int32')
    val_score=metric(y_val, y_pred)

    print('e_val(last 20%)=', val_score)
    print('======')

### Compare different epochs

In [None]:
#Read csv
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')

#Preprocessing data
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=True,
                                           encoding='label', scaling='max-abs')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=True,
                                 encoding='label', scaling='max-abs')

# Same architecture for every run; only the number of epochs changes.
layers=[30,30,30,30]
epochs=[25,50,100]
batch_size=50

for cur_type in ('adr', 'canceled'):
    print(cur_type + ':')

    # Dataset, model factory and metric for this task.
    if cur_type == 'adr':
        features, target, build, metric = x_adr, y_adr, nnAdrModel, mean_squared_error
    else:
        features, target, build, metric = x_canceled, y_canceled, nnCanceledModel, accuracy_score

    # Iterating over `epochs` directly pairs each run with exactly one epoch
    # count; the former `epochs[i % 2]` lookup never selected 100 and mixed up
    # which model got which setting.
    for n_epochs in epochs:
        #Random 20% for validation, mean of 3 times
        val_scores=[]
        train_scores=[]
        for j in range(3):
            x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=True)

            # Build a fresh network per fit: Keras `fit` resumes from the
            # current weights, so a reused instance would accumulate epochs
            # and be scored on rows seen in an earlier split.
            model=build(dim=features.shape[1], layers=layers)
            model.fit(x_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose=0)

            scores={}
            for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
                y_pred=model.predict(x_part, verbose=0).ravel()
                if cur_type == 'canceled':
                    # Equivalent of the removed Sequential.predict_classes for
                    # a single sigmoid output: threshold at 0.5.
                    y_pred=(y_pred > 0.5).astype('int32')
                scores[split_name]=metric(y_part, y_pred)
            val_scores.append(scores['val'])
            train_scores.append(scores['train'])

        print('e_in=', sum(train_scores) / len(train_scores))
        print('e_val(random 20%)=', sum(val_scores) / len(val_scores))

        #Last 20% for validation, using a freshly initialised network
        x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

        model=build(dim=features.shape[1], layers=layers)
        model.fit(x_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose=0)

        y_pred=model.predict(x_val, verbose=0).ravel()
        if cur_type == 'canceled':
            y_pred=(y_pred > 0.5).astype('int32')
        val_score=metric(y_val, y_pred)

        print('e_val(last 20%)=', val_score)
        print('======')

## Linear Regression / Logistic Regression
### Compare different regularizers and C

In [None]:
# Load the raw train / test tables.
x_raw = pd.read_csv('./raw/train.csv')
x_test_raw = pd.read_csv('./raw/test.csv')

# One feature matrix per task; x_test from the second call replaces the first.
x_canceled, y_canceled, x_test = preprocessing(
    x_raw, x_test_raw, model='canceled', removed=True, encoding='label', scaling='max-abs')
x_adr, y_adr, x_test = preprocessing(
    x_raw, x_test_raw, model='adr', removed=True, encoding='label', scaling='max-abs')

# One linear regressor for adr, then logistic regressions for canceled with
# L2 penalty at three strengths and finally without a penalty.
models = [LinearRegression()]
models += [LogisticRegression(penalty='l2', C=c) for c in (0.1, 1, 10)]
models.append(LogisticRegression(penalty='none'))

# Index of the first model of each task -> header printed before it.
section_headers = {0: 'adr:', 1: 'canceled:'}

for i, model in enumerate(models):
    if i in section_headers:
        print(section_headers[i])

    cur_type = 'adr' if i < 1 else 'canceled'
    # Dataset and metric for this task: MSE for adr, accuracy for canceled.
    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    # Three random 80/20 splits; train and validation scores are averaged.
    val_scores, train_scores = [], []
    for j in range(3):
        x_train, x_val, y_train, y_val = train_test_split(
            features, target, test_size=0.2, shuffle=True)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_val)
        val_score = metric(y_pred, y_val)
        val_scores.append(val_score)

        y_pred = model.predict(x_train)
        train_score = metric(y_pred, y_train)
        train_scores.append(train_score)

    print('e_in=', sum(train_scores) / len(train_scores))
    print('e_val(random 20%)=', sum(val_scores) / len(val_scores))

    # Unshuffled split: the final 20% of the rows form the validation set.
    x_train, x_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, shuffle=False)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    val_score = metric(y_pred, y_val)

    print('e_val(last 20%)=', val_score)
    print('======')

## Removing noise

In [None]:
#Read csv
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')

#Preprocessing data (removed=False: cleanData's row filters are skipped)
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=False,
                                           encoding='label', scaling='max-abs')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=False,
                                 encoding='label', scaling='max-abs')

# (task, estimator) pairs. Carrying the task with the model avoids index
# bookkeeping that breaks whenever an entry is added or removed.
models=[('adr', RandomForestRegressor(n_estimators=300)),
        ('adr', nnAdrModel(dim=x_adr.shape[1], layers=[30,30,30,30])),
        ('adr', LinearRegression()),
        ('canceled', RandomForestClassifier(n_estimators=100)),
        ('canceled', nnCanceledModel(dim=x_canceled.shape[1], layers=[30,30,30,30])),
        ('canceled', LogisticRegression(penalty='l2', C=1))]

epochs=50
batch_size=50
prev_type=None
for cur_type,model in models:
    if cur_type != prev_type:
        print(cur_type + ':')
        prev_type=cur_type
    # Keras networks need epochs / batch_size / verbose; sklearn estimators do not.
    isNN=isinstance(model, Sequential)

    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    #Last 20% for validation
    x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

    if isNN:
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    else:
        model.fit(x_train, y_train)

    #Train and validation scores
    scores={}
    for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
        if isNN:
            y_pred=model.predict(x_part, verbose=0).ravel()
            if cur_type == 'canceled':
                # Equivalent of the removed Sequential.predict_classes for a
                # single sigmoid output: threshold the probability at 0.5.
                y_pred=(y_pred > 0.5).astype('int32')
        else:
            y_pred=model.predict(x_part)
        scores[split_name]=metric(y_part, y_pred)

    print('e_in=', scores['train'])
    print('e_val=', scores['val'])
    print('======')

## Encoding methods

In [None]:
#Read csv
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')

#Preprocessing data (one-hot instead of label encoding)
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=True,
                                           encoding='one-hot', scaling='max-abs')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=True,
                                 encoding='one-hot', scaling='max-abs')

# (task, estimator) pairs. Carrying the task with the model avoids index
# bookkeeping that breaks whenever an entry is added or removed.
# NOTE(review): the random-forest regressor is disabled for this experiment in
# the original notebook; the reason is not recorded - TODO confirm.
models=[#('adr', RandomForestRegressor(n_estimators=300)),
        ('adr', nnAdrModel(dim=x_adr.shape[1], layers=[30,30,30,30])),
        ('adr', LinearRegression()),
        ('canceled', RandomForestClassifier(n_estimators=100)),
        ('canceled', nnCanceledModel(dim=x_canceled.shape[1], layers=[30,30,30,30])),
        ('canceled', LogisticRegression(penalty='l2', C=1))]

epochs=50
batch_size=50
prev_type=None
for cur_type,model in models:
    if cur_type != prev_type:
        print(cur_type + ':')
        prev_type=cur_type
    # Keras networks need epochs / batch_size / verbose; sklearn estimators do not.
    isNN=isinstance(model, Sequential)

    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    #Last 20% for validation
    x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

    if isNN:
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    else:
        model.fit(x_train, y_train)

    #Train and validation scores
    scores={}
    for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
        if isNN:
            y_pred=model.predict(x_part, verbose=0).ravel()
            if cur_type == 'canceled':
                # Equivalent of the removed Sequential.predict_classes for a
                # single sigmoid output: threshold the probability at 0.5.
                y_pred=(y_pred > 0.5).astype('int32')
        else:
            y_pred=model.predict(x_part)
        scores[split_name]=metric(y_part, y_pred)

    print('e_in=', scores['train'])
    print('e_val=', scores['val'])
    print('======')

In [None]:
#Read csv
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')

#Preprocessing data (standard scaling instead of max-abs)
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=True,
                                           encoding='label', scaling='standard')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=True,
                                 encoding='label', scaling='standard')

# (task, estimator) pairs. Carrying the task with the model avoids index
# bookkeeping that breaks whenever an entry is added or removed.
models=[('adr', RandomForestRegressor(n_estimators=300)),
        ('adr', nnAdrModel(dim=x_adr.shape[1], layers=[30,30,30,30])),
        ('adr', LinearRegression()),
        ('canceled', RandomForestClassifier(n_estimators=100)),
        ('canceled', nnCanceledModel(dim=x_canceled.shape[1], layers=[30,30,30,30])),
        ('canceled', LogisticRegression(penalty='l2', C=1))]

epochs=50
batch_size=50
prev_type=None
for cur_type,model in models:
    if cur_type != prev_type:
        print(cur_type + ':')
        prev_type=cur_type
    # Keras networks need epochs / batch_size / verbose; sklearn estimators do not.
    isNN=isinstance(model, Sequential)

    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    #Last 20% for validation
    x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

    if isNN:
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    else:
        model.fit(x_train, y_train)

    #Train and validation scores
    scores={}
    for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
        if isNN:
            y_pred=model.predict(x_part, verbose=0).ravel()
            if cur_type == 'canceled':
                # Equivalent of the removed Sequential.predict_classes for a
                # single sigmoid output: threshold the probability at 0.5.
                y_pred=(y_pred > 0.5).astype('int32')
        else:
            y_pred=model.predict(x_part)
        scores[split_name]=metric(y_part, y_pred)

    print('e_in=', scores['train'])
    print('e_val=', scores['val'])
    print('======')

In [None]:
# Load the raw tables and keep only 'Resort Hotel' bookings in the training set.
x_raw = pd.read_csv('./raw/train.csv')
x_test_raw = pd.read_csv('./raw/test.csv')
x_raw = x_raw.loc[x_raw['hotel'] == 'Resort Hotel']

# Run the standard pipeline on the filtered training rows.
x_canceled, y_canceled, x_test = preprocessing(
    x_raw, x_test_raw, model='canceled', removed=True, encoding='label', scaling='standard')
# Number of training rows left after cleaning.
len(x_canceled)

In [None]:
# Load the raw tables and keep only 'City Hotel' bookings in the training set.
x_raw = pd.read_csv('./raw/train.csv')
x_test_raw = pd.read_csv('./raw/test.csv')
x_raw = x_raw.loc[x_raw['hotel'] == 'City Hotel']

# Run the standard pipeline on the filtered training rows.
x_canceled, y_canceled, x_test = preprocessing(
    x_raw, x_test_raw, model='canceled', removed=True, encoding='label', scaling='standard')
# Number of training rows left after cleaning.
len(x_canceled)

In [None]:
#Read csv (training rows restricted to 'City Hotel')
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')
x_raw=x_raw[x_raw.hotel=='City Hotel']

#Preprocessing data
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=True,
                                           encoding='label', scaling='standard')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=True,
                                 encoding='label', scaling='standard')

# (task, estimator) pairs. Carrying the task with the model avoids index
# bookkeeping that breaks whenever an entry is added or removed.
models=[('adr', RandomForestRegressor(n_estimators=300)),
        ('adr', nnAdrModel(dim=x_adr.shape[1], layers=[30,30,30,30])),
        ('adr', LinearRegression()),
        ('canceled', RandomForestClassifier(n_estimators=100)),
        ('canceled', nnCanceledModel(dim=x_canceled.shape[1], layers=[30,30,30,30])),
        ('canceled', LogisticRegression(penalty='l2', C=1))]

epochs=50
batch_size=50
prev_type=None
for cur_type,model in models:
    if cur_type != prev_type:
        print(cur_type + ':')
        prev_type=cur_type
    # Keras networks need epochs / batch_size / verbose; sklearn estimators do not.
    isNN=isinstance(model, Sequential)

    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    #Last 20% for validation
    x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

    if isNN:
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    else:
        model.fit(x_train, y_train)

    #Train and validation scores
    scores={}
    for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
        if isNN:
            y_pred=model.predict(x_part, verbose=0).ravel()
            if cur_type == 'canceled':
                # Equivalent of the removed Sequential.predict_classes for a
                # single sigmoid output: threshold the probability at 0.5.
                y_pred=(y_pred > 0.5).astype('int32')
        else:
            y_pred=model.predict(x_part)
        scores[split_name]=metric(y_part, y_pred)

    print('e_in=', scores['train'])
    print('e_val=', scores['val'])
    print('======')

In [None]:
#Read csv (training rows restricted to 'Resort Hotel')
x_raw=pd.read_csv('./raw/train.csv')
x_test_raw=pd.read_csv('./raw/test.csv')
x_raw=x_raw[x_raw.hotel=='Resort Hotel']

#Preprocessing data
x_canceled,y_canceled,x_test=preprocessing(x_raw, x_test_raw, model='canceled', removed=True,
                                           encoding='label', scaling='standard')
x_adr,y_adr,x_test=preprocessing(x_raw, x_test_raw, model='adr', removed=True,
                                 encoding='label', scaling='standard')

# (task, estimator) pairs. Carrying the task with the model avoids index
# bookkeeping that breaks whenever an entry is added or removed.
models=[('adr', RandomForestRegressor(n_estimators=300)),
        ('adr', nnAdrModel(dim=x_adr.shape[1], layers=[30,30,30,30])),
        ('adr', LinearRegression()),
        ('canceled', RandomForestClassifier(n_estimators=100)),
        ('canceled', nnCanceledModel(dim=x_canceled.shape[1], layers=[30,30,30,30])),
        ('canceled', LogisticRegression(penalty='l2', C=1))]

epochs=50
batch_size=50
prev_type=None
for cur_type,model in models:
    if cur_type != prev_type:
        print(cur_type + ':')
        prev_type=cur_type
    # Keras networks need epochs / batch_size / verbose; sklearn estimators do not.
    isNN=isinstance(model, Sequential)

    if cur_type == 'adr':
        features, target, metric = x_adr, y_adr, mean_squared_error
    else:
        features, target, metric = x_canceled, y_canceled, accuracy_score

    #Last 20% for validation
    x_train, x_val, y_train, y_val=train_test_split(features, target, test_size=0.2, shuffle=False)

    if isNN:
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    else:
        model.fit(x_train, y_train)

    #Train and validation scores
    scores={}
    for split_name,x_part,y_part in (('train', x_train, y_train), ('val', x_val, y_val)):
        if isNN:
            y_pred=model.predict(x_part, verbose=0).ravel()
            if cur_type == 'canceled':
                # Equivalent of the removed Sequential.predict_classes for a
                # single sigmoid output: threshold the probability at 0.5.
                y_pred=(y_pred > 0.5).astype('int32')
        else:
            y_pred=model.predict(x_part)
        scores[split_name]=metric(y_part, y_pred)

    print('e_in=', scores['train'])
    print('e_val=', scores['val'])
    print('======')