In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from datetime import date
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

In [0]:
train=pd.read_csv("/content/gdrive/My Drive/ML/ClubMahindra/train.csv")
test=pd.read_csv("/content/gdrive/My Drive/ML/ClubMahindra/test.csv")

In [4]:
print(train.shape)
print(test.shape)

(341424, 24)
(146765, 23)


In [5]:
train.isnull().sum()

reservation_id                           0
booking_date                             0
checkin_date                             0
checkout_date                            0
channel_code                             0
main_product_code                        0
numberofadults                           0
numberofchildren                         0
persontravellingid                       0
resort_region_code                       0
resort_type_code                         0
room_type_booked_code                    0
roomnights                               0
season_holidayed_code                  114
state_code_residence                  4764
state_code_resort                        0
total_pax                                0
member_age_buckets                       0
booking_type_code                        0
memberid                                 0
cluster_code                             0
reservationstatusid_code                 0
resort_id                                0
amount_spen

In [6]:
test.isnull().sum()

reservation_id                 0
booking_date                   0
checkin_date                   0
checkout_date                  0
channel_code                   0
main_product_code              0
numberofadults                 0
numberofchildren               0
persontravellingid             0
resort_region_code             0
resort_type_code               0
room_type_booked_code          0
roomnights                     0
season_holidayed_code         35
state_code_residence        2260
state_code_resort              0
total_pax                      0
member_age_buckets             0
booking_type_code              0
memberid                       0
cluster_code                   0
reservationstatusid_code       0
resort_id                      0
dtype: int64

In [0]:
X=train.drop(['amount_spent_per_room_night_scaled'],axis=1)

In [8]:
df = pd.concat([X, test],sort=False,ignore_index=True)
from sklearn.utils import shuffle
df = shuffle(df)
df.shape

(488189, 23)

In [0]:
df['season_holidayed_code']=df['season_holidayed_code'].fillna(value=2)
df['state_code_residence']=df['state_code_residence'].fillna(value=8.0)

In [0]:
df['booking_date']=pd.to_datetime(df['booking_date'], format='%d/%m/%y')
df['checkin_date']=pd.to_datetime(df['checkin_date'], format='%d/%m/%y')
df['checkout_date']=pd.to_datetime(df['checkout_date'], format='%d/%m/%y')

In [0]:
df['no_of_days']=(df['checkout_date']-df['checkin_date']).dt.days
df['prior_days']=(df['checkin_date']-df['booking_date']).dt.days

In [0]:
def add_datepart(df, fldname, drop=True):
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
    df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [0]:
add_datepart(df, 'booking_date')
add_datepart(df, 'checkin_date')
add_datepart(df, 'checkout_date')

In [0]:
df['total_people']=df['numberofadults']+df['numberofchildren']

In [0]:
df=df.drop(['reservation_id','memberid'],axis=1)

In [16]:
a=df.columns
a.tolist()

['channel_code',
 'main_product_code',
 'numberofadults',
 'numberofchildren',
 'persontravellingid',
 'resort_region_code',
 'resort_type_code',
 'room_type_booked_code',
 'roomnights',
 'season_holidayed_code',
 'state_code_residence',
 'state_code_resort',
 'total_pax',
 'member_age_buckets',
 'booking_type_code',
 'cluster_code',
 'reservationstatusid_code',
 'resort_id',
 'no_of_days',
 'prior_days',
 'booking_Year',
 'booking_Month',
 'booking_Week',
 'booking_Day',
 'booking_Dayofweek',
 'booking_Dayofyear',
 'booking_Is_month_end',
 'booking_Is_month_start',
 'booking_Is_quarter_end',
 'booking_Is_quarter_start',
 'booking_Is_year_end',
 'booking_Is_year_start',
 'booking_Elapsed',
 'checkin_Year',
 'checkin_Month',
 'checkin_Week',
 'checkin_Day',
 'checkin_Dayofweek',
 'checkin_Dayofyear',
 'checkin_Is_month_end',
 'checkin_Is_month_start',
 'checkin_Is_quarter_end',
 'checkin_Is_quarter_start',
 'checkin_Is_year_end',
 'checkin_Is_year_start',
 'checkin_Elapsed',
 'checkout_

In [17]:
df['booking_Dayofyear'].value_counts().count()

366

In [18]:
df['checkin_Year'].value_counts()

2017    125762
2018    119443
2016    117802
2015    107143
2019     18025
2012        14
Name: checkin_Year, dtype: int64

In [19]:
df['checkout_Week'].head()

74601     21
33385      7
306782    16
309078    28
445230    12
Name: checkout_Week, dtype: int64

In [0]:
cat_vars = ['channel_code','main_product_code', 'persontravellingid','resort_region_code','resort_type_code','room_type_booked_code',
            'season_holidayed_code','state_code_residence','state_code_resort','member_age_buckets','booking_type_code','cluster_code',
            'reservationstatusid_code','resort_id','checkout_Year','checkin_Year','booking_Year','booking_Month','checkin_Month','checkout_Month',
           'booking_Dayofweek','checkin_Dayofweek','checkout_Dayofweek']

In [0]:
for col in cat_vars:
    df[col] = df[col].astype('str')

In [22]:
encoder = {}
for col in cat_vars:
    print ('Processing ', col)
    le = preprocessing.LabelEncoder()
    ohe = OneHotEncoder()    
    le.fit(df[col])
    df[col] = le.fit_transform(df[col])
    df_ohe = ohe.fit_transform(df[col].values.reshape(-1,1)).toarray()
    dfOneHot = pd.DataFrame(df_ohe, columns = [col+str(int(i)) for i in range(df_ohe.shape[1])])
    df= pd.concat([df, dfOneHot], axis=1)

Processing  channel_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  main_product_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  persontravellingid


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  resort_region_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  resort_type_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  room_type_booked_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  season_holidayed_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  state_code_residence


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  state_code_resort


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  member_age_buckets


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  booking_type_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  cluster_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  reservationstatusid_code


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  resort_id


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  checkout_Year


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  checkin_Year


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  booking_Year


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  booking_Month


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  checkin_Month


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  checkout_Month


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  booking_Dayofweek


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  checkin_Dayofweek


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Processing  checkout_Dayofweek


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [0]:
df=df.drop(cat_vars,axis=1)

In [0]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df_values = scaler.fit_transform(df.values)
# df_=pd.DataFrame(df_values,index=df.index, columns=df.columns)

In [0]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# df_values = scaler.fit_transform(df.values)
# df_=pd.DataFrame(df_values,index=df.index, columns=df.columns)

In [26]:
df.shape

(488189, 248)

In [0]:
X=df[:341424]
y=train['amount_spent_per_room_night_scaled']
test_=df[341424:]

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

In [0]:
random_state=42
lgb_params = {
    "objective" : "regression",
    "metric" : "rmse",
    "boosting": 'gbdt',
    "device_type":"gpu",
    "max_depth" : -1,
    "num_leaves" : 30,
    "learning_rate" : 0.006,
    "bagging_freq": 5,
    "bagging_fraction" : 0.8,
    "feature_fraction" : 0.9,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}

In [0]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
NFOLDS=5
kf=KFold(n_splits=NFOLDS,shuffle=True,random_state=random_state)
val_aucs = []
oof_train = np.zeros((len(X_train),))
oof_val = np.zeros((len(X_val),))
oof_test=np.zeros((len(test_),))
oof_test_skf = np.empty((NFOLDS, len(X_val)))
features=X.columns

In [86]:
# for  (trn_idx, val_idx) in (kf.split(X_train, y_train)):
p_valid,yp = 0,0
for fold, (train_index, val_index) in enumerate(kf.split(X,y)):
  print("Fold {}".format(fold))
#   N = 5
  
#   for i in range(N):

  X_train, X_val, y_train, y_val =X[features].iloc[train_index], X[features].iloc[val_index], y.iloc[train_index], y.iloc[val_index]


  trn_data = lgb.Dataset(X_train, label=y_train)
  val_data = lgb.Dataset(X_val, label=y_val)

  evals_result = {}
  lgb_clf = lgb.train(lgb_params,
                  trn_data,
                  10000,
                  valid_sets = [trn_data, val_data],
                  early_stopping_rounds=3000,
                  verbose_eval = 1000,
                  evals_result=evals_result
                 )
  p_valid = lgb_clf.predict(X_val)
#   oof_test_skf[i, :]    = lgb_clf.predict(test_,num_iteration=clf.best_iteration_)[:, 1]
  yp += lgb_clf.predict(test_, num_iteration=lgb_clf.best_iteration)

  val_score = mean_squared_error(y_val, p_valid)
  val_aucs.append(val_score)

# oof_test[:] = oof_test_skf.mean(axis=0)
  
#   p_valid += lgb_clf.predict(X_val)
#   yp += lgb_clf.predict(test_)

# val_score = mean_squared_error(y_val, p_valid)
# val_aucs.append(val_score)
    
#     predictions['fold{}'.format(fold+1)] = yp/N

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's rmse: 0.978863	valid_1's rmse: 0.982818
[2000]	training's rmse: 0.969805	valid_1's rmse: 0.97879
[3000]	training's rmse: 0.964176	valid_1's rmse: 0.977837
[4000]	training's rmse: 0.959315	valid_1's rmse: 0.977554
[5000]	training's rmse: 0.954852	valid_1's rmse: 0.977356
[6000]	training's rmse: 0.950661	valid_1's rmse: 0.977273
[7000]	training's rmse: 0.946646	valid_1's rmse: 0.977289
[8000]	training's rmse: 0.942739	valid_1's rmse: 0.977364
[9000]	training's rmse: 0.93893	valid_1's rmse: 0.977456
Early stopping, best iteration is:
[6226]	training's rmse: 0.949732	valid_1's rmse: 0.977248
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's rmse: 0.979096	valid_1's rmse: 0.982303
[2000]	training's rmse: 0.970106	valid_1's rmse: 0.978171
[3000]	training's rmse: 0.964525	valid_1's rmse: 0.977367
[4000]	training's rmse: 0.959668	valid_1's rmse: 0.976895
[5000]	traini

In [71]:
val_aucs

[0.956137644612614,
 0.9538180942717261,
 0.9514619597221198,
 0.9532228084371132,
 0.9628330722309405]

In [0]:
prediction=pd.DataFrame()
prediction['reservation_id']=test['reservation_id']
prediction['amount_spent_per_room_night_scaled']=yp/5
prediction.to_csv('submission_15.csv',index=False)

In [0]:
from google.colab import files
files.download('submission_15.csv')
# print("Hi")