In [1]:
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

import time
import numpy as np

  from numpy.core.umath_tests import inner1d


In [2]:
def display_scores(scores):
    print("Scores:",scores)
    print("Mean:", scores.mean())
    print("Std:", scores.std())

In [24]:
df = pd.read_csv("data/trivago_cleaned_dummy.csv",delimiter=",")
df_test = pd.read_csv("data/X_test_all_cat.csv",delimiter=",")

In [25]:
df_test.head()

Unnamed: 0,row_num,locale_1,locale_2,locale_3,locale_4,locale_5,locale_6,day_of_week_1,day_of_week_2,day_of_week_3,...,traffic_type_5,traffic_type_6,traffic_type_7,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids
0,988681,0,0,0,0,0,1,1,0,0,...,1,0,0,7037,10000,10001,0,0,0,2
1,988666,0,0,1,0,0,0,0,0,0,...,0,0,0,5189,10001,10002,10003,0,0,3
2,988665,0,0,0,1,0,0,0,0,0,...,0,0,0,5,10004,10001,0,0,0,2
3,988664,0,0,1,0,0,0,0,1,0,...,0,0,0,8041,10001,10005,10006,0,0,3
4,988663,0,0,1,0,0,0,0,0,0,...,0,0,0,117,10007,10001,10008,0,0,3


In [26]:
X_test = df_test.drop(columns='row_num', axis =1)
df.drop(columns='row_num', inplace=True)
print(df.shape)
df.head()

(619235, 14)


Unnamed: 0,locale,day_of_week,hour_of_day,agent_id,entry_page,traffic_type,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids,hits
0,2,4,22,1,1,2,49,10000,10001,0,0,0,2.0,14
1,4,6,21,5,3,1,1892,10001,10002,0,0,0,2.0,14
2,3,6,19,4,1,6,0,10003,0,0,0,0,1.0,1
3,2,2,6,1,2,1,2,10004,10001,0,0,0,2.0,3
4,3,1,1,4,3,1,0,10001,0,0,0,0,1.0,2


The below class was used to experiment with the data to try out various combinations of encoding and models, to see which would work best

In [27]:
class Dataprep(BaseEstimator, TransformerMixin):
    def __init__(self, encode_locale=False, encode_day_of_week=False, 
                 encode_agent_id=False, encode_entry_page=False, encode_traffic_type=False, encode_all=False):
        self.encode_all = encode_all
        if encode_all:
            self.encode_locale = True
            self.encode_day_of_week = True
            self.encode_agent_id = True
            self.encode_entry_page = True
            self.encode_traffic_type = True
        else:
            self.encode_locale = encode_locale
            self.encode_day_of_week = encode_day_of_week
            self.encode_agent_id = encode_agent_id
            self.encode_entry_page = encode_entry_page
            self.encode_traffic_type = encode_traffic_type
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        encoder = LabelBinarizer()
        if self.encode_locale:
            locale_1hot = encoder.fit_transform(X['locale'])
            for i in range(locale_1hot.shape[1]):
                X['locale_'+str(i+1)] = locale_1hot[:,i]
            cols1 = X.columns[len(X.columns)-locale_1hot.shape[1]:]
        else: cols1 =['locale']
            
        if self.encode_day_of_week:
            day_of_week_1hot = encoder.fit_transform(X['day_of_week'])
            for i in range(day_of_week_1hot.shape[1]):
                X['day_of_week_'+str(i+1)] = day_of_week_1hot[:,i]
            cols2 = X.columns[len(X.columns)-day_of_week_1hot.shape[1]:]
        else: cols2 =['day_of_week']
            
        if self.encode_agent_id:
            agent_id_1hot = encoder.fit_transform(X['agent_id'])
            for i in range(agent_id_1hot.shape[1]):
                X['agent_id_'+str(i+1)] = agent_id_1hot[:,i]
                cols3 = X.columns[len(X.columns)-agent_id_1hot.shape[1]:]
        else: cols3 =['agent_id']
            
        if self.encode_entry_page:
            entry_page_1hot = encoder.fit_transform(X['entry_page'])
            for i in range(entry_page_1hot.shape[1]):
                X['entry_page_'+str(i+1)] = entry_page_1hot[:,i]
            cols4 = X.columns[len(X.columns)-entry_page_1hot.shape[1]:]
        else: cols4 =['entry_page']
            
        if self.encode_traffic_type:
            traffic_type_1hot = encoder.fit_transform(X['traffic_type'])
            for i in range(traffic_type_1hot.shape[1]):
                X['traffic_type_'+str(i+1)] = traffic_type_1hot[:,i]
            cols5 = X.columns[len(X.columns)-traffic_type_1hot.shape[1]:]
        else: cols5 =['traffic_type']  

        cols=[]
        cols.extend(cols1)
        cols.extend(cols2)
        cols.extend(['hour_of_day'])
        cols.extend(cols3)
        cols.extend(cols4)
        cols.extend(cols5)
        cols.extend(['session_duration', 'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids'])
        return X[cols]


In [28]:
X_data = df.drop(columns=['hits'])
Y_data = df['hits'].copy()

In [29]:
X_data.head()

Unnamed: 0,locale,day_of_week,hour_of_day,agent_id,entry_page,traffic_type,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids
0,2,4,22,1,1,2,49,10000,10001,0,0,0,2.0
1,4,6,21,5,3,1,1892,10001,10002,0,0,0,2.0
2,3,6,19,4,1,6,0,10003,0,0,0,0,1.0
3,2,2,6,1,2,1,2,10004,10001,0,0,0,2.0
4,3,1,1,4,3,1,0,10001,0,0,0,0,1.0


In [30]:
attr_data = Dataprep(encode_all=True)
X_data=attr_data.transform(X_data)
X_data.head()

Unnamed: 0,locale_1,locale_2,locale_3,locale_4,locale_5,locale_6,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,...,traffic_type_5,traffic_type_6,traffic_type_7,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,49,10000,10001,0,0,0,2.0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1892,10001,10002,0,0,0,2.0
2,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,10003,0,0,0,0,1.0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,2,10004,10001,0,0,0,2.0
4,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,10001,0,0,0,0,1.0


In [31]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X_data[['hour_of_day','session_duration', 'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids']])#
standardized_data = pd.DataFrame(data=scaled_data, columns=['hour_of_day','session_duration','path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids'])
X_data[['hour_of_day','session_duration', 'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids']] = standardized_data[['hour_of_day','session_duration', 'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids']]
X_data.head()

Unnamed: 0,locale_1,locale_2,locale_3,locale_4,locale_5,locale_6,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,...,traffic_type_5,traffic_type_6,traffic_type_7,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,-0.197157,-0.478078,0.229254,-0.355613,-0.185178,-0.12278,-0.06404
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0.560436,-0.477827,0.229486,-0.355613,-0.185178,-0.12278,-0.06404
2,0,0,1,0,0,0,0,0,0,0,...,1,0,0,-0.217299,-0.477326,-2.087439,-0.355613,-0.185178,-0.12278,-1.075777
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,-0.216477,-0.477075,0.229254,-0.355613,-0.185178,-0.12278,-0.06404
4,0,0,1,0,0,0,1,0,0,0,...,0,0,0,-0.217299,-0.477827,-2.087439,-0.355613,-0.185178,-0.12278,-1.075777


In [32]:
scaled_data = scaler.transform(X_test[['hour_of_day','session_duration', 
                                       'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids']])#
standardized_data = pd.DataFrame(data=scaled_data, columns=['hour_of_day','session_duration'
                                                            ,'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids'])
X_test[['hour_of_day','session_duration', 
        'path_1', 'path_2', 'path_3', 'path_4', 'path_5', 'no_of_ids']] = standardized_data[['hour_of_day','session_duration', 
                                                                                             'path_1', 'path_2', 'path_3', 
                                                                                             'path_4', 'path_5', 'no_of_ids']]
X_test.head()

Unnamed: 0,locale_1,locale_2,locale_3,locale_4,locale_5,locale_6,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,...,traffic_type_5,traffic_type_6,traffic_type_7,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids
0,0,0,0,0,0,1,1,0,0,0,...,1,0,0,2.675364,-0.478078,0.229254,-0.355613,-0.185178,-0.12278,-0.06404
1,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1.915716,-0.477827,0.229486,1.75074,-0.185178,-0.12278,0.947697
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,-0.215244,-0.477075,0.229254,-0.355613,-0.185178,-0.12278,-0.06404
3,0,0,1,0,0,0,0,1,0,0,...,0,0,0,3.088073,-0.477827,0.230181,1.751372,-0.185178,-0.12278,0.947697
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,-0.169204,-0.476322,0.229254,1.751793,-0.185178,-0.12278,0.947697


In [33]:
X_train, X_val, Y_train, Y_val = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)

It was found from various analysis that the random forest regressor with the following parameters worked best with the train set.

In [34]:
ren_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
ren_reg.fit(X_train, Y_train)

preds = ren_reg.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

31.750866722757586

In [88]:
test_preds = ren_reg.predict(X_test)
output_data = {'row_num': df_test['row_num'], 'hits': test_preds}
output = pd.DataFrame(data=output_data)
output.to_csv("result_ran_reg.csv", index=False)

The below codes are the other approches which were tried but could not dig deeper because of the time constraint:

In [83]:
gbrt=GradientBoostingRegressor(max_depth=2,warm_start=True)
min_val_error=float("inf")
error_going_up=0
for n_estimators in range(1, 120):
    gbrt.n_estimators=n_estimators
    gbrt.fit(X_train,Y_train)
    y_pred=gbrt.predict(X_val)
    val_error =mean_squared_error(Y_val,y_pred)
    if val_error<min_val_error:
        min_val_error=val_error
        error_going_up=0
    else:
        error_going_up+=1
        if error_going_up==5:
            break#early stopping

preds = gbrt.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

36.182602893237046

In [25]:
gbrt=GradientBoostingRegressor(max_depth=2,n_estimators=3,learning_rate=1.0)
gbrt.fit(X_train,Y_train)

preds = gbrt.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

32.298896061329074

In [75]:
tree_reg1=DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train,Y_train)

y2=Y_train-tree_reg1.predict(X_train)
tree_reg2=DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train,y2)

y3=y2-tree_reg2.predict(X_train)
tree_reg3=DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train,y3)

preds=sum(tree.predict(X_val) for tree in (tree_reg1,tree_reg2,tree_reg3))
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

32.29889606132975

In [27]:
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=1),
                            n_estimators =100, learning_rate =0.5
                           )
ada_reg.fit(X_train, Y_train)
preds = ada_reg.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

49.004766084151534

In [29]:
bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1)
bag_reg.fit(X_train, Y_train)

preds = bag_reg.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

32.6526487816792

In [86]:
lin_reg = LinearRegression()
lin_reg.fit(X_train,Y_train)

preds = lin_reg.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

33.74560852997539

In [31]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train,Y_train)

preds = tree_reg.predict(X_val)
mse = mean_squared_error(Y_val, preds)
rmse = np.sqrt(mse)
rmse

43.02587125582501