In [1]:
import pandas as pd
from pandas.io import sql
import MySQLdb
import numpy as notebookp
import sklearn.ensemble
import math
import pprint
import time

In [7]:
DEFAULT_COLS = [
#                 'prop_log_historical_price',
                'prop_location_score1',
                'prop_location_score2',
                'loghistp_by_mean',
#                 'srch_room_count',
                'avg_by_price_by_mean',
#                 'star_by_price',
#                 'rev_by_price_by_mean',
                'price_by_med',
#                 'srch_booking_window',
                'ploc_score2_by_mean',
                'book_per_pcnt_by_mean',
                'prop_cnt',
                'promo_per_procnt',
                'train_price_avg',
#                 'random_bool',
                'avg_by_price',
#                 'srch_query_affinity_score',
                'ploc_score1_by_mean',
#                 'srch_length_of_stay',
#                 'visitor_hist_adr_usd',
#                 'price_usd',
                'promo_perprocnt_by_mean',
                'book_per_pcnt',
#                 'srch_children_count',
                'rev_by_price',
#                 'promotion_cnt',
#                 'srch_adults_count',
                'star_by_price_by_mean',
                'click_nobook_per_pcnt',
#                 'prop_review_score',
                'click_nobper_pcnt_by_mean']

def trainer(train_df, col_list = None, model = None, train_loc1 = 3499709, train_loc2 = 4499607, cv_loc1 = 4499608, cv_loc2 = 4958347, print_factors = True, return_model = False, target= 'click_bool'):
    """
    >>trainer(train_df, col_list, model = None, train_loc1 = 7000014, train_loc2 = 9000007, 
              cv_loc1 = 9000008, cv_loc2 = 9917530, print_factors = True)
    
    Given pd.DataFrame of Expedia Personalized Search training data, 
    [list of columns in DF to train on], and optionally a SciKitLearn model.  Fit the model to classify rows
    based on "click_bool" parameter (train on the rows train_loc1 to train_loc2 inclusive).  Calculate the NDCG 
    on a validation sample (rows: cv_loc1 to cv_loc2).
    Optionally print an ordered list of the column names and their "feature_importances" with print_factors.
    Optionally return_model, e.g. for use with Test data.
    """
    
    if col_list is None:
        col_list = DEFAULT_COLS


    #exclude outlying prices for training.
    train_df = train_df.loc[(train_df['price_usd'] <= 2000.0) * (train_df['price_usd'] != 0.0)]
    
    if model is None:
        model = sklearn.ensemble.RandomForestClassifier(n_estimators = 100, min_samples_split = 1000)
    
#     model.fit(train_df.loc[train_loc1:train_loc2, col_list], train_df.loc[train_loc1:train_loc2, "click_bool"])
    model.fit(train_df.loc[train_loc1:train_loc2, col_list], train_df.loc[train_loc1:train_loc2, target])
    
    feature_scores_pairs = [[model.feature_importances_[i], col_list[i]] for i in range(len(col_list))]
    
    if hasattr(model, 'predict_proba'):
        crossval_pred_arr = model.predict_proba(train_df.loc[cv_loc1:cv_loc2, col_list])[:, 1]
    else:
        crossval_pred_arr = model.predict(train_df.loc[cv_loc1:cv_loc2, col_list])

    ndcg = ndcg_calc(train_df.loc[cv_loc1:cv_loc2], crossval_pred_arr)
    
    if print_factors:
        print ("NDCG:", ndcg)
        print ("Feature Importances:")
        pprint.pprint(sorted(feature_scores_pairs, reverse = True))
        print (model)
    
    if return_model:
        return model
    else:
        return ndcg


def ndcg_calc(train_df, pred_scores):
    """
    >>ndcg_calc(train_df, pred_scores)
       train_df: pd.DataFrame with Expedia Columns: 'srch_id', 'booking_bool', 'click_bool'
       pred_scores: np.Array like vector of scores with length = num. rows in train_df
       
    Calculate Normalized Discounted Cumulative Gain for a dataset is ranked with pred_scores (higher score = higher rank).
    If 'booking_bool' == 1 then that result gets 5 points.  If 'click_bool' == 1 then that result gets 1 point (except:
    'booking_bool' = 1 implies 'click_bool' = 1, so only award 5 points total).  
    
    NDCG = DCG / IDCG
    DCG = Sum( (2 ** points - 1) / log2(rank_in_results + 1) )
    IDCG = Maximum possible DCG given the set of bookings/clicks in the training sample.
    
    """
    eval_df = train_df[['srch_id', 'booking_bool', 'click_bool']]
    eval_df['score'] = pred_scores

    logger = lambda x: math.log(x + 1, 2)
    eval_df['log_rank'] = eval_df.groupby(by = 'srch_id')['score'].rank(ascending = False).map(logger)

    book_dcg = (eval_df['booking_bool'] * 31.0 / eval_df['log_rank']).sum() #where 2 ** 5 - 1.0 = 31.0
    book_idcg = (31.0 * eval_df['booking_bool']).sum()
    
    click_dcg = (eval_df['click_bool'] * (eval_df['booking_bool'] == 0) / eval_df['log_rank']).sum()
    
    # Max number of clicks in training set is 30.
    # Calculate the 31 different contributions to IDCG that 0 to 30 clicks have
    # and put in dict {num of click: IDCG value}.
    disc = [1.0 / math.log(i + 1, 2) if i != 0 else 0 for i in range(31)]
    disc_dict = { i: np.array(disc).cumsum()[i] for i in range(31)}
    
    # Map the number of clicks to its IDCG and subtract off any clicks due to bookings
    # since these were accounted for in book_idcg.
    click_idcg = (eval_df.groupby(by = 'srch_id')['click_bool'].sum().map(disc_dict) - eval_df.groupby(by = 'srch_id')['booking_bool'].sum()).sum()

    return (book_dcg + click_dcg) / (book_idcg + click_idcg)


def df_from_query(row_start = 3499709, row_end = 4958347, training = True, srch_start = None, srch_end = None):
    """
    >>df_from_query(row_start = 7000014, row_end = 9917530, training = True, srch_start = None, srch_end = None)
      row_start,row_end: Row number range (inclusive) to request from MySQL db (faster than srch_start,srch_end)
      srch_start,srch_end: Search ID range (inclusive) to request (slower than by row_start,row_end)
      training: BOOL, to pull from Training data set to True, set to False for Test data.
    
    Query local MySQL database, (see dbcon, below for permissions) to build a pd.DataFrame for training/testing.
    
    Several columns depend on summary statistics about Properties in the PropFactors/PropFactors7M tables in the
    database.  The "7M" suffix denotes summary statistics from the first 7000013 rows which for training
    are used with the latter ~3M rows. (This is to have these latter rows more closely resemble the test set, since some 
    properties only appear a few times so the "booking_cnt" statistic is an overly strong signal if it includes 
    information from the row that is being used for training.)  However for building the Test Set predictions, it
    is better to use statistics from the entire training set.
    Since some properties will appear in the training/test sets but not among the summary sets, columns that depend
    on these statistics are set to zero.
    Can select data by row_num or srch_id (row_num is the primary key so is faster, but it may be preferrable to 
    select by srch_id where splitting searches between chunks could cause problems, like when compiling test data).
    The presence of srch_start and srch_end overrides row_start/row_end.
    
    
    """
    
    dbcon = MySQLdb.connect('localhost', 'root', 'root', 'DMT')
    #print('Run Ok')
    # MySQL server, user, password, and database

    main_cols = "SELECT row_num, srch_id, s.prop_id, price_usd, prop_location_score1, prop_location_score2, prop_log_historical_price, prop_review_score, random_bool, srch_adults_count, srch_booking_window, srch_children_count, srch_length_of_stay, srch_query_affinity_score, srch_room_count, visitor_hist_adr_usd, (prop_starrating / price_usd) AS star_by_price, (prop_review_score / price_usd) AS rev_by_price"

    agg_cols = ", prop_cnt, promotion_cnt, train_price_avg, (booking_cnt / prop_cnt) AS book_per_pcnt, (promotion_flag / promotion_cnt) AS promo_per_procnt, (train_price_avg / price_usd) AS avg_by_price, ((click_cnt - booking_cnt) / prop_cnt) AS click_nobook_per_pcnt"

    agg_missing = ", 0 AS prop_cnt, 0 AS promotion_cnt, 0 AS train_price_avg, 0 AS book_per_pcnt, 0 AS promo_per_procnt, 0 AS avg_by_price, 0 AS click_nobook_per_pcnt"

    if training:
        from_tables1 = ", booking_bool, click_bool FROM TrainSearch AS s, PropFactors3M AS p"
        from_tables2 = ", booking_bool, click_bool FROM TrainSearch AS s"
        missing_from_table = "PropFactors3M"
    else:
        from_tables1 = " FROM TestSearch AS s, PropFactors AS p"
        from_tables2 = " FROM TestSearch AS s"
        missing_from_table = "PropFactors"

    if (srch_start is not None) or (srch_end is not None):
        if (srch_start is None):
            raise Exception("Expected selection pair (srch_start, srch_end) OR (row_start, row_end). Got: srch_start = None")
        elif (srch_end is None):
            raise Exception("Expected selection pair (srch_start, srch_end) OR (row_start, row_end). Got: srch_end = None")
        else: 
            where_str = " AND srch_id >= " + str(srch_start) + " AND srch_id <= " + str(srch_end) + ";"
    else:
        where_str = " AND row_num >= " + str(row_start) + " AND row_num <= "+ str(row_end) + ";"

    train_dfmost = pd.read_sql(main_cols + agg_cols + from_tables1 + " WHERE p.prop_id = s.prop_id" + where_str, con = dbcon, index_col = 'row_num')
    # DataFrame.to_sql(name, con, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None, method=None)
    train_dfpropmissing = pd.read_sql(main_cols + agg_missing + from_tables2 + " WHERE prop_id NOT IN (SELECT prop_id FROM " + missing_from_table + ")" + where_str, con = dbcon, index_col = 'row_num')
    
    dbcon.close()
    
    #display(train_dfmost.head(3))
    #print(train_dfmost.shape)
    #display(train_dfpropmissing.head(3))
    #print(train_dfpropmissing.shape)

    train_df = pd.concat([train_dfmost, train_dfpropmissing])
    train_df.sort_values(by='row_num',inplace = True)
    
    
    train_df.fillna(value = 0, inplace = True)

    return train_df


In [8]:
def agg_by_srch(train_df):
    """
    >>agg_by_srch(train_df)
    
    Given a training/testing dataset that includes the columns listed in cols_in below, calculate
    new columns that normalize these values among the search in which they appear by dividing by
    the mean (or median for 'price_usd' since it has higher variance).  i.e. given prop_location_score "j"
    for search "i" add column: prop_location_score[i][j] / mean(prop_location_score[i])
    """
    
    cols_in = ['prop_location_score1', 
               'prop_location_score2', 
               'book_per_pcnt', 
               'avg_by_price', 
               'rev_by_price', 
               'star_by_price', 
               'promo_per_procnt',
               'prop_log_historical_price', 
               'click_nobook_per_pcnt']

    cols_out = ['srch_ploc_score1_mean', 
                'srch_ploc_score2_mean', 
                'srch_book_per_pcnt_mean', 
                'srch_avg_by_price_mean', 
                'srch_rev_by_price_mean', 
                'srch_star_by_price_mean', 
                'srch_promo_per_procnt_mean', 
                'srch_prop_log_historical_price_mean', 
                'click_nobook_per_pcnt_mean']

    srch_df = train_df[['srch_id'] + cols_in].groupby(by='srch_id').mean()
    srch_df.columns = cols_out

    srch_df['srch_price_med'] = train_df[['srch_id','price_usd']].groupby(by = 'srch_id').median()
    srch_df['srch_id'] = srch_df.index
    
    cols_inp = list(cols_in)
    cols_outp = list(cols_out)
    cols_inp.append('price_usd')
    cols_outp.append('srch_price_med')

    cols_agg = ['ploc_score1_by_mean', 
                'ploc_score2_by_mean', 
                'book_per_pcnt_by_mean', 
                'avg_by_price_by_mean', 
                'rev_by_price_by_mean', 
                'star_by_price_by_mean', 
                'promo_perprocnt_by_mean', 
                'loghistp_by_mean', 
                'click_nobper_pcnt_by_mean',
                'price_by_med']

    for i in range(len(cols_inp)):
        train_df[cols_agg[i]] = train_df[cols_inp[i]] / train_df.join(srch_df[['srch_id', cols_outp[i]]], on='srch_id', rsuffix = '_agg')[cols_outp[i]]
    
    train_df.fillna(value = 0, inplace = True)
    train_df.loc[train_df.price_by_med == np.inf, 'price_by_med'] = 0

    return train_df

def test_pred_sorted(test_df, model, cols = None, regress_model = False):
    """
    >>test_pred_sorted(test_df, model, cols, regress_model = False)
        test_df: pd.DataFrame that contains columns listed in cols.
        model: SciKitLearn model used for ranking.
        cols: [list of strs] columns in test_df to send to model.
        regress: BOOL, Regressor Model used, as opposed to Classifier.
    
    Return a pd.DataFrame that contains 'srch_id', and 'property_id' columns such that
    the properties are listed in descending order of their model score within each search.
    
    To save output use: test_out_df.to_csv("FILE OUT", index = False, cols = ['srch_id', 'prop_id'], header = ['SearchId','PropertyId'])
    """
    if cols is None:
        cols = DEFAULT_COLS

    if regress_model:
        scores = model.predict(test_df[cols])
    else:
        scores = model.predict_proba(test_df[cols])[:, 1]
    test_df['sort_score'] = scores
     
    return test_df[['srch_id', 'prop_id', 'sort_score']].sort_values(by=['srch_id', 'sort_score'], ascending = [True, False])

In [9]:
train_df = df_from_query()

In [10]:
train_df['Relevance_target']= train_df['click_bool']*1
train_df['Relevance_target']+= train_df['booking_bool']*4

In [11]:
agg_by_srch(train_df)  

NameError: name 'np' is not defined

In [12]:
train_df.shape

(1458639, 37)

In [None]:
### Pipeline does not work yet...

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Transformer for numerical data
numerical_transformer = MinMaxScaler()

# Combine transformers into preprocessor
preprocessor= ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, Selected_features),
    ])

# Define model:
model1_RF = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model1_RF)
                             ])


In [25]:
begin= time.time()
rfc_model = trainer(train_df, return_model = True, train_loc2 = 4958347, target='click_bool')
end= time.time()

time= end-begin
time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['score'] = pred_scores


NDCG: 0.6048998107298655
Feature Importances:
[[0.18489516422073168, 'ploc_score2_by_mean'],
 [0.10658968104406165, 'avg_by_price_by_mean'],
 [0.09904872148402624, 'star_by_price_by_mean'],
 [0.08662530767329052, 'book_per_pcnt_by_mean'],
 [0.0818562210079774, 'book_per_pcnt'],
 [0.07864837802518011, 'prop_location_score2'],
 [0.04116854013287702, 'avg_by_price'],
 [0.03946375963190873, 'ploc_score1_by_mean'],
 [0.03855456694804317, 'loghistp_by_mean'],
 [0.0383121463709553, 'price_by_med'],
 [0.033125150299384495, 'train_price_avg'],
 [0.032488472487729236, 'prop_location_score1'],
 [0.030346941231182995, 'click_nobook_per_pcnt'],
 [0.0289076348365934, 'click_nobper_pcnt_by_mean'],
 [0.025589044797382447, 'prop_cnt'],
 [0.023307851589574668, 'rev_by_price'],
 [0.017733273101582465, 'promo_perprocnt_by_mean'],
 [0.013339145117518485, 'promo_per_procnt']]
RandomForestClassifier(min_samples_split=1000)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['log_rank'] = eval_df.groupby(by = 'srch_id')['score'].rank(ascending = False).map(logger)


1461.812427520752

In [None]:
model= sklearn.ensemble.RandomForestRegressor(n_estimators = 100, min_samples_split = 1000)
rfc_model2 = trainer(train_df, return_model = True, train_loc2 = 4958347, model=rfr_model,  target='Relevance_target')



In [None]:
test_df = df_from_query(training = False)

In [None]:
agg_by_srch(test_df) 

In [None]:
test_df.shape
test_df.describe

In [None]:
pred_df = test_pred_sorted(test_df, rfc_model)
pred_df

In [None]:
pred2_df = test_pred_sorted(test_df, rfc_model2)
pred2_df

In [19]:
FullTrain_df = df_from_query(row_start=1)
agg_by_srch(FullTrain_df) 

Unnamed: 0_level_0,srch_id,prop_id,price_usd,prop_location_score1,prop_location_score2,prop_log_historical_price,prop_review_score,random_bool,srch_adults_count,srch_booking_window,...,rev_by_price,prop_cnt,promotion_cnt,train_price_avg,book_per_pcnt,promo_per_procnt,avg_by_price,click_nobook_per_pcnt,booking_bool,click_bool
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,893,104.77,2.83,0.0438,4.95,3.5,1,4,0,...,0.033407,436,44,118.147729,0.0206,0.0,1.127687,0.0115,0,0
2,1,10404,170.74,2.2,0.0149,5.03,4.0,1,4,0,...,0.023427,412,22,158.150146,0.0073,0.0,0.926263,0.0097,0,0
3,1,21315,179.8,2.2,0.0245,4.92,4.5,1,4,0,...,0.025028,391,0,173.062839,0.0051,,0.96253,0.0077,0,0


(4951924, 26)


Unnamed: 0_level_0,srch_id,prop_id,price_usd,prop_location_score1,prop_location_score2,prop_log_historical_price,prop_review_score,random_bool,srch_adults_count,srch_booking_window,...,rev_by_price,prop_cnt,promotion_cnt,train_price_avg,book_per_pcnt,promo_per_procnt,avg_by_price,click_nobook_per_pcnt,booking_bool,click_bool
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3500619,235014,118276,69.71,2.89,0.0,3.93,0.0,0,2,149,...,0.0,0,0,0,0,0,0,0,0,0
3500726,235022,94415,115.58,0.0,0.0,0.0,0.0,0,2,41,...,0.0,0,0,0,0,0,0,0,0,0
3501055,235047,8689,78.88,0.69,0.0,4.78,4.5,0,2,7,...,0.057049,0,0,0,0,0,0,0,0,0


(6423, 26)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['score'] = pred_scores


NDCG: 0.515711736670059
Feature Importances:
[[0.1447112753519211, 'book_per_pcnt'],
 [0.14232395341234902, 'book_per_pcnt_by_mean'],
 [0.10073920788630637, 'ploc_score2_by_mean'],
 [0.08807291056297208, 'click_nobook_per_pcnt'],
 [0.06902802676812146, 'avg_by_price_by_mean'],
 [0.05804895685394906, 'click_nobper_pcnt_by_mean'],
 [0.05007554213230087, 'prop_location_score2'],
 [0.0473205511176717, 'star_by_price_by_mean'],
 [0.03267736103354789, 'random_bool'],
 [0.032066093860010754, 'prop_cnt'],
 [0.02816274254648815, 'rev_by_price_by_mean'],
 [0.021518937024200095, 'avg_by_price'],
 [0.019278247683824676, 'loghistp_by_mean'],
 [0.018257169832826745, 'ploc_score1_by_mean'],
 [0.017596390619145145, 'price_by_med'],
 [0.01589746973568862, 'prop_location_score1'],
 [0.012999468552225321, 'train_price_avg'],
 [0.012432481163112607, 'price_usd'],
 [0.011638870719641052, 'star_by_price'],
 [0.011109028129089715, 'rev_by_price'],
 [0.009118500370403567, 'promo_perprocnt_by_mean'],
 [0.00823

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['log_rank'] = eval_df.groupby(by = 'srch_id')['score'].rank(ascending = False).map(logger)


In [None]:
full_model = trainer(FullTrain_df, return_model = True, train_loc1 = 1, target='click_bool')

model= sklearn.ensemble.RandomForestRegressor(n_estimators = 100, min_samples_split = 1000)
full_model2 = trainer(FullTrain_df, return_model = True, train_loc1 = 1, model=model, target='Relevance_target')

In [None]:
pred_df = test_pred_sorted(FullTest_df, full_model)

In [None]:
pred2_df = test_pred_sorted(FullTest_df, full_model2)

## Appendix

In [23]:
pred_df

Unnamed: 0_level_0,srch_id,prop_id,sort_score
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,1,28181,0.156558
10,1,54937,0.136544
24,1,99484,0.133501
13,1,61934,0.088148
2,1,5543,0.077118
...,...,...,...
4958337,332734,40644,0.012349
4958332,332734,33046,0.005262
4958334,332734,34904,0.004785
4958346,332734,68553,0.001502


In [23]:
pred2_df

Unnamed: 0_level_0,srch_id,prop_id,sort_score
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,1,28181,0.156558
10,1,54937,0.136544
24,1,99484,0.133501
13,1,61934,0.088148
2,1,5543,0.077118
...,...,...,...
4958337,332734,40644,0.012349
4958332,332734,33046,0.005262
4958334,332734,34904,0.004785
4958346,332734,68553,0.001502


In [20]:
FullTest_df = df_from_query(training = False, row_start=1)
agg_by_srch(FullTest_df) 

Unnamed: 0_level_0,srch_id,prop_id,price_usd,prop_location_score1,prop_location_score2,prop_log_historical_price,prop_review_score,random_bool,srch_adults_count,srch_booking_window,...,visitor_hist_adr_usd,star_by_price,rev_by_price,prop_cnt,promotion_cnt,train_price_avg,book_per_pcnt,promo_per_procnt,avg_by_price,click_nobook_per_pcnt
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,3180,119.0,2.94,0.0691,5.03,4.5,0,2,10,...,0.0,0.02521,0.037815,186,12,125.140161,0.043,0.0,1.051598,0.0
2,1,5543,118.0,2.64,0.0843,4.93,4.5,0,2,10,...,0.0,0.025424,0.038136,244,7,115.981148,0.0533,0.0,0.982891,0.0082
3,1,14142,49.0,2.71,0.0556,4.16,3.5,0,2,10,...,0.0,0.040816,0.071429,149,6,52.375235,0.0134,0.0,1.068882,0.0134


(4944500, 24)


Unnamed: 0_level_0,srch_id,prop_id,price_usd,prop_location_score1,prop_location_score2,prop_log_historical_price,prop_review_score,random_bool,srch_adults_count,srch_booking_window,...,visitor_hist_adr_usd,star_by_price,rev_by_price,prop_cnt,promotion_cnt,train_price_avg,book_per_pcnt,promo_per_procnt,avg_by_price,click_nobook_per_pcnt
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1072,70,48175,103.76,0.69,0.0,4.68,5.0,0,1,27,...,0.0,0.009638,0.048188,0,0,0,0,0,0,0
1077,70,77336,156.39,0.0,0.0,4.69,0.0,0,1,27,...,0.0,0.025577,0.0,0,0,0,0,0,0,0
1078,70,77520,209.02,0.0,0.0,0.0,0.0,0,1,27,...,0.0,0.0,0.0,0,0,0,0,0,0,0


(13847, 24)


Unnamed: 0_level_0,srch_id,prop_id,price_usd,prop_location_score1,prop_location_score2,prop_log_historical_price,prop_review_score,random_bool,srch_adults_count,srch_booking_window,...,ploc_score1_by_mean,ploc_score2_by_mean,book_per_pcnt_by_mean,avg_by_price_by_mean,rev_by_price_by_mean,star_by_price_by_mean,promo_perprocnt_by_mean,loghistp_by_mean,click_nobper_pcnt_by_mean,price_by_med
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,3180,119.0,2.94,0.0691,5.03,4.5,0,2,10,...,1.104977,0.551112,1.857590,1.040978,0.807452,0.815330,0.0,1.084293,0.000000,1.414141
2,1,5543,118.0,2.64,0.0843,4.93,4.5,0,2,10,...,0.992224,0.672341,2.302547,0.972965,0.814295,0.822239,0.0,1.062737,1.561392,1.402258
3,1,14142,49.0,2.71,0.0556,4.16,3.5,0,2,10,...,1.018533,0.443442,0.578877,1.058088,1.525188,1.320058,0.0,0.896752,2.551543,0.582294
4,1,22393,143.0,2.40,0.0561,5.03,4.5,0,2,10,...,0.902022,0.447430,0.596157,0.900182,0.671936,0.678491,0.0,1.084293,0.000000,1.699346
5,1,24194,79.0,2.94,0.2090,4.72,4.5,0,2,10,...,1.104977,1.666896,0.630717,1.116166,1.216289,1.228155,0.0,1.017468,0.933027,0.938800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958343,332734,60220,101.0,1.61,0.0004,4.63,5.0,1,4,127,...,0.897055,0.244898,1.604512,1.102845,1.316544,1.126350,0.0,0.990627,2.352000,0.918182
4958344,332734,65642,119.0,2.40,0.0028,5.42,4.5,1,4,127,...,1.337225,1.714286,0.385847,1.453056,1.005663,1.274637,0.0,1.159654,0.634667,1.081818
4958345,332734,68168,89.0,1.79,0.0001,4.25,3.5,1,4,127,...,0.997347,0.061224,0.000000,0.878487,1.045839,0.852145,0.0,0.909322,0.000000,0.809091
4958346,332734,68553,105.0,1.10,0.0001,4.29,3.5,1,4,127,...,0.612895,0.061224,0.000000,0.838384,0.886473,0.722295,0.0,0.917881,0.000000,0.954545
