Repeat on the test dataset all the preprocessing steps that were applied to the training data, then generate predictions with the trained models.

In [2]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import xgboost as xgb

from fancyimpute import SoftImpute

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA 

from xgboost.sklearn import XGBClassifier

# Project directories. All three live under a single root so the notebook can
# be run on another machine by setting the EXPEDIA_BASE_DIR environment
# variable; when it is unset the original hard-coded location is used.
BASE_DIR = os.environ.get(
    'EXPEDIA_BASE_DIR',
    '/Users/himani/workspace/assignment/pervazive/expedia',
)
INPUT_DIR = os.path.join(BASE_DIR, 'input')    # cleaned CSVs and destinations.csv
MODELS_DIR = os.path.join(BASE_DIR, 'models')  # pickled models
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')  # prediction files
%matplotlib inline

In [3]:
# Locations of the input CSVs, all resolved relative to INPUT_DIR.
train_file, test_file, destinations_file = (
    os.path.join(INPUT_DIR, csv_name)
    for csv_name in ('train_clean1.csv', 'test_clean.csv', 'destinations.csv')
)

In [4]:
# Read the test data and apply the same preprocessing steps as on train.

test = pd.read_csv(test_file, parse_dates = ['date_time','srch_ci','srch_co'])

# Force srch_ci and srch_co to datetime; values that do not match the
# format become NaT instead of raising.
test['srch_ci'] = pd.to_datetime(test['srch_ci'],format='%Y-%m-%d', errors="coerce")
test['srch_co'] = pd.to_datetime(test['srch_co'],format='%Y-%m-%d', errors="coerce")

# Duration of stay in days (NaN where either date is missing).
test['duration'] = (test['srch_co'] - test['srch_ci']).dt.days
duration_mode = test['duration'].mode()[0]
print("Most frequent duration of stay in test is", duration_mode)

# Check-in month (NaN where srch_ci is missing).
test['month'] = test['srch_ci'].dt.month

# The raw check-in / check-out dates are no longer needed. date_time is kept
# because it is the fallback source for a missing month just below.
test = test.drop(['srch_co','srch_ci'], axis=1)

# Fill the gaps by assignment rather than `test[col].fillna(..., inplace=True)`:
# the in-place call operates on a temporary column object and does not write
# back to `test` when pandas Copy-on-Write is active.
test['duration'] = test['duration'].fillna(duration_mode).astype(int)
test['month'] = test['month'].fillna(test['date_time'].dt.month).astype(int)

# Jointly impute the three columns below with SoftImpute.
impute_cols = ['user_location_city','hotel_market','orig_destination_distance']
df2 = test.loc[:, impute_cols]
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
df2_matrix = df2.values

# SoftImpute is already imported at the top of the notebook.
# NOTE(review): newer fancyimpute releases reportedly expose `fit_transform`
# in place of `complete` -- confirm against the installed version.
df2_matrix_filled = SoftImpute().complete(df2_matrix)

# Write the completed columns back positionally (row i of the matrix is row i
# of `test`), so the result does not depend on the frame's index labels.
for position, column_name in enumerate(impute_cols):
    test[column_name] = df2_matrix_filled[:, position]

Most frequent duration of stay in test is 1.0
[SoftImpute] Max Singular Value of X_init = 51671740.464037
[SoftImpute] Iter 1: observed MAE=497.931547 rank=2
[SoftImpute] Iter 2: observed MAE=501.394849 rank=2
[SoftImpute] Iter 3: observed MAE=504.073970 rank=2
[SoftImpute] Iter 4: observed MAE=506.094780 rank=2
[SoftImpute] Iter 5: observed MAE=507.594727 rank=2
[SoftImpute] Iter 6: observed MAE=508.699851 rank=2
[SoftImpute] Iter 7: observed MAE=509.515133 rank=2
[SoftImpute] Iter 8: observed MAE=510.120486 rank=2
[SoftImpute] Iter 9: observed MAE=510.572392 rank=2
[SoftImpute] Iter 10: observed MAE=510.911707 rank=2
[SoftImpute] Iter 11: observed MAE=511.167107 rank=2
[SoftImpute] Iter 12: observed MAE=511.360370 rank=2
[SoftImpute] Iter 13: observed MAE=511.507241 rank=2
[SoftImpute] Iter 14: observed MAE=511.619003 rank=2
[SoftImpute] Iter 15: observed MAE=511.704230 rank=2
[SoftImpute] Iter 16: observed MAE=511.769349 rank=2
[SoftImpute] Iter 17: observed MAE=511.819165 rank=2
[S

In [5]:
destination = pd.read_csv(destinations_file)
print("Number of rows and cols in destination file -",destination.shape)

# Fit a PCA over all 149 latent destination columns (d1..d149). Only the
# fitted estimator is kept; the transformed matrix was never used, so `fit`
# replaces the original `fit_transform` whose result was discarded.
all_latent_cols = ["d{0}".format(i + 1) for i in range(149)]
pca = PCA(n_components=149)
pca.fit(destination[all_latent_cols])

# creating 20 features from destination file
# NOTE(review): this PCA is fitted on d1..d20 only, so 20 components over 20
# input columns is a rotation rather than a reduction of the 149 columns.
# Left unchanged because it has to match whatever the training pipeline did
# -- confirm against the train notebook before altering.
p = PCA(n_components=20, random_state = 3)
df = p.fit_transform(destination[["d{0}".format(i + 1) for i in range(20)]])
df = pd.DataFrame(df)
df["srch_destination_id"] = destination["srch_destination_id"]
# Column labels of the PCA frame: the 20 component columns followed by
# 'srch_destination_id' (used by the next cell to select the component columns).
col = df.columns.tolist()

# Append the PCA features to the test data on srch_destination_id using a
# left join, so test rows without a matching destination get NaN features.
new_test = pd.merge(test, df, on = 'srch_destination_id', how = 'left')


Number of rows and cols in destination file - (62106, 150)


In [6]:
# replacing missing values of destination file
# this will be improved by using collaborative filtering to find similar destinations

# col[:-1] are the PCA component columns (the last entry is srch_destination_id).
# Each column's gaps are filled with that column's mean. The result is assigned
# back instead of calling `fillna(..., inplace=True)` on `new_test[i]`, because
# the in-place call on a selected column does not update `new_test` when
# pandas Copy-on-Write is active.
for i in col[:-1]:
    new_test[i] = new_test[i].fillna(new_test[i].mean())

In [7]:
# Persist the fully preprocessed test set next to the other input files.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column that shows up again when the file
# is read back -- confirm this matches how the training file was saved.
test_final_path = os.path.join(INPUT_DIR, 'test_final.csv')
new_test.to_csv(test_final_path)

# Predictions on Models

In [None]:
# Load the saved random forest model.
# NOTE(review): rf_models_file and xg_models_file are not defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel
# unless they are defined elsewhere -- confirm and define them.
# joblib.load unpickles the file; only load model files from a trusted source.

rfc = joblib.load(rf_models_file)

# Load the saved xgboost model.

clf = joblib.load(xg_models_file)

In [None]:
# predicting top clusters
# same function can be used to predict random forests and xgboosts top clusters

def _top_k_labels(predicted_prob, classes=None, k=5):
    """Return, for each row of probabilities, the k most probable labels, best first.

    Parameters
    ----------
    predicted_prob : 2-D array-like, one row per sample and one column per class.
    classes : optional 1-D array-like giving the label of each probability
        column. When None, the column indices themselves are returned.
    k : maximum number of labels per row.
    """
    prob = np.asarray(predicted_prob)
    # argsort is ascending, so reverse each row to get the most probable first.
    top_idx = np.argsort(prob, axis=1)[:, ::-1][:, :k]
    if classes is None:
        return top_idx
    return np.asarray(classes)[top_idx]


def predict(model_file='model4.pkl', output_file='xgb_5recos.csv', nrows=None,
            batch_size=10000, top_k=5):
    """Score test_final.csv with a saved model and write the top clusters per row.

    Parameters
    ----------
    model_file : file name of the pickled model inside MODELS_DIR.
    output_file : file name of the CSV written to OUTPUT_DIR.
    nrows : number of test rows to read. When None, a module-level NROWS is
        used if one is defined; otherwise the whole file is read.
    batch_size : number of rows passed to predict_proba at a time.
    top_k : number of clusters written per row.

    The output CSV has an 'id' column (the row position) and a 'hotel_cluster'
    column holding the space-separated labels, most probable first.
    Returns None.
    """
    if nrows is None:
        # The original body read a global NROWS that is not defined in the
        # visible notebook; honour it if it exists, else read every row.
        nrows = globals().get('NROWS')

    print("Loading model")
    # joblib.load unpickles the file: only load models from a trusted source.
    clf = joblib.load(os.path.join(MODELS_DIR, model_file))
    print("Model loaded")
    print("Loading test data")
    new_test = pd.read_csv(os.path.join(INPUT_DIR, 'test_final.csv'), nrows=nrows)
    print("Test data loaded")
    # Every column except the identifiers and the timestamp goes to the model.
    # NOTE(review): if test_final.csv was saved with its index, an
    # 'Unnamed: 0' column is included here as well -- confirm the model
    # was trained with the same set of columns.
    test_predictors = [c for c in new_test.columns if c not in ['id','user_id','date_time']]
    new_test = new_test[test_predictors]

    print("Begin prediction")
    n = new_test.shape[0]
    # predict_proba columns follow clf.classes_, so map column positions back
    # to the actual labels (falls back to positions if the attribute is absent).
    classes = getattr(clf, 'classes_', None)
    top_5 = []
    for i in range(0, n, batch_size):
        end = min(i + batch_size, n)
        t = new_test.iloc[i:end]
        predicted_prob = clf.predict_proba(t)
        labels = _top_k_labels(predicted_prob, classes, top_k)
        top_5.extend(' '.join(str(x) for x in row) for row in labels.tolist())
        print("Completed {} rows".format(end))

    xgb_5recos_df = pd.DataFrame(top_5, columns=['hotel_cluster'])
    xgb_5recos_df.to_csv(os.path.join(OUTPUT_DIR, output_file), index_label='id')