In [1]:
import pandas as pd
import random
import ml_metrics as metrics

In [2]:
# Read the three input tables from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
destinations = pd.read_csv("destinations.csv")

In [3]:
# Parse the timestamp column once, then expose its year and month as
# separate columns on `train`.
train["date_time"] = pd.to_datetime(train["date_time"])
for date_part in ("year", "month"):
    train[date_part] = getattr(train["date_time"].dt, date_part)

In [4]:
# Distinct user ids present in the training table.
unique_users = train["user_id"].unique()

In [9]:
# Draw a reproducible random subset of users and keep only their rows.
# A dedicated, seeded generator makes Restart & Run All select the same users
# every time without touching the global `random` state.
N_SAMPLE_USERS = 10000
SAMPLE_SEED = 0
user_rng = random.Random(SAMPLE_SEED)

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of available users.
sample_size = min(N_SAMPLE_USERS, len(unique_users))
sel_positions = sorted(user_rng.sample(range(len(unique_users)), sample_size))

sel_user_ids = [unique_users[pos] for pos in sel_positions]
sel_train = train[train.user_id.isin(sel_user_ids)]

In [6]:
# Time-based split of the sampled rows:
#   t1 -> everything in 2013 plus January-July 2014
#   t2 -> August 2014 onwards (within 2014)
in_2013 = sel_train.year == 2013
in_2014 = sel_train.year == 2014
before_august = sel_train.month < 8
august_onwards = sel_train.month >= 8

t1 = sel_train[in_2013 | (in_2014 & before_august)]
t2 = sel_train[in_2014 & august_onwards]

In [7]:
# Keep only the rows of t2 whose is_booking flag compares equal to True.
t2 = t2.loc[t2["is_booking"].eq(True)]

In [14]:
from sklearn.decomposition import PCA

# Compress the destination columns d1..d149 into three principal components
# and keep the destination id next to them.
latent_cols = ["d{0}".format(i) for i in range(1, 150)]

# Fix the seed so solvers that use randomness return the same components on
# every run.
pca = PCA(n_components=3, random_state=0)

# Build the component frame on the same index as `destinations` so the id
# column assigned below lines up row-for-row even if `destinations` does not
# carry a default RangeIndex.
dest_small = pd.DataFrame(
    pca.fit_transform(destinations[latent_cols]),
    index=destinations.index,
)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [15]:
def calc_fast_features(df, dest_features=None):
    """Build a feature frame from raw search rows.

    The result contains, in order:
      * month/day/hour/minute/dayofweek/quarter of ``date_time``;
      * every column of ``df`` except ``date_time``, ``srch_ci`` and
        ``srch_co`` (a carried-over column with the same name as a date part
        replaces that date part's values);
      * ``ci_*`` / ``co_*`` month, day, dayofweek and quarter of the check-in
        and check-out dates (NaN where the date could not be parsed);
      * ``stay_span``: check-out minus check-in, in hours (float);
      * the non-id columns of ``dest_features``, matched on
        ``srch_destination_id`` with a left join.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_time``, ``srch_ci``, ``srch_co`` and
        ``srch_destination_id``. It is not modified.
    dest_features : pandas.DataFrame, optional
        Per-destination features with a ``srch_destination_id`` column.
        Defaults to the notebook-level ``dest_small`` table.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``df``.
    """
    if dest_features is None:
        # Existing single-argument callers keep using the notebook's table.
        dest_features = dest_small

    # Parse into local Series rather than assigning back into ``df``: the
    # input is usually a slice of a larger frame, and writing to it mutates
    # the caller's data and triggers SettingWithCopyWarning.
    date_time = pd.to_datetime(df["date_time"])
    srch_ci = pd.to_datetime(df["srch_ci"], format="%Y-%m-%d", errors="coerce")
    srch_co = pd.to_datetime(df["srch_co"], format="%Y-%m-%d", errors="coerce")

    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(date_time.dt, prop)

    # Carry every non-date input column over unchanged.
    for col in df.columns:
        if col not in ("date_time", "srch_ci", "srch_co"):
            props[col] = df[col]

    for prop in ["month", "day", "dayofweek", "quarter"]:
        props["ci_{0}".format(prop)] = getattr(srch_ci.dt, prop)
        props["co_{0}".format(prop)] = getattr(srch_co.dt, prop)

    # ``.astype('timedelta64[h]')`` is rejected by pandas >= 2.0; derive the
    # hour count from total seconds instead. Both dates are day-resolution,
    # so the result is always a whole number of hours (NaN for missing dates).
    props["stay_span"] = (srch_co - srch_ci).dt.total_seconds() / 3600.0

    ret = pd.DataFrame(props)

    # DataFrame.join(on=...) matches the caller's column against the *index*
    # of the other frame. Index the destination table by its id so rows are
    # matched by destination id rather than by positional row number.
    dest_by_id = dest_features.set_index("srch_destination_id")
    return ret.join(dest_by_id, on="srch_destination_id", how="left", rsuffix="dest")

In [16]:
# Compute the feature table for the earlier split, t1.
df = calc_fast_features(df=t1)