In [20]:
import pandas as pd

In [None]:
destinations = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [None]:
train.head(5)

In [None]:
test_ids = set(test.user_id.unique())
train_ids = set(train.user_id.unique())
intersection_count = len(test_ids & train_ids)
intersection_count == len(test_ids)

In [None]:
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

In [None]:
import random

unique_users = train.user_id.unique()

sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000)) ]
sel_train = train[train.user_id.isin(sel_user_ids)]

In [None]:
t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

In [None]:
t2 = t2[t2.is_booking == True]

In [None]:
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)

In [None]:
predictions = [most_common_clusters for i in range(t2.shape[0])]

In [None]:
import ml_metrics as metrics
target = [[l] for l in t2["hotel_cluster"]]
metrics.mapk(target, predictions, k=5)

In [None]:
train.corr()["hotel_cluster"]

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [None]:
dest_small

In [None]:
def calc_fast_features(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    ret = pd.DataFrame(props)
    
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)

    return ret

df = calc_fast_features(t1)
df.fillna(-1, inplace=True)

In [None]:
df

In [None]:
predictors = [c for c in df.columns if c not in ["hotel_cluster"]]
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.05)
scores = cross_validation.cross_val_score(clf, df[predictors], df['hotel_cluster'], cv=3)
scores

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from itertools import chain

all_probs = []
unique_clusters = df["hotel_cluster"].unique()
for cluster in unique_clusters:
    df["target"] = 1
    df["target"][df["hotel_cluster"] != cluster] = 0
    predictors = [col for col in df if col not in ['hotel_cluster', "target"]]
    probs = []
    cv = KFold(len(df["target"]), n_folds=2)
    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
    for i, (tr, te) in enumerate(cv):
        clf.fit(df[predictors].iloc[tr], df["target"].iloc[tr])
        preds = clf.predict_proba(df[predictors].iloc[te])
        probs.append([p[1] for p in preds])
    full_probs = chain.from_iterable(probs)
    all_probs.append(list(full_probs))

prediction_frame = pd.DataFrame(all_probs).T
prediction_frame.columns = unique_clusters
def find_top_5(row):
    return list(row.nlargest(5).index)

preds_m = []
for index, row in prediction_frame.iterrows():
    preds_m.append(find_top_5(row))

metrics.mapk([[l] for l in t2["hotel_cluster"]], preds_m, k=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.044268287837247192

In [78]:
metrics.mapk([[l] for l in t2["hotel_cluster"]], preds_m, k=5)

0.044268287837247192

In [79]:
def make_key(items):
    return "_".join([str(i) for i in items])

match_cols = ["srch_destination_id"]
cluster_cols = match_cols + ['hotel_cluster']
groups = t1.groupby(cluster_cols)
top_clusters = {}
for name, group in groups:
    clicks = len(group.is_booking[group.is_booking == False])
    bookings = len(group.is_booking[group.is_booking == True])
    
    score = bookings + .3 * clicks
    
    clus_name = make_key(name[:len(match_cols)])
    if clus_name not in top_clusters:
        top_clusters[clus_name] = {}
    top_clusters[clus_name][name[-1]] = score

In [80]:
top_clusters

{'28996': {48: 1.3},
 '11547': {6: 1.3,
  8: 0.6,
  10: 0.6,
  15: 0.3,
  17: 0.3,
  27: 0.8999999999999999,
  32: 1.3,
  33: 0.6,
  47: 0.3,
  48: 0.3,
  50: 2.1,
  72: 0.3,
  75: 0.8999999999999999,
  80: 10.4,
  93: 0.6},
 '5989': {30: 0.3, 62: 0.3, 67: 1.6, 78: 1.9},
 '19399': {62: 0.3},
 '5980': {60: 1.3},
 '51264': {50: 2.6},
 '22261': {30: 0.3,
  43: 0.6,
  60: 0.6,
  61: 1.2,
  62: 0.6,
  78: 0.8999999999999999},
 '22260': {2: 0.3,
  8: 0.3,
  9: 1.9,
  15: 0.8999999999999999,
  18: 0.3,
  29: 0.6,
  30: 0.3,
  37: 0.3,
  50: 0.6,
  90: 1.6},
 '22265': {30: 0.6, 53: 0.3, 78: 0.3, 85: 1.3},
 '22266': {3: 0.8999999999999999,
  36: 0.3,
  44: 0.3,
  53: 0.3,
  60: 0.3,
  61: 1.5,
  78: 0.6,
  89: 0.3},
 '22268': {57: 0.3},
 '25061': {8: 0.6, 18: 3.8, 41: 1.3, 68: 4.2, 70: 5.4, 95: 2.9, 98: 1.9},
 '25067': {23: 0.3, 50: 2.2, 90: 1.3},
 '25064': {4: 0.3, 55: 0.3, 83: 0.8999999999999999},
 '25065': {14: 0.3,
  33: 0.3,
  39: 0.3,
  42: 0.3,
  48: 1.2,
  90: 0.6,
  91: 1.7999999999999

In [81]:
import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top

In [82]:
cluster_dict

{'25604': [48],
 '28996': [48],
 '11547': [80, 50, 32, 6, 75],
 '5989': [78, 67, 62, 30],
 '19399': [62],
 '5980': [60],
 '51264': [50],
 '22261': [61, 78, 43, 60, 62],
 '22260': [9, 90, 15, 50, 29],
 '22265': [85, 30, 78, 53],
 '22266': [61, 3, 78, 36, 44],
 '22268': [57],
 '25061': [70, 68, 18, 95, 98],
 '25067': [50, 90, 23],
 '25064': [83, 4, 55],
 '25065': [91, 94, 48, 90, 33],
 '43871': [57],
 '21091': [67, 35, 20],
 '46136': [44],
 '46133': [28, 39],
 '6798': [32, 43, 50, 7, 28],
 '6799': [29],
 '25289': [89],
 '25286': [23],
 '25284': [14, 51, 48, 5, 47],
 '25285': [72, 18, 91, 40, 11],
 '25282': [28, 7, 72, 42, 8],
 '32013': [35],
 '55837': [89, 44, 30],
 '32011': [57],
 '273': [89, 44, 93],
 '32017': [57],
 '16708': [84, 91],
 '16255': [12],
 '16704': [13, 7],
 '19979': [93],
 '12019': [34, 63, 65, 26, 78],
 '12018': [48],
 '22482': [89, 82, 85],
 '12015': [42, 48, 5, 31, 28],
 '12014': [91, 23, 15, 59, 55],
 '12017': [32, 91, 2, 94, 7],
 '12016': [42, 18, 91, 16, 48],
 '1201

In [83]:
preds = []
for index, row in t2.iterrows():
    key = make_key([row[m] for m in match_cols])
    if key in cluster_dict:
        preds.append(cluster_dict[key])
    else:
        preds.append([])

In [84]:
preds

[[63, 92, 58, 86, 44],
 [1, 79, 45, 88, 54],
 [1, 79, 45, 88, 54],
 [82, 85, 46, 6, 90],
 [41, 70, 21, 83, 97],
 [],
 [95, 68, 18, 98, 91],
 [42, 16, 28, 94, 77],
 [77, 18, 23, 50, 5],
 [71, 34, 77, 0, 48],
 [8, 78, 36, 58, 43],
 [59, 82, 68, 73, 9],
 [59, 85, 30, 2, 50],
 [59, 85, 30, 2, 50],
 [98, 95, 21, 68, 83],
 [91, 48, 59, 18, 16],
 [1, 79, 45, 88, 54],
 [39, 77, 6, 28, 50],
 [],
 [40, 48, 91],
 [77, 31, 17, 90, 15],
 [40, 48, 91],
 [40, 48, 91],
 [40, 48, 91],
 [68, 97, 99, 95, 18],
 [77, 91, 0, 51, 47],
 [48, 28, 37, 72, 13],
 [95, 10, 18, 91, 69],
 [34, 40, 91, 68, 77],
 [],
 [26, 91, 13, 28, 31],
 [26, 91, 13, 28, 31],
 [68, 55],
 [98, 56, 70, 41, 55],
 [91, 7, 77, 13, 28],
 [39, 28],
 [37, 33, 25, 42, 13],
 [95, 68, 21, 2, 69],
 [1, 79, 45, 88, 54],
 [37, 33, 25, 42, 13],
 [9, 55, 95, 21, 49],
 [9, 55, 95, 21, 49],
 [18, 99, 37, 21, 70],
 [59, 91, 33, 42, 16],
 [95, 68, 37, 55, 51],
 [76, 77, 91, 48, 14],
 [2, 95, 21, 68, 25],
 [39, 42],
 [18, 91, 2, 37, 4],
 [82, 29, 11, 7

In [85]:
metrics.mapk([[l] for l in t2["hotel_cluster"]], preds, k=5)

0.2373425517542814

In [86]:
match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']

groups = t1.groupby(match_cols)
    
def generate_exact_matches(row, match_cols):
    index = tuple([row[t] for t in match_cols])
    try:
        group = groups.get_group(index)
    except Exception:
        return []
    clus = list(set(group.hotel_cluster))
    return clus

exact_matches = []
for i in range(t2.shape[0]):
    exact_matches.append(generate_exact_matches(t2.iloc[i], match_cols))

In [103]:
def weigh(seq, weight):
    dict = {}
    for key in seq:
        dict[key] = weight
    return dict

def merge(dicts):
    result = []
    mergeDict = {}
    for dict in dicts:
        for key in dict.keys():
            if key in mergeDict.keys():
                mergeDict[key] += dict[key]
            else:
                mergeDict[key] = dict[key]
    #print mergeDict
    #print sorted(mergeDict, key = lambda x: mergeDict[x], reverse = True)
    for key in sorted(mergeDict, key = lambda x: mergeDict[x], reverse = True):
        #print key, mergeDict[key]
        result.append(key)
    return result

max_pair = None
max_score = 0
for i in range(0,20):
    print 'i', i
    for j in range(0,20):
        print 'j', j
        for k in range(0,20):
            print 'k', k
            for l in range(0,20):
                print 'l', l
                full_preds = [merge(tuple([weigh(exact_matches[p],i/10),weigh(preds[p], j/10),weigh(preds_m[p], k/10),weigh(most_common_clusters, l/10)]))[:5] for p in range(len(preds))]
                score = metrics.mapk([[l] for l in t2["hotel_cluster"]], full_preds, k=5)
                if max_score < score:
                    max_pair = (i,j,k,l)
                    max_score = score
print max_pair,max_score

i 0
j 0
k 0
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15
l 16
l 17
l 18
l 19
k 1
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15
l 16
l 17
l 18
l 19
k 2
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15
l 16
l 17
l 18
l 19
k 3
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15
l 16
l 17
l 18
l 19
k 4
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15
l 16
l 17
l 18
l 19
k 5
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15
l 16
l 17
l 18
l 19
k 6
l 0
l 1
l 2
l 3
l 4
l 5
l 6
l 7
l 8
l 9
l 10
l 11
l 12
l 13
l 14
l 15


KeyboardInterrupt: 