# AirBnB BenchMark Code
https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

In [None]:
# Load the training and test user tables from disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../airbnb_data/train_users_2.csv",
        "../airbnb_data/test_users.csv",
    )
)

In [None]:
# Show the (rows, columns) shape of the freshly loaded training table.
train.shape

In [None]:
# Stack train and test vertically so that feature engineering is applied to
# both at once. The test frame gets a NaN placeholder for the target column
# first, so the two frames share the same columns.
test["country_destination"] = float("nan")
whole = pd.concat(objs=[train, test], axis="index")

In [None]:
# Preview the first rows of the combined train + test frame.
whole.head()

In [None]:
# Feature engineering, column by column.

# date_account_created is sliced as a "YYYY-MM-DD" string into year, month
# and day features. All three parts are cast to int so that the year is
# numeric like the month and day instead of being left as a string.
whole["Year_account_created"] = whole["date_account_created"].str[:4].astype(int)
whole["Month_account_created"] = whole["date_account_created"].str[5:7].astype(int)
whole["Day_account_created"] = whole["date_account_created"].str[8:].astype(int)

# timestamp_first_active is converted to a digit string and sliced into
# year, month, day and hour features. The column itself stays a string
# because the train/test split further down compares it with a string cutoff.
whole["timestamp_first_active"] = whole["timestamp_first_active"].apply(str)
whole["Year_first_active"] = whole["timestamp_first_active"].str[:4].astype(int)
whole["Month_first_active"] = whole["timestamp_first_active"].str[4:6].astype(int)
whole["Day_first_active"] = whole["timestamp_first_active"].str[6:8].astype(int)
whole["Time_first_active"] = whole["timestamp_first_active"].str[8:10].astype(int)

In [None]:
# gender, signup_method and language through first_browser are categorical,
# so replace each of them with one-hot indicator columns.
category_columns = [
    "gender",
    "signup_method",
    "language",
    "affiliate_channel",
    "affiliate_provider",
    "first_affiliate_tracked",
    "signup_app",
    "first_device_type",
    "first_browser",
]
whole = pd.get_dummies(data=whole, columns=category_columns)

In [None]:
# Fill missing ages with the mean age computed over the combined frame.
whole["age"] = whole["age"].fillna(value=whole["age"].mean())

In [None]:
# Preview the frame after date splitting, one-hot encoding and age imputation.
whole.head()

In [None]:
# Split the combined frame back into train and test. The stringified
# timestamp_first_active is compared lexicographically with a fixed cutoff.
# NOTE(review): the cutoff is presumably the latest timestamp in the training
# file -- confirm against the data before reusing this on another dataset.
train = whole.loc[whole["timestamp_first_active"] <= "20140630235824"]
test = whole.loc[whole["timestamp_first_active"] > "20140630235824"]
# Keep the test user ids for the submission file.
test_id = test["id"]

In [None]:
# Build the model inputs for training: drop the id, the raw date/timestamp
# columns and the target from the features, and keep the target separately.
X_train = train.drop(
    labels=[
        "id",
        "date_account_created",
        "timestamp_first_active",
        "date_first_booking",
        "country_destination",
    ],
    axis="columns",
)
Y_train = train["country_destination"]

In [None]:
# Build the test inputs by dropping the same columns as for the training set.
X_test = test.drop(
    labels=[
        "id",
        "date_account_created",
        "timestamp_first_active",
        "date_first_booking",
        "country_destination",
    ],
    axis="columns",
)

In [None]:
# Show the shape of the training split after feature engineering.
train.shape

Countryの辞書を作成

In [None]:
# Give every distinct destination label an integer code. The codes follow
# the iteration order of the set, which is arbitrary.
country_unique = list(set(Y_train))
country_dict = {country: code for code, country in enumerate(country_unique)}
country_dict

In [None]:
# Encode the training labels as integer codes and show the first ten.
Y_train_map = Y_train.map(country_dict)
Y_train_map.head(10)

# NDCGの関数を定義する（正解は1ユーザーにつき1カ国なのでIDCG=1となり、DCG@5がそのままNDCG@5になる）

In [None]:
# Sample values for the DCG helper: a true class index and an array of
# predicted class indices.
y_true = 3
y_pred = np.array([5,3,2,1,0])

In [None]:
# Sorted array of every integer class code; used to turn argsort positions
# of a probability row back into class codes.
country_idx = np.array(sorted(country_dict.values()))

def get_dcg(y_pred, y_true):
    """Return the DCG of one ranked prediction list against a single true label.

    Args:
        y_pred: Sequence of predicted labels, ordered from rank 1 onwards.
        y_true: The true label (a scalar or a one-element array).

    Returns:
        The sum over 1-based ranks ``k`` of ``(2**rel_k - 1) / log2(k + 1)``,
        where ``rel_k`` is 1 when the label at rank ``k`` equals ``y_true``
        and 0 otherwise.
    """
    # Builtin ``float`` replaces the ``np.float`` alias, which no longer
    # exists in current NumPy releases. ``np.asarray`` lets plain lists work.
    relevance = (np.asarray(y_pred) == y_true).astype(float)
    discounts = np.log2(np.arange(1, len(relevance) + 1) + 1)
    return np.sum((2 ** relevance - 1) / discounts)

def get_mean_dcg_score(proba_array, y_true):
    """Return the mean top-5 DCG over all rows of a probability matrix.

    Args:
        proba_array: One row of class probabilities per sample. Column
            positions are mapped to class codes through the module-level
            ``country_idx`` array.
        y_true: True class code for each row, indexable by row number.

    Returns:
        The mean of the per-row DCG values computed by ``get_dcg``.
    """
    per_row_dcg = []
    for row, probabilities in enumerate(proba_array):
        # Class codes ordered from least to most probable, then reversed
        # and cut down to the five most probable ones.
        ascending_codes = country_idx[np.argsort(probabilities)]
        best_five = ascending_codes[::-1][:5]
        per_row_dcg.append(get_dcg(y_pred=best_five, y_true=y_true[row]))
    return np.mean(per_row_dcg)

テスト:  予測した順番が3,1,2,0 / 正解は3（つまり正しい結果を予測できている）

In [None]:
# Sanity check of get_dcg: the true label 3 is in the first position of the
# predicted ranking.
y_true = np.array([3])
y_pred = np.array([3,1,2,0])
    
get_dcg(y_pred=y_pred, y_true=y_true)

# クロスバリデーション

NumPy Arrayにしておく

In [None]:
# Convert the feature frames and the encoded target into NumPy arrays.
X_train_ar, Y_train_map_ar, X_test_ar = map(
    np.array, (X_train, Y_train_map, X_test)
)

クロスバリデーションの実施

In [None]:
# 5-fold cross-validation of a default random forest, scored per fold with
# get_mean_dcg_score.
# NOTE(review): this KFold call (sample count and fold count as positional
# arguments, the object iterated directly) matches the legacy
# sklearn.cross_validation interface imported at the top of the file.
# Recent scikit-learn releases no longer ship that module -- confirm the
# target version before running.
k_fold = KFold(Y_train_map_ar.shape[0], 5, shuffle=True)
score_list = []

clf = RandomForestClassifier()

for train_index, test_index in k_fold:
    # Fit on the training part of the fold.
    clf.fit(X_train_ar[train_index], Y_train_map_ar[train_index])
    # Score the class probabilities predicted for the held-out part.
    fold_proba = clf.predict_proba(X_train_ar[test_index])
    score_list.append(get_mean_dcg_score(fold_proba, Y_train_map_ar[test_index]))

In [None]:
# Report the mean and standard deviation of the per-fold scores.
print("Mean: ", np.mean(score_list))
print("SD: ", np.std(score_list))

# 最終モデルの作成

In [None]:
# Fit a fresh default random forest on all training rows, then predict the
# class probabilities for every test user.
clf = RandomForestClassifier().fit(X_train_ar, Y_train_map_ar)
ypred_proba = clf.predict_proba(X_test_ar)

# Submission fileを書き出す

In [None]:
# Invert country_dict so that an integer code maps back to its country label.
country_dict_reverse = dict(zip(country_dict.values(), country_dict.keys()))
country_dict_reverse

In [None]:
# Build the submission rows: for every test user, one [id, country] row for
# each of the five most probable destinations, most probable first.
submission_data = []
for user_id, proba_each in zip(test_id, ypred_proba):
    # Class codes of the five highest-probability destinations.
    top5_idx = country_idx[np.argsort(proba_each)][::-1][:5]
    # Map the codes back to labels through country_dict_reverse. The
    # original looked them up in `country_dict_re`, which is never defined.
    submission_data.extend(
        [user_id, country_dict_reverse[idx]] for idx in top5_idx
    )

In [None]:
# Turn the list of [id, country] rows into a two-column DataFrame.
submission_df = pd.DataFrame(data=submission_data, columns=["id", "country"])

In [None]:
# Print the submission shape and preview its first ten rows.
print(submission_df.shape)
submission_df.head(10)

In [None]:
# Write the submission to a CSV file without the index column.
submission_df.to_csv(path_or_buf="Airbnb_benchmark_submission.csv", index=False)