In [None]:
import datetime
import os
import re

import numpy as np
import pandas as pd

# Load the raw training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Feature frames that are ultimately used for training and prediction;
# the following cells append columns to them.
train_X_df, test_X_df = pd.DataFrame(), pd.DataFrame()

In [None]:
# Convert a list that was serialized as a string into an ndarray
def str_to_nd(strs):
    """Parse a brace-wrapped, comma-separated string into an array of items.

    Curly braces and double quotes are removed, then the remaining text is
    split on commas.

    Args:
        strs: Raw text such as '{TV,"Wireless Internet"}'.

    Returns:
        1-D ndarray of str, one entry per item. An input with nothing left
        after stripping (e.g. '{}') yields an empty array, so counting the
        items of an empty list gives 0.
    """
    stripped = re.sub('[{}"]', '', strs)
    if not stripped:
        # ''.split(',') would return [''], i.e. one phantom empty item.
        return np.array([], dtype=str)
    return np.array(stripped.split(','))

# リストをOne-Hotに変換する
from sklearn.preprocessing import MultiLabelBinarizer
def list_to_onehot(train, test, min_freq=0.01):
    """Multi-hot encode list-valued series, keeping only frequent labels.

    The binarizer is fitted on ``train`` only and then applied to ``test``,
    so both outputs share the same label columns.

    Args:
        train: Series whose values are iterables of labels (fit + transform).
        test: Series of the same kind (transform only).
        min_freq: A label column is kept only when its mean over the training
            rows is strictly greater than this value.

    Returns:
        (train_onehot_df, test_onehot_df): 0/1 frames with category dtype,
        identical columns, and the same index as the corresponding input.
    """
    mlb = MultiLabelBinarizer()
    # Keep the callers' index so a later concat(axis=1) lines up row by row.
    train_onehot_df = pd.DataFrame(mlb.fit_transform(train.values),
                                   columns=mlb.classes_, index=train.index)
    test_onehot_df = pd.DataFrame(mlb.transform(test.values),
                                  columns=mlb.classes_, index=test.index)

    # Column selection is decided on the training data alone.
    label_freq = train_onehot_df.mean()
    columns_list = label_freq[label_freq > min_freq].index.tolist()

    train_onehot_df = train_onehot_df[columns_list].astype('int').astype('category')
    test_onehot_df = test_onehot_df[columns_list].astype('int').astype('category')
    return train_onehot_df, test_onehot_df

# Amenity data
# NOTE: the raw string column is overwritten with arrays, so this cell cannot
# be re-run without reloading train/test (str_to_nd expects a string).
train['amenities'] = train['amenities'].map(str_to_nd)
test['amenities'] = test['amenities'].map(str_to_nd)
# Encode once and reuse both halves instead of fitting the encoder twice.
train_amenity_hot, test_amenity_hot = list_to_onehot(train['amenities'], test['amenities'])
train_X_df = pd.concat([train_X_df, train_amenity_hot], axis=1)
test_X_df = pd.concat([test_X_df, test_amenity_hot], axis=1)
# Number of amenities per row
train_X_df = pd.concat([train_X_df, train['amenities'].map(len).rename('am_num')], axis=1)
test_X_df = pd.concat([test_X_df, test['amenities'].map(len).rename('am_num')], axis=1)

In [None]:
# カテゴリー系のデータをラベルにする
import itertools
import category_encoders as ce
def df_to_label(train, test):
    """Ordinal-encode categorical columns, fitting the encoder on ``train`` only.

    Missing values are first replaced by the literal string 'nan' so that they
    are encoded like any other category. The encoder is asked to emit NaN for
    categories it did not see while fitting; those NaNs are then mapped to
    label 0 by ``fillna(0)``.

    Args:
        train: DataFrame of categorical columns used for fitting.
        test: DataFrame with the same columns, transformed only.

    Returns:
        (train_oe, test_oe): integer-coded frames with category dtype.
    """
    train, test = train.fillna('nan'), test.fillna('nan')
    # 'return_nan' is the documented spelling of this option.
    oe = ce.OrdinalEncoder(handle_unknown='return_nan')
    train_oe = oe.fit_transform(train).fillna(0).astype(int).astype('category')
    test_oe = oe.transform(test).fillna(0).astype(int).astype('category')
    return train_oe, test_oe

# Categorical columns
cat_list = ['bed_type','cancellation_policy','city','cleaning_fee','host_has_profile_pic','host_identity_verified','instant_bookable','property_type','room_type']
# Fit/transform once and unpack, rather than running the encoder twice.
train_cat_label, test_cat_label = df_to_label(train[cat_list], test[cat_list])
train_X_df = pd.concat([train_X_df, train_cat_label], axis=1)
test_X_df = pd.concat([test_X_df, test_cat_label], axis=1)

In [None]:
# Shape the date-related data
def str_to_date_zero(df):
    """Turn 'YYYY-MM-DD' strings into "years before the newest year in df".

    Args:
        df: DataFrame whose columns hold '%Y-%m-%d' date strings or NaN.

    Returns:
        DataFrame with the same columns and index. Each cell is
        (latest year found anywhere in ``df``) - (year of that cell).
        Missing dates stay NaN; no fill is applied here.
    """
    # Column-wise vectorized parse; missing values become NaT and their year NaN.
    years = df.apply(lambda col: pd.to_datetime(col, format='%Y-%m-%d').dt.year)

    # Reference point is the newest year over all columns of this frame.
    latest_year = years.max().max()
    return latest_year - years

# Date-related columns
date_list = ['host_since', 'first_review', 'last_review']
train_X_df = pd.concat(
    [train_X_df, str_to_date_zero(train[date_list])],
    axis='columns',
)
test_X_df = pd.concat(
    [test_X_df, str_to_date_zero(test[date_list])],
    axis='columns',
)

In [None]:
# Month-of-year features
def str_to_month(x):
    """Return the month number of a '%Y-%m-%d' string; NaN is passed through."""
    return datetime.datetime.strptime(x, '%Y-%m-%d').month if x == x else x

# One '<column>_month' feature per date column, added to both tables in the
# same order.
for _date_col in ('host_since', 'first_review', 'last_review'):
    _month_name = _date_col + '_month'
    train_X_df = pd.concat([train_X_df, train[_date_col].map(str_to_month).rename(_month_name)], axis=1)
    test_X_df = pd.concat([test_X_df, test[_date_col].map(str_to_month).rename(_month_name)], axis=1)

In [None]:
# Express whether a value is present or not as a 0/1 flag
def res_to_onehot(df):
    """Flag every cell as present (1) or missing (0).

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        DataFrame of category dtype with the same index, where each column is
        renamed to '<original name>_hot'.
    """
    # notna() is the vectorized form of the per-cell "x == x" NaN test.
    return df.notna().astype(int).add_suffix('_hot').astype('category')

# Columns for which a presence flag is added
res_list = ['host_since']
train_X_df = pd.concat(
    [train_X_df, res_to_onehot(train[res_list])],
    axis='columns',
)
test_X_df = pd.concat(
    [test_X_df, res_to_onehot(test[res_list])],
    axis='columns',
)

In [None]:
# Convert a percentage written as a string into a float fraction
def strper_to_float(strper):
    """Return e.g. '85%' as 0.85; a NaN input yields NaN."""
    # NaN is the only value that is not equal to itself.
    if strper != strper:
        return np.nan
    return float(re.sub('%', '', strper)) / 100.

# Numeric columns copied into the feature frames as-is
num_list = ['accommodates','bathrooms','bedrooms','beds','latitude','longitude','number_of_reviews','review_scores_rating','host_response_rate']

# Percentage strings -> fractions (overwrites the raw column in both tables)
train['host_response_rate'] = train['host_response_rate'].map(strper_to_float)
test['host_response_rate'] = test['host_response_rate'].map(strper_to_float)

train_X_df = pd.concat(
    [train_X_df, train[num_list]],
    axis='columns',
)
test_X_df = pd.concat(
    [test_X_df, test[num_list]],
    axis='columns',
)

In [None]:
# 緯度経度から地理情報を入手
from uszipcode import SearchEngine
search = SearchEngine()
def latlon_to_geo(lat, lon):
    """Look up the first zipcode record returned for a coordinate.

    Args:
        lat: Latitude passed to ``search.by_coordinates``.
        lon: Longitude passed to ``search.by_coordinates``.

    Returns:
        One-row DataFrame built from the first hit's ``to_dict()``. When the
        search returns nothing within the radius, the row has no columns, so
        it shows up as an all-NaN row after concatenation instead of aborting
        the whole (slow) lookup loop with an IndexError.
    """
    hits = search.by_coordinates(lat, lon, radius=20)
    record = hits[0].to_dict() if hits else {}
    return pd.DataFrame([record])

def data_to_geo(df):
    """Run latlon_to_geo on every (latitude, longitude) pair and stack the rows."""
    per_row_frames = list(map(latlon_to_geo, df['latitude'], df['longitude']))
    return pd.concat(per_row_frames)

def geo_cat_num(geo):
    """Split a geo table into its categorical and numeric feature columns.

    ``geo['zipcode']`` is rewritten in place as a string of at most five
    characters. Values that are purely numeric (for example after the table
    was read back from CSV and parsed as int/float) are left-padded with
    zeros, so 2108, 2108.0 and '02108' all become '02108'. Anything else
    (e.g. 'nan' for a missing value) is just truncated to five characters.

    Args:
        geo: DataFrame containing 'zipcode', 'major_city' and the numeric
            columns listed below.

    Returns:
        (categorical_frame, numeric_frame) selected from ``geo``.
    """
    def _zip5(value):
        # A float zipcode such as 2108.0 must lose its '.0' before padding.
        if isinstance(value, (float, np.floating)) and np.isfinite(value):
            value = int(value)
        text = str(value)
        return text.zfill(5)[:5] if text.isdigit() else text[:5]

    geo['zipcode'] = geo['zipcode'].apply(_zip5)

    cat_cols = ['zipcode',
                'major_city']

    num_cols = ['population',
                'population_density',
                'land_area_in_sqmi',
                'water_area_in_sqmi',
                'housing_units',
                'occupied_housing_units',
                'median_home_value',
                'median_household_income']

    return geo[cat_cols], geo[num_cols]

# Building the geo tables is very slow, so each one is saved as CSV once it
# has been built and read back from that CSV on later runs.
def _load_or_build_geo(df, cache_path):
    """Return the geo table for ``df``, using ``cache_path`` as a CSV cache."""
    if os.path.isfile(cache_path):
        # Read zipcode as text so leading zeros survive the CSV round trip.
        return pd.read_csv(cache_path, index_col=0, dtype={'zipcode': str})
    geo = data_to_geo(df).reset_index()
    geo.to_csv(cache_path, header=True, index=True)
    return geo

train_geo = _load_or_build_geo(train, 'train_geo.csv')
train_geo_cat, train_geo_num = geo_cat_num(train_geo)

test_geo = _load_or_build_geo(test, 'test_geo.csv')
test_geo_cat, test_geo_num = geo_cat_num(test_geo)

# Label-encode the categorical geo columns once (fitted on train), then append
# the encoded and the numeric geo columns to the feature frames.
train_geo_label, test_geo_label = df_to_label(train_geo_cat, test_geo_cat)
train_X_df = pd.concat([train_X_df, train_geo_label], axis=1)
test_X_df = pd.concat([test_X_df, test_geo_label], axis=1)
train_X_df = pd.concat([train_X_df, train_geo_num], axis=1)
test_X_df = pd.concat([test_X_df, test_geo_num], axis=1)

In [None]:
# descriptionの単語数を調べる
from collections import Counter
# Number of whitespace-separated words in the description.
# A non-string value (e.g. NaN for a missing description) counts as 0 words
# instead of raising on .split().
wordnum = lambda x: len(x.split()) if isinstance(x, str) else 0
train_X_df = pd.concat([train_X_df, train['description'].map(wordnum).rename('word_num')], axis=1)
test_X_df = pd.concat([test_X_df, test['description'].map(wordnum).rename('word_num')], axis=1)

In [None]:
# Room count: bedrooms plus bathrooms
train_X_df['rooms'] = train_X_df['bedrooms'].add(train_X_df['bathrooms'])
test_X_df['rooms'] = test_X_df['bedrooms'].add(test_X_df['bathrooms'])

In [None]:
# Measure the distance to reference locations where prices are high
# Each row of top_loc is (longitude, latitude).
top_loc = np.array([[-118.813009,34.028313],[-118.448221,34.132164]])

def u_dis(x1,y1,x2,y2):
    """Euclidean distance between (x1, y1) and (x2, y2); broadcasts over arrays."""
    return np.sqrt((x1-x2)**2 + (y1-y2)**2)

def min_dis(df):
    """Distances from one row's coordinates to every location in ``top_loc``.

    Despite the name, no minimum is taken: one distance per reference
    location is returned.

    Args:
        df: A single row (or mapping) providing 'longitude' and 'latitude'.

    Returns:
        1-D float ndarray of length ``len(top_loc)``, in ``top_loc`` order.
    """
    # Broadcasting over the top_loc columns fills every slot; looping over
    # only the first location would leave the remaining slots unset.
    return u_dis(df['longitude'], df['latitude'], top_loc[:, 0], top_loc[:, 1])

# Append one column per reference location (row-wise min_dis expanded into
# columns), then give the integer-named columns a 'dis_' prefix.
train_X_df = pd.concat(
    [train_X_df, train_X_df.apply(min_dis, axis='columns').apply(pd.Series)],
    axis='columns',
)
test_X_df = pd.concat(
    [test_X_df, test_X_df.apply(min_dis, axis='columns').apply(pd.Series)],
    axis='columns',
)

train_X_df = train_X_df.rename(columns={pos: 'dis_' + str(pos) for pos in range(10)})
test_X_df = test_X_df.rename(columns={pos: 'dis_' + str(pos) for pos in range(10)})

In [None]:
# Share of housing units that are occupied
train_X_df['occupied_housing_rate'] = train_X_df['occupied_housing_units'].div(train_X_df['housing_units'])
test_X_df['occupied_housing_rate'] = test_X_df['occupied_housing_units'].div(test_X_df['housing_units'])

In [None]:
from copy import copy
from sklearn.model_selection import train_test_split
# Split the training data: 80% stays in train_X / train_Y, the held-out 20%
# goes to train_X_split / train_Y_split.
train_X, train_Y, test_X = train_X_df.copy(), train['y'].copy(), test_X_df.copy()
# Fixed random_state so the split (and every score based on it) is the same
# on each run.
train_X, train_X_split, train_Y, train_Y_split = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=0)

In [None]:
import math
from sklearn.metrics import mean_squared_error
from catboost import Pool
import catboost
import sklearn.metrics
import sklearn.preprocessing as sp
def objective(trial):
    """Optuna objective: mean hold-out RMSE of CatBoost over repeated fits.

    Uses the module-level ``train_X`` / ``train_Y`` for fitting and
    ``train_X_split`` / ``train_Y_split`` both as the early-stopping eval set
    and for scoring.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean RMSE on the hold-out split over ``n_repeats`` fits.
    """
    n_repeats = 20

    # Convert the data
    cat_features = train_X.dtypes[train_X.dtypes == 'category'].index
    c_train = Pool(train_X, label=train_Y,cat_features=cat_features)
    c_train_split = Pool(train_X_split, label=train_Y_split,cat_features=cat_features)

    # Hyper-parameter search space
    params = {
        'iterations' : trial.suggest_int('iterations', 50, 300),
        'depth' : trial.suggest_int('depth', 4, 10),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50)
    }

    # Average the score over several fits. Each repeat gets its own seed;
    # otherwise every repeat is trained with exactly the same configuration.
    rmse_list = np.empty(n_repeats)
    for i in range(n_repeats):
        model = catboost.CatBoostRegressor(**params, loss_function='RMSE',task_type='GPU', max_bin=250, random_seed=i)
        model.fit(c_train, eval_set=c_train_split, early_stopping_rounds=10,verbose=False)
        rmse_list[i] = math.sqrt(mean_squared_error(train_Y_split, model.predict(c_train_split)))

    return np.mean(rmse_list)

In [None]:
import optuna
# Search for the best hyper-parameters (one hour budget) and keep a copy of
# the winning trial's parameters with the seed pinned to 0.
study = optuna.create_study()
study.optimize(objective, timeout=3600)
trial = study.best_trial
params_best = {name: value for name, value in trial.params.items()}
params_best['random_seed'] = 0

In [None]:
# Train the final models with the best parameters and average their test
# predictions.
cat_features = train_X.dtypes[train_X.dtypes == 'category'].index
c_train = Pool(train_X, label=train_Y,cat_features=cat_features)
c_train_split = Pool(train_X_split, label=train_Y_split,cat_features=cat_features)
c_test = Pool(test_X,cat_features=cat_features)

n_models = 20
pre_list = np.empty((n_models,test.shape[0]))
for i in range(n_models):
    # Give every model its own seed: with the single seed stored in
    # params_best all fits share one configuration and the average adds
    # nothing.
    model_o = catboost.CatBoostRegressor(**dict(params_best, random_seed=i), loss_function='RMSE',task_type='GPU', max_bin=250)
    model_o.fit(c_train, eval_set=c_train_split, early_stopping_rounds=10,verbose=False)
    pre_list[i] = model_o.predict(c_test).flatten()
results = np.mean(pre_list, axis=0)

In [None]:
# Show the averaged test predictions
print(results)

In [None]:
# Write the predictions without a header; the row position is the first column.
# Create the output folder first so a fresh checkout does not fail here.
os.makedirs('submit', exist_ok=True)
pd.DataFrame(results).to_csv('submit/submit.csv', header=False, index=True)

In [None]:
# !pip install lightgbm
# !pip install optuna

In [None]:
#!pip install category_encoders