In [1]:
import os
import re
import time
import pandas as pd
import numpy as np

start = time.perf_counter()

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
train_X_df = pd.DataFrame()
test_X_df = pd.DataFrame()

In [3]:
# strで書かれてしまったリストをndarrayに変換する
def str_to_nd(strs):
    strs = re.sub('[{}"]','',strs)
    str_list = strs.split(',')
    str_nd = np.array(str_list)
    return str_nd

# リストをOne-Hotに変換する
from sklearn.preprocessing import MultiLabelBinarizer
def list_to_onehot(train,test):
    mlb = MultiLabelBinarizer()
    train_onehot = mlb.fit_transform(train.values)
    test_onehot = mlb.transform(test.values)
    train_onehot_df = pd.DataFrame(train_onehot, columns = mlb.classes_)
    columns_list = train_onehot_df.mean()[train_onehot_df.mean()>0.00].index.tolist()
    train_onehot_df = train_onehot_df[columns_list]
    test_onehot_df = pd.DataFrame(test_onehot, columns = mlb.classes_)
    test_onehot_df = test_onehot_df[columns_list]
    return train_onehot_df.astype('int').astype('category'),test_onehot_df.astype('int').astype('category')

# アメニティデータ
train['amenities'] = train['amenities'].map(str_to_nd)
test['amenities'] = test['amenities'].map(str_to_nd)
train_X_df = pd.concat([train_X_df,list_to_onehot(train['amenities'],test['amenities'])[0]], axis=1)
test_X_df = pd.concat([test_X_df,list_to_onehot(train['amenities'],test['amenities'])[1]], axis=1)
# アメニティの数
train_X_df = pd.concat([train_X_df,train['amenities'].map(lambda x:len(x)).rename('am_num')], axis=1)
test_X_df = pd.concat([test_X_df,test['amenities'].map(lambda x:len(x)).rename('am_num')], axis=1)



In [4]:
# カテゴリー系のデータをラベルにする
import itertools
import category_encoders as ce
def df_to_label(train,test):
    train,test = train.fillna('nan'),test.fillna('nan')
    oe = ce.OrdinalEncoder(handle_unknown='return nan')
    train_oe = oe.fit_transform(train).fillna(999).astype(int).astype('category') # 欠損値はありえない数値で埋める
    test_oe = oe.transform(test).fillna(999).astype(int).astype('category')
    return train_oe,test_oe

# カテゴリー系データ
cat_list = ['bed_type','cancellation_policy','city','cleaning_fee','host_has_profile_pic','host_identity_verified','instant_bookable','property_type','room_type','neighbourhood']
train_X_df = pd.concat([train_X_df,df_to_label(train[cat_list],test[cat_list])[0]], axis=1)
test_X_df = pd.concat([test_X_df,df_to_label(train[cat_list],test[cat_list])[1]], axis=1)

In [5]:
# 日付関連のデータを整形
import datetime
def str_to_date(df):
    # 年と月だけを抜き出す、nanはそのまま
    df_year = df.applymap(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').year if x==x else x)
    df_month = df.applymap(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month if x==x else x)

    # 月で換算する(わかりやすくするために年は2000までは切り捨て)
    df_date = (df_year - 2000) * 12 + df_month

    return df_date

# 日付関連のデータ
date_list = ['host_since','first_review','last_review']
train_X_df = pd.concat([train_X_df,str_to_date(train[date_list])], axis=1)
test_X_df = pd.concat([test_X_df,str_to_date(test[date_list])], axis=1)

In [6]:
# 月データ(季節)
str_to_month = lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month if x==x else x
train_X_df = pd.concat([train_X_df,train['host_since'].map(str_to_month).rename('host_since_month')], axis=1)
test_X_df = pd.concat([test_X_df,test['host_since'].map(str_to_month).rename('host_since_month')], axis=1)

train_X_df = pd.concat([train_X_df,train['first_review'].map(str_to_month).rename('first_review_month')], axis=1)
test_X_df = pd.concat([test_X_df,test['first_review'].map(str_to_month).rename('first_review_month')], axis=1)

train_X_df = pd.concat([train_X_df,train['last_review'].map(str_to_month).rename('last_review_month')], axis=1)
test_X_df = pd.concat([test_X_df,test['last_review'].map(str_to_month).rename('last_review_month')], axis=1)

In [7]:
# 反応があるかないかをOne-Hotデータで表現する
def res_to_onehot(df):
    df = df.applymap(lambda x: 1 if x==x else 0)
    df = df.set_axis([x+'_hot' for x in df.columns], axis=1)
    return df.astype('category')

#res_list = ['first_review','host_since','last_review','thumbnail_url']
res_list = ['host_since']
train_X_df = pd.concat([train_X_df,res_to_onehot(train[res_list])], axis=1)
test_X_df = pd.concat([test_X_df,res_to_onehot(test[res_list])], axis=1)

In [8]:
# str型のパーセンテージで書かれたデータを0~1のfloat型にする
def strper_to_float(strper):
    if strper==strper:
        strnum = re.sub('%','',strper)
        num_float = float(strnum) / 100.
    else:
        num_float = np.nan
    return num_float

# str型のパーセンテージ
train['host_response_rate'] = train['host_response_rate'].map(strper_to_float)
test['host_response_rate'] = test['host_response_rate'].map(strper_to_float)

# float型データ
num_list = ['accommodates','bathrooms','bedrooms','beds','latitude','longitude','number_of_reviews','review_scores_rating','host_response_rate']
train_X_df = pd.concat([train_X_df,train[num_list]], axis=1)
test_X_df = pd.concat([test_X_df,test[num_list]], axis=1)

In [9]:
# 緯度経度から地理情報を入手
from uszipcode import SearchEngine
search = SearchEngine()
def latlon_to_geo(lat, lon):
    d = search.by_coordinates(lat, lon, radius=20)[0].to_dict()
    return pd.DataFrame([d])

def data_to_geo(df):
    geo = pd.concat([latlon_to_geo(x,y) for x,y in zip(df['latitude'],df['longitude'])])
    return geo

def geo_cat_num(geo):
    geo['zipcode'] = geo['zipcode'].apply(lambda x:str(x)[:5])
    cat_list = ['zipcode', 
                'major_city']

    num_list = ['population', 
                'population_density', 
                'land_area_in_sqmi', 
                'water_area_in_sqmi', 
                'housing_units', 
                'occupied_housing_units', 
                'median_home_value', 
                'median_household_income']
    
    return geo[cat_list], geo[num_list]

# DataFrameの作成が非常に遅いため、DataFrameを作成したらcsvとして保存し次に使うときはcsvから読み込む

if os.path.isfile('train_geo.csv'):
    train_geo = pd.read_csv('train_geo.csv', index_col=0)
else:
    train_geo = data_to_geo(train).reset_index()
    train_geo.to_csv('train_geo.csv', header=True, index=True)
train_geo_cat, train_geo_num = geo_cat_num(train_geo)

if os.path.isfile('test_geo.csv'):
    test_geo = pd.read_csv('test_geo.csv', index_col=0)
else:
    test_geo = data_to_geo(test).reset_index()
    test_geo.to_csv('test_geo.csv', header=True, index=True)
test_geo_cat, test_geo_num = geo_cat_num(test_geo)

train_X_df = pd.concat([train_X_df,df_to_label(train_geo_cat,test_geo_cat)[0]], axis=1)
test_X_df = pd.concat([test_X_df,df_to_label(train_geo_cat,test_geo_cat)[1]], axis=1)
train_X_df = pd.concat([train_X_df,train_geo_num], axis=1)
test_X_df = pd.concat([test_X_df,test_geo_num], axis=1)

In [10]:
# descriptionの単語数を調べる
from collections import Counter
wordnum = lambda x:sum(Counter(x.split()).values())
train_X_df = pd.concat([train_X_df,train['description'].map(wordnum).rename('word_num')], axis=1)
test_X_df = pd.concat([test_X_df,test['description'].map(wordnum).rename('word_num')], axis=1)

In [11]:
# 部屋数
train_X_df['rooms'] = train_X_df['bedrooms'] + train_X_df['bathrooms']
test_X_df['rooms'] = test_X_df['bedrooms'] + test_X_df['bathrooms']

In [12]:
# 価格が高い地点との距離を測る
top_loc = np.array([[-118.813009,34.028313],[-118.448221,34.132164]])

def u_dis(x1,y1,x2,y2):
    return np.sqrt((x1-x2)**2 + (y1-y2)**2)

def min_dis(df):
    tmp = np.empty(2)
    for i,x,y in zip(range(1),top_loc[:,0],top_loc[:,1]):
        tmp[i] = u_dis(df['longitude'],df['latitude'],x,y)
    return tmp

train_X_df = pd.concat([train_X_df,train_X_df.apply(min_dis, axis=1).apply(pd.Series)], axis=1)
test_X_df = pd.concat([test_X_df,test_X_df.apply(min_dis, axis=1).apply(pd.Series)], axis=1)

train_X_df = train_X_df.rename(columns=dict([(x,'dis_'+str(x)) for x in range(2)]))
test_X_df = test_X_df.rename(columns=dict([(x,'dis_'+str(x)) for x in range(2)]))

In [13]:
# 占有住宅率
train_X_df['occupied_housing_rate'] = train_X_df['occupied_housing_units'] / train_X_df['housing_units']
test_X_df['occupied_housing_rate'] = test_X_df['occupied_housing_units'] / test_X_df['housing_units']

In [14]:
# 学習用データの作成
from nltk import sent_tokenize, word_tokenize
remnum = lambda x:re.sub(r'[0-9]+', " ", x)
remstr = lambda x:re.sub(r'[\．_－―─\-‐|\“■×+α÷⇒—●□(=)*&^%$#@!~`){}…\[\]\"\'\”\’:;<>?・,\./→←○\n\u3000]+', " ", x)
shape_sent = lambda x:word_tokenize(x.lower())

train_desc = train['description'].map(remnum).map(remstr).map(shape_sent)
test_desc = test['description'].map(remnum).map(remstr).map(shape_sent)

train_name = train['name'].map(remnum).map(remstr).map(shape_sent)
test_name = test['name'].map(remnum).map(remstr).map(shape_sent)

In [15]:
# description用モデルの作成
vector_size=50
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
if os.path.isfile('description.model'):
    model_desc = Word2Vec.load('description.model')
else:
    # word2vecのモデル作成
    sentence = train_desc.values.tolist()
    model_desc = Word2Vec(sentence,vector_size=vector_size,window=15,min_count=5)
    model_desc.train(sentence,total_examples=len(sentence),epochs=100)
    model_desc.save('description.model')



In [16]:
# name用モデルの作成
vector_size_name=50
if os.path.isfile('name.model'):
    model_name = Word2Vec.load('name.model')
else:
    # word2vecのモデル作成
    sentence = train_name.values.tolist()
    model_name = Word2Vec(sentence,vector_size=vector_size,window=10,min_count=5)
    model_name.train(sentence,total_examples=len(sentence),epochs=50)
    model_name.save('name.model')

In [17]:
# descriptionの文章ベクトルの作成
def wordvec2docvec(sentence):
    # 文章ベクトルの初期値（0ベクトルを初期値とする）
    docvecs = np.zeros(vector_size, dtype="float32")

    # 文章の中に存在する単語の数
    denomenator = len(sentence)
    # 文章内の各単語ベクトルを足し合わせる
    for word in sentence:
        try:
            temp = model_desc.wv[word]
        except:
            denomenator -= 1
            continue
        docvecs += temp

    # 文章に現れる単語のうち、モデルに存在した単語の数で割る
    if denomenator > 0:
        docvecs =  docvecs / denomenator

    return docvecs

train_X_df = pd.concat([train_X_df,train_desc.map(wordvec2docvec).apply(pd.Series)], axis=1)
test_X_df = pd.concat([test_X_df,test_desc.map(wordvec2docvec).apply(pd.Series)], axis=1)
train_X_df = train_X_df.rename(columns=dict([(x,'desc_'+str(x)) for x in range(vector_size)]))
test_X_df = test_X_df.rename(columns=dict([(x,'desc_'+str(x)) for x in range(vector_size)]))

In [18]:
# nameの文章ベクトルの作成
def wordvec2docvec(sentence):
    # 文章ベクトルの初期値（0ベクトルを初期値とする）
    docvecs = np.zeros(vector_size_name, dtype="float32")

    # 文章の中に存在する単語の数
    denomenator = len(sentence)
    # 文章内の各単語ベクトルを足し合わせる
    for word in sentence:
        try:
            temp = model_name.wv[word]
        except:
            denomenator -= 1
            continue
        docvecs += temp

    # 文章に現れる単語のうち、モデルに存在した単語の数で割る
    if denomenator > 0:
        docvecs =  docvecs / denomenator

    return docvecs

train_X_df = pd.concat([train_X_df,train_name.map(wordvec2docvec).apply(pd.Series)], axis=1)
test_X_df = pd.concat([test_X_df,test_name.map(wordvec2docvec).apply(pd.Series)], axis=1)
train_X_df = train_X_df.rename(columns=dict([(x,'name_'+str(x)) for x in range(vector_size)]))
test_X_df = test_X_df.rename(columns=dict([(x,'name_'+str(x)) for x in range(vector_size)]))

KeyboardInterrupt: 

In [20]:
pd.set_option('display.max_columns', 800)
display(train_X_df.head(5))
print(train_X_df.shape)

Unnamed: 0,24-hour check-in,Air conditioning,Bathtub,Bed linens,Breakfast,Buzzer/wireless intercom,Cable TV,Carbon monoxide detector,Cat(s),Children’s books and toys,Coffee maker,Cooking basics,Dishes and silverware,Dishwasher,Dog(s),Doorman,Dryer,Elevator,Elevator in building,Essentials,Extra pillows and blankets,Family/kid friendly,Fire extinguisher,First aid kit,Free parking on premises,Garden or backyard,Gym,Hair dryer,Hangers,Heating,Host greets you,Hot tub,Hot water,Indoor fireplace,Internet,Iron,Keypad,Kitchen,Laptop friendly workspace,Lock on bedroom door,Lockbox,Long term stays allowed,Luggage dropoff allowed,Microwave,Oven,Pack ’n Play/travel crib,Patio or balcony,Pets allowed,Pets live on this property,Pool,Private entrance,Private living room,Refrigerator,Room-darkening shades,Safety card,Self Check-In,Shampoo,Smartlock,Smoke detector,Smoking allowed,Step-free access,Stove,Suitable for events,TV,Washer,Well-lit path to entrance,Wheelchair accessible,Wireless Internet,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,am_num,bed_type,cancellation_policy,city,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,property_type,room_type,neighbourhood,host_since,first_review,last_review,host_since_month,first_review_month,last_review_month,host_since_hot,accommodates,bathrooms,bedrooms,beds,latitude,longitude,number_of_reviews,review_scores_rating,host_response_rate,zipcode,major_city,population,population_density,land_area_in_sqmi,water_area_in_sqmi,housing_units,occupied_housing_units,median_home_value,median_household_income,word_num,rooms,dis_0,dis_1,occupied_housing_rate,desc_0,desc_1,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9,desc_10,desc_11,desc_12,desc_13,desc_14,desc_15,desc_16,desc_17,desc_18,desc_19,desc_20,desc_21,desc_22,desc_23,desc_24,desc_25,desc_26,desc_27,desc_28,desc_29,desc_30,desc_31,desc_32,desc_33,desc_34,desc_35,desc_36,desc_37,desc_38,desc_39,desc_40,desc_41,desc_42,desc_43,desc_44,desc_45,desc_46,desc_47,desc_48,desc_49
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,7,1,1,1,1,1,1,1,1,1,1,199.0,199.0,199.0,7.0,7.0,7.0,1,6,2.0,1.0,4.0,33.788931,-118.154761,1,60.0,,1,1,40311,19061.0,2.11,0.0,15792,14556,340600.0,42672.0,76,3.0,0.700424,-6.522993e-166,0.921733,-1.900919,0.231837,-0.361965,-0.15613,-0.888952,-0.208904,-1.498402,-1.058776,-1.216541,0.651656,-0.626779,0.310452,-0.587204,0.303202,0.293937,0.174701,0.426225,-0.731825,-0.458716,-0.847465,0.1286,-0.935357,-1.55762,0.34327,-0.900722,2.083479,-1.143699,0.372548,0.619828,1.278649,0.188055,0.684179,-0.713492,1.411318,-0.574032,0.528712,-0.978824,-0.493474,-1.327512,-1.043061,-0.318801,-0.975571,0.215525,0.838071,0.070018,-0.53413,-0.801854,0.992161,-0.442845,1.611833
1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,1,1,1,22,1,2,2,1,1,2,1,2,1,2,192.0,201.0,207.0,12.0,9.0,3.0,1,2,1.0,1.0,1.0,38.93481,-76.97819,9,100.0,1.0,2,2,16894,5602.0,3.02,0.01,7652,7005,355500.0,52317.0,70,2.0,42.12156,1.061916e+109,0.915447,0.044158,0.113658,-0.033609,-0.364714,0.567105,0.556047,-0.630063,1.355203,0.906076,1.044096,-0.557932,-0.351208,-0.424376,-0.22948,-0.556773,-0.920768,-0.316864,-0.651414,-1.711163,-0.016381,-0.703694,0.295607,-0.946121,0.525093,-0.73459,1.056076,-0.226515,0.039734,-0.553441,1.237357,0.458374,0.601546,0.402689,0.694649,-0.34633,-1.309196,-0.088227,1.793174,0.442358,-0.552676,0.271393,0.220107,-0.050281,-0.068751,-0.194533,-0.259209,0.293063,-0.529087,-0.901656,-0.065665
2,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,1,0,0,1,1,1,29,1,2,3,1,1,1,2,1,1,3,197.0,198.0,212.0,5.0,6.0,8.0,1,2,2.0,1.0,1.0,40.695118,-73.92624,27,83.0,1.0,3,3,78895,57033.0,1.38,0.0,31784,28391,543800.0,39178.0,79,3.0,45.379162,0.0,0.893248,-0.641935,-0.581967,0.304856,0.081552,-0.43701,-0.088733,-0.844983,-1.084293,-1.057522,1.186769,-0.377836,-0.059716,-0.538127,-0.128097,0.604671,0.856488,1.339953,-0.5528,-0.815549,-0.582429,0.276294,-0.460807,-1.868058,-0.178864,0.178433,0.47623,-0.874663,-0.85182,0.760317,1.059791,0.714377,0.655204,-1.703702,-0.512337,0.115533,0.148635,-0.494098,0.465845,0.105314,1.105203,0.369854,-1.444038,-0.239121,0.363112,-0.011313,-0.150178,-0.214801,-0.5479,-0.210648,0.672317
3,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,1,1,0,0,1,0,0,18,1,2,4,1,1,2,2,1,1,4,150.0,171.0,213.0,6.0,3.0,9.0,1,2,1.0,1.0,1.0,37.796728,-122.411906,38,95.0,1.0,4,4,13768,50983.0,0.27,0.0,8843,7628,856900.0,35427.0,183,2.0,5.210855,6.952391e-310,0.862603,0.097897,-0.367784,-0.778518,-0.169237,0.165545,-1.182026,-0.726145,-0.224695,0.544596,-0.240758,0.234892,-0.2559,-0.238931,-0.209705,-0.756818,-1.806813,-0.685274,-0.042196,0.533328,0.885913,-1.117236,0.382014,-1.079423,0.035708,-0.904172,0.300797,-0.514923,-0.432467,0.355145,0.265456,0.359447,0.548238,-0.258019,1.510483,-0.224177,-0.172252,1.010828,0.934962,0.084096,-0.662155,-0.346692,0.934905,1.29328,1.525268,0.665803,0.336256,0.468665,0.663039,0.584035,-1.026452
4,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,18,1,2,3,1,1,2,1,1,2,5,183.0,188.0,213.0,3.0,8.0,9.0,1,2,1.0,1.0,1.0,40.78505,-73.974691,5,100.0,1.0,5,5,60998,124357.0,0.49,0.0,39402,34383,872500.0,103534.0,172,2.0,45.344551,6.952391e-310,0.872621,0.336234,-0.846469,0.236255,-1.909312,-0.199328,-0.774147,0.764652,-0.313032,-0.394287,2.670185,-0.197344,0.686809,-1.727731,-0.365483,0.155637,0.838438,0.079,-0.701284,-0.723818,-1.423023,0.805489,-0.086649,-0.137798,0.371904,-0.50095,0.408363,-1.084448,-0.031117,-0.620314,0.669138,0.58113,1.149369,-1.273463,-0.366768,0.374153,-0.591189,-0.095716,-0.51041,-0.048973,-0.990633,-0.204771,0.025449,-0.849469,-1.580561,-0.786989,-0.199927,-0.945691,-0.420864,0.077025,1.16286
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,10,1,1,3,1,1,2,1,2,1,6,203.0,206.0,213.0,11.0,2.0,9.0,1,2,1.0,1.0,1.0,40.640241,-74.015729,7,94.0,1.0,6,3,99598,55603.0,1.79,0.0,31045,29018,615300.0,37580.0,158,2.0,45.2826,6.952395e-310,0.934708,0.405066,-0.0696,0.505565,-1.203447,-0.149615,-1.166468,-0.337233,-0.931331,0.178086,1.163529,-0.758704,-0.880686,-0.54458,-0.066734,1.938759,0.021058,1.617234,-0.104793,0.920867,-0.345064,0.114559,0.202199,-1.015886,0.69473,-0.644118,1.621285,-0.853773,0.469426,0.293949,0.966986,0.401074,1.459478,-1.357022,3.145036,-0.37656,-0.25511,-0.029341,0.671229,-0.5675,-0.30756,-0.522086,-0.381495,0.175681,0.743278,1.136454,0.129608,-0.201142,-0.23146,-0.848795,0.659672
6,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,1,24,1,3,3,1,1,2,2,2,1,7,173.0,185.0,213.0,5.0,5.0,9.0,1,2,1.5,1.0,1.0,40.676824,-73.915965,65,91.0,1.0,7,3,67053,49746.0,1.35,0.0,29074,25693,551900.0,34492.0,170,2.5,45.386642,6.952395e-310,0.883711,0.342457,0.079571,-1.218456,-0.576328,1.549925,-0.6423,-0.863667,1.474615,0.351032,-0.619935,0.614273,-0.243659,-0.229384,0.006351,0.423398,-1.174113,-0.032415,0.788159,1.123074,0.958812,-0.677423,-0.347228,-1.178809,-1.564016,-0.020211,0.492216,-0.823553,1.038746,1.477468,-0.141053,1.337039,0.23767,-0.404893,1.521518,0.094889,-0.237682,1.249783,0.812284,0.306544,-1.045394,0.513504,0.110728,0.509446,0.427596,0.805039,-0.153877,1.485242,0.253838,-0.373908,-1.828358
7,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,17,2,1,1,2,1,1,1,1,1,8,128.0,,,8.0,,,1,2,1.0,1.0,1.0,34.068441,-118.353515,0,,,8,6,36865,14922.0,2.47,0.0,19920,18646,945000.0,71589.0,169,2.0,0.461243,6.952395e-310,0.936044,0.954821,0.034923,-0.484426,-0.159271,-0.555823,-0.253049,-0.051201,0.419002,0.786326,0.248683,-0.02956,0.030407,0.159311,-0.771203,0.089949,0.471646,-1.01955,0.106689,-0.254579,-0.757221,0.022151,0.234992,0.32794,0.927248,0.395764,-0.181856,-0.484569,-0.345169,0.005214,0.303876,-0.425588,0.392218,-0.132748,-0.846639,0.294696,-0.874053,-0.893368,0.271586,-0.762572,0.064742,-0.932217,1.170915,0.124109,-0.064459,-0.515906,-0.012872,-0.197922,-0.474142,0.615176,-0.597599
8,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,1,14,1,2,4,1,1,1,1,1,1,9,199.0,210.0,213.0,7.0,6.0,9.0,1,2,1.0,1.0,1.0,37.801514,-122.41141,21,87.0,1.0,9,4,3713,10791.0,0.34,0.19,2811,2365,882000.0,89722.0,178,2.0,5.213975,5e-324,0.841338,-0.410205,-0.229365,-0.309753,-0.854542,-1.072711,-0.790663,-0.362803,0.143501,-0.260168,0.388682,0.108815,0.197395,-0.661643,-0.137759,0.195646,-0.651456,-0.336653,-1.017495,-0.829252,-0.849578,-0.161113,-0.826619,-1.362941,-0.247913,-0.553381,1.395698,-0.859032,-0.544096,-0.614442,0.698932,0.666913,0.555384,-0.524115,0.821469,0.152502,0.278581,-0.608494,-0.430915,-0.410455,-0.317834,-0.20624,0.069858,-0.124559,-0.094706,-0.135205,-0.36699,-0.109363,-0.041464,0.216989,0.124994
9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,7,1,3,3,1,1,1,1,2,1,10,142.0,156.0,201.0,10.0,12.0,9.0,1,2,1.0,1.0,1.0,40.751933,-73.878733,27,88.0,1.0,10,7,66636,90360.0,0.74,0.0,25100,23906,265500.0,48683.0,199,2.0,45.434527,4e-323,0.95243,-0.386466,-0.415543,-0.790199,-1.334186,-0.155241,-0.885956,-0.106774,-0.759018,-0.699641,1.143974,-0.907503,0.162517,-0.153001,0.668408,1.270368,0.499638,0.043442,-0.955577,0.586342,-0.2946,-0.781297,-1.189999,-1.0464,-0.343633,-0.400559,1.248821,-1.059819,0.478923,0.48711,-0.082268,-0.068191,1.065826,-0.293593,1.016145,-0.429413,0.128414,0.273648,0.489063,-0.543996,-0.574489,-0.456753,-1.731431,-0.363465,0.074568,0.073573,-0.44047,0.647871,1.039814,0.494598,0.717888


In [22]:
from copy import copy
import catboost
from catboost import Pool
import sklearn.preprocessing as sp
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
def CAT(train_X_df,train_Y_df,test_X_df,test_size=0):
    # 代入されないようシャドーコピー
    train_X,train_Y,test_X = copy(train_X_df),copy(train_Y_df),copy(test_X_df)
    # データ分割(test_sizeを指定しない場合は分割しない)
    if test_size:
        train_X, train_X_split, train_Y, train_Y_split = train_test_split(train_X, train_Y, test_size=test_size)
    # データを変換
    cat_features = train_X.dtypes[train_X.dtypes == 'category'].index
    c_train = Pool(train_X, label=train_Y,cat_features=cat_features)
    if test_size: c_train_split = Pool(train_X_split, label=train_Y_split,cat_features=cat_features)
    c_test = Pool(test_X,cat_features=cat_features)
    # モデル定義
    model = catboost.CatBoostRegressor(iterations=100000, loss_function='RMSE')
    # 学習
    if test_size:
        model.fit(c_train, eval_set=c_train_split,early_stopping_rounds=1000, use_best_model=True,verbose=True)
    else:
        model.fit(c_train, early_stopping_rounds=1000, use_best_model=True,verbose=True)

    # モデルの評価(データ分割をしている場合としてしない場合で分岐)
    
    if test_size:
        pred_train = model.predict(c_train)
        pred_test = model.predict(c_train_split)
        print(mean_squared_error(train_Y, pred_train))
        print(mean_squared_error(train_Y_split, pred_test))
    else:
        pred_train = model.predict(c_train)
        print(mean_squared_error(train_Y, pred_train))

    # モデルの予測
    predictions = model.predict(c_test)

    return predictions, model.get_feature_importance(c_train), train_X.columns

60 (658)	total: 3m 18s	remaining: 37.2s
842:	learn: 77.3369845	test: 99.2745396	best: 99.1285460 (658)	total: 3m 18s	remaining: 37s
843:	learn: 77.3332779	test: 99.2755531	best: 99.1285460 (658)	total: 3m 18s	remaining: 36.7s
844:	learn: 77.3237808	test: 99.2777809	best: 99.1285460 (658)	total: 3m 19s	remaining: 36.5s
845:	learn: 77.3101861	test: 99.2791128	best: 99.1285460 (658)	total: 3m 19s	remaining: 36.3s
846:	learn: 77.2951691	test: 99.2759221	best: 99.1285460 (658)	total: 3m 19s	remaining: 36s
847:	learn: 77.2536982	test: 99.2847200	best: 99.1285460 (658)	total: 3m 19s	remaining: 35.8s
848:	learn: 77.2336665	test: 99.2777741	best: 99.1285460 (658)	total: 3m 19s	remaining: 35.5s
849:	learn: 77.2091788	test: 99.2646954	best: 99.1285460 (658)	total: 3m 20s	remaining: 35.3s
850:	learn: 77.1977737	test: 99.2768809	best: 99.1285460 (658)	total: 3m 20s	remaining: 35.1s
851:	learn: 77.1831332	test: 99.2805788	best: 99.1285460 (658)	total: 3m 20s	remaining: 34.9s
852:	learn: 77.1539492	t

In [None]:
model_num = 100 # モデルの数
timeout = 100000 # 何秒経ったら切り上げるか

dir_path = '/submit/subs'

os.makedirs(dir_path, exist_ok=True)

for i in range(model_num):
    predictions, feature_importances, feature_names = CAT(train_X_df,train['y'],test_X_df,test_size=0.1)
    results = predictions.flatten()
    pd.DataFrame(results).to_csv(os.path.join(dir_path, str(i+1) + 'sub.csv'), header=False, index=True)
    if (time.perf_counter() - start) > timeout: break

In [None]:
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

In [48]:
from glob import glob

csvs = glob(dir_path + '*')
csv_list = [pd.read_csv(csv, header=None) for csv in csvs]
sum_csv = sum(csv_list) / len(csv_list)

pd.DataFrame(sum_csv).to_csv('submit.csv', header=False, index=True)