In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
import os

# List the entries of the current working directory.
path = "./"
files = os.listdir(path)

In [2]:
# Load the competition CSV files from the current working directory.
INPUT = Path.cwd()
df_train, df_test = (
    pd.read_csv(INPUT / csv_name) for csv_name in ("train.csv", "test.csv")
)
# The sample submission ships without a header row, so name its columns here.
df_sample_sub = pd.read_csv(INPUT / "sample_submit.csv", header=None)
df_sample_sub.columns = ["index", "SalePrice"]

# EDA

In [3]:
df_train.describe()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,latitude,longitude,number_of_reviews,review_scores_rating,y
count,55583.0,55583.0,55436.0,55512.0,55487.0,55583.0,55583.0,55583.0,43027.0,55583.0
mean,27791.0,3.152906,1.236426,1.265312,1.71177,38.450812,-92.342406,20.937283,94.077928,160.163647
std,16045.574343,2.153001,0.583074,0.849319,1.257822,3.080869,21.687844,37.951889,7.802154,168.08714
min,0.0,1.0,0.0,0.0,0.0,33.338905,-122.5115,0.0,20.0,1.0
25%,13895.5,2.0,1.0,1.0,1.0,34.128205,-118.341876,1.0,92.0,74.0
50%,27791.0,2.0,1.0,1.0,1.0,40.663312,-76.995602,6.0,96.0,111.0
75%,41686.5,4.0,1.0,1.0,2.0,40.746146,-73.954703,23.0,100.0,185.0
max,55582.0,16.0,8.0,10.0,18.0,42.390437,-70.999166,605.0,100.0,1999.0


In [4]:
df_train.dtypes

id                          int64
accommodates                int64
amenities                  object
bathrooms                 float64
bed_type                   object
bedrooms                  float64
beds                      float64
cancellation_policy        object
city                       object
cleaning_fee               object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate         object
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
property_type              object
review_scores_rating      float64
room_type                  object
thumbnail_url              object
zipcode                    object
y                         float64
dtype: object

In [5]:
df_train.columns

Index(['id', 'accommodates', 'amenities', 'bathrooms', 'bed_type', 'bedrooms',
       'beds', 'cancellation_policy', 'city', 'cleaning_fee', 'description',
       'first_review', 'host_has_profile_pic', 'host_identity_verified',
       'host_response_rate', 'host_since', 'instant_bookable', 'last_review',
       'latitude', 'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'property_type', 'review_scores_rating', 'room_type', 'thumbnail_url',
       'zipcode', 'y'],
      dtype='object')

In [6]:
df_train['bed_type'].value_counts()

Real Bed         53989
Futon              569
Pull-out Sofa      453
Airbed             364
Couch              208
Name: bed_type, dtype: int64

In [7]:
df_train['cancellation_policy'].value_counts()

strict             24318
flexible           16886
moderate           14288
super_strict_30       78
super_strict_60       13
Name: cancellation_policy, dtype: int64

In [8]:
df_train['city'].value_counts()

NYC        24326
LA         16828
SF          4768
DC          4259
Chicago     2807
Boston      2595
Name: city, dtype: int64

In [9]:
df_train['cleaning_fee'].value_counts()

t    40821
f    14762
Name: cleaning_fee, dtype: int64

In [10]:
df_train['first_review'].value_counts()

2017-01-01    214
2017-01-22    191
2016-01-02    174
2017-01-02    153
2016-01-03    142
             ... 
2010-01-13      1
2010-08-04      1
2009-05-10      1
2011-10-24      1
2013-01-25      1
Name: first_review, Length: 2446, dtype: int64

In [11]:
df_train['name'].value_counts()

Your home away from home                  6
East Village Studio                       6
Charming East Village Apartment           5
Studio in the heart of Manhattan          4
Bunk bed in the Treat Street Clubhouse    4
                                         ..
Camping in Nolita                         1
1 Bedroom in nice, clean LES apartment    1
Bright Modern top floor balcony apt       1
Lovely 2 bed home in Mid City LA          1
Large Brentwood bedroom                   1
Name: name, Length: 55152, dtype: int64

In [12]:
df_train['neighbourhood'].value_counts()

Williamsburg          2189
Bedford-Stuyvesant    1615
Bushwick              1208
Upper West Side       1075
Mid-Wilshire          1058
                      ... 
Castleton Corners        1
Watertown                1
Arboretum                1
Mill Basin               1
Shipley Terrace          1
Name: neighbourhood, Length: 607, dtype: int64

In [13]:
df_train['cleaning_fee'].value_counts()

t    40821
f    14762
Name: cleaning_fee, dtype: int64

In [14]:
df_train['property_type'].value_counts()

Apartment             36826
House                 12326
Condominium            1983
Townhouse              1249
Loft                    914
Other                   448
Guesthouse              392
Bed & Breakfast         349
Bungalow                281
Villa                   131
Dorm                    112
Guest suite              96
Camper/RV                71
In-law                   60
Timeshare                55
Boat                     53
Hostel                   51
Boutique hotel           50
Cabin                    47
Serviced apartment       16
Tent                     14
Vacation home            11
Castle                    9
Yurt                      8
Treehouse                 7
Hut                       7
Earth House               4
Chalet                    4
Tipi                      3
Train                     2
Island                    1
Parking Space             1
Cave                      1
Casa particular           1
Name: property_type, dtype: int64

In [15]:
df_train['host_response_rate'].value_counts()

100%    32510
90%      1666
80%       824
0%        660
50%       464
        ...  
6%          1
31%         1
15%         1
21%         1
39%         1
Name: host_response_rate, Length: 78, dtype: int64

In [16]:
df_train['room_type'].value_counts()

Entire home/apt    30939
Private room       23023
Shared room         1621
Name: room_type, dtype: int64

In [17]:
df_train['host_has_profile_pic'].value_counts()

t    55265
f      170
Name: host_has_profile_pic, dtype: int64

# 欠損値の確認

In [18]:
len(df_train)

55583

In [19]:
df_train.isnull().sum()

id                            0
accommodates                  0
amenities                     0
bathrooms                   147
bed_type                      0
bedrooms                     71
beds                         96
cancellation_policy           0
city                          0
cleaning_fee                  0
description                   0
first_review              11908
host_has_profile_pic        148
host_identity_verified      148
host_response_rate        13704
host_since                  148
instant_bookable              0
last_review               11880
latitude                      0
longitude                     0
name                          0
neighbourhood              5160
number_of_reviews             0
property_type                 0
review_scores_rating      12556
room_type                     0
thumbnail_url              6145
zipcode                     716
y                             0
dtype: int64

In [20]:
def merge_train_test(df_train, df_test):
    """Stack the train and test frames into one frame with a fresh 0..n-1 index.

    Test rows are tagged with the sentinel ``SalePrice == -100`` so that
    ``split_train_test`` can separate them again later. If ``df_train`` has
    no ``SalePrice`` column, its rows hold NaN there after the concat.

    Args:
        df_train: training rows.
        df_test: test rows; tagged with the sentinel when it has no
            ``SalePrice`` column yet.

    Returns:
        A new DataFrame with the train rows first, then the test rows.
        Neither input frame is modified.
    """
    if "SalePrice" not in df_test.columns:
        # assign() returns a new frame, so the caller's df_test is not
        # silently given an extra column as a side effect.
        df_test = df_test.assign(SalePrice=-100)
    return pd.concat([df_train, df_test], ignore_index=True)

def split_train_test(df):
    """Undo ``merge_train_test``: separate rows by the ``SalePrice`` sentinel.

    Rows whose ``SalePrice`` equals -100 are test rows; every other row
    (including rows where ``SalePrice`` is NaN) is a train row.

    Returns:
        (df_train, df_test), each re-indexed from 0.
    """
    is_train = df["SalePrice"] != -100
    train_part = df[is_train].reset_index(drop=True)
    test_part = df[~is_train].reset_index(drop=True)
    return train_part, test_part

df = merge_train_test(df_train, df_test)

# 特徴量エンジニアリング＋特徴量選択

### 〇　id は使わなくてよさそう
### 〇　accommodates はそのまま使用
### 〇　amenities は編集してonehotに
### 〇　bathrooms (欠損あり)　はそのまま使う
### 〇　bed_type　はonehotに
### 〇　beds (欠損あり)　はそのまま使う
### 〇　cancellation_policy　はonehot変換
### city は考え中
### 〇　cleaning_fee　はonehot変換
### description 説明文　厄介のため保留
### 〇　first_review 日付データ (大幅欠損)　これも保留
### 〇　host_has_profile_pic（写真ありなし） はbinary変換
### 〇　host_identity_verified （身元確認ありなし） はbinary変換
### 〇　host_response_rate（返信率）(大幅欠損)　保留
### 〇　host_since (いつホストが登録したか)　保留
### 〇　instant_bookable(すぐ予約できるか)　はbinary変換　←多分大事
### 〇　last_review（最後のレビュー日）(大幅欠損)　は保留
### 〇　latitude(緯度)　はそのまま使う　　←変換した方がいいかも
### 〇　longitude(経度)　はそのまま使う　　←変換した方がいいかも
### 〇　name(物件名) いらんくね
### △　neighbourhood（近所情報）　思ったより使えそうだけど数が多い
### 〇　number_of_reviews  そのまま使えそうだけどreviewの最初と最後から頻繁にレビューされているか使えそう
### 〇　property_type  （物件の種類）　onehot ←これも大事そう
### 〇　review_scores_rating （レビュー得点）　あんま効かないんじゃないかな
### 〇　room_type  （部屋の種類）　onehot ←これも大事そう
### 〇　thumbnail_url 多分使えなそう
### ✖　zipcode（郵便番号） とりあえず保留  

In [21]:
# Numeric feature columns. host_response_rate is object dtype ("100%") in the
# raw data and has to be converted to a number before it can be used as one.
use_num_columns=['accommodates', 'bathrooms','beds','host_response_rate','latitude', 'longitude','number_of_reviews','review_scores_rating']
# Categorical columns that are expanded into dummy columns with pd.get_dummies.
use_one_columns=['bed_type','cancellation_policy','cleaning_fee','property_type','room_type']
# Columns that are integer-coded with LabelEncoder.
# NOTE(review): despite the "bin" name, 'neighbourhood' and 'city' are
# multi-class rather than binary.
use_bin_columns=['host_has_profile_pic','host_identity_verified','instant_bookable','neighbourhood','city']

### host_response_rateの%とる

In [22]:
# Convert host_response_rate from text such as "95%" to a number; missing
# values stay NaN. The conversion is vectorised and safe to re-run: once the
# column is already numeric the '%' stripping step is skipped instead of
# failing on a non-string value.
rate_raw = df['host_response_rate']
if not pd.api.types.is_numeric_dtype(rate_raw):
    # Still in raw text form; .str leaves NaN entries as NaN.
    rate_raw = rate_raw.str.rstrip('%')
host_tmp = pd.to_numeric(rate_raw).tolist()

In [23]:
df['host_response_rate']=host_tmp

### amenityの特徴量制作

In [24]:
# Collect every distinct amenity token in first-seen order. The position of a
# token in ame_sum becomes the number in its 'ame N' column name, so the order
# must be stable. A dict gives O(1) membership checks while preserving
# insertion order, and iterating the column directly avoids a label-based
# lookup per row.
ame_seen = {}
for amenities_raw in df['amenities']:
    for token in amenities_raw.replace('{', '').replace('}', '').split(','):
        ame_seen.setdefault(token, None)
ame_sum = list(ame_seen)

In [25]:
# Parse each row's amenity string exactly once (the original re-parsed it for
# every amenity/row pair), then emit one 0/1 indicator column per amenity.
amenity_sets = [
    set(amenities_raw.replace('{', '').replace('}', '').split(','))
    for amenities_raw in df['amenities']
]
for num, amenity in enumerate(ame_sum):
    # Column 'ame <num>' is 1 where the row lists this amenity, else 0.
    df['ame '+str(num)] = [1 if amenity in row_set else 0 for row_set in amenity_sets]

### one-hot-encoding

In [26]:
df_one_hot_encoded = pd.get_dummies(df[use_one_columns])

In [27]:
categorical=list(df_one_hot_encoded.columns)

### label encoding

In [28]:
# Replace each column in use_bin_columns with integer codes.
# LabelEncoder is already imported in the first cell. A single encoder object
# is refitted per column, so after the loop `le` holds the classes of the
# last column processed.
le = LabelEncoder()
for column_name in use_bin_columns:
    column_values = df[column_name]
    # Learn the labels of this column, then map them to integers.
    df[column_name] = le.fit(column_values).transform(column_values)

In [29]:
categorical2=use_bin_columns
df[categorical2]

Unnamed: 0,host_has_profile_pic,host_identity_verified,instant_bookable,neighbourhood,city
0,1,0,0,619,3
1,1,1,0,74,2
2,1,0,1,84,4
3,1,1,1,390,5
4,1,1,0,556,4
...,...,...,...,...,...
74106,1,1,0,238,4
74107,1,0,0,299,1
74108,1,0,0,27,1
74109,1,0,0,165,4


### onehot とdf の結合

In [30]:
# Attach the dummy columns to df. Any dummy columns left over from a previous
# run of this cell are dropped first, so re-running it does not create
# duplicate column names (errors='ignore' makes the first run a no-op drop).
df = pd.concat(
    [df.drop(columns=df_one_hot_encoded.columns, errors='ignore'), df_one_hot_encoded],
    axis=1,
)

In [31]:
# Names of the amenity indicator columns: 'ame 0', 'ame 1', ...
ame_col = ['ame ' + str(idx) for idx in range(len(ame_sum))]

### 日付の差分抽出

In [32]:
# import datetime
# degree=[]
# for i in range(len(df)):
#     if type(df['last_review'][i])!=float and type(df['first_review'][i])!=float :
#         year,month,day=df['first_review'][i].split('-')
#         dt1 = datetime.datetime(int(year), int(month), int(day))
#         year2,month2,day2=df['last_review'][i].split('-')
#         dt2 = datetime.datetime(int(year2), int(month2), int(day2))

#         td = dt2 - dt1
#         degree.append(td.days)
#     else:
#         degree.append(0)

In [33]:
# df['degree']=degree
# use_num_columns.append('degree')



### 緯度や経度の平面変換
https://qiita.com/sw1227/items/e7a590994ad7dcd0e8ab

In [34]:
def calc_xy(phi_deg, lambda_deg, phi0_deg, lambda0_deg):
    """Convert latitude/longitude to plane rectangular coordinates.

    Accepts scalars as before and, additionally, arrays / lists / Series of
    equal (or broadcastable) shape for ``phi_deg`` and ``lambda_deg``; the
    outputs then have the broadcast shape.

    - input:
        (phi_deg, lambda_deg): latitude / longitude to convert, in decimal
            degrees (not degrees-minutes-seconds).
        (phi0_deg, lambda0_deg): latitude / longitude of the origin of the
            plane rectangular coordinate system, in decimal degrees. Scalars.
    - output:
        x: converted plane rectangular coordinate [m]
        y: converted plane rectangular coordinate [m]

    NOTE: xi' is computed with a plain arctan (not arctan2), so the quadrant
    is not resolved: results are only continuous while
    cos(lambda - lambda0) > 0, i.e. for points within 90 degrees of
    longitude of the origin.
    """
    # Convert the coordinates and the origin to radians. asarray lets
    # scalars, lists, ndarrays and pandas Series share one code path.
    phi_rad = np.deg2rad(np.asarray(phi_deg, dtype=float))
    lambda_rad = np.deg2rad(np.asarray(lambda_deg, dtype=float))
    phi0_rad = np.deg2rad(phi0_deg)
    lambda0_rad = np.deg2rad(lambda0_deg)

    # Helper: coefficients A_0..A_5 of the meridian arc length series.
    def meridian_coefficients(n):
        A0 = 1 + (n**2)/4. + (n**4)/64.
        A1 = -     (3./2)*( n - (n**3)/8. - (n**5)/64. )
        A2 =     (15./16)*( n**2 - (n**4)/4. )
        A3 = -   (35./48)*( n**3 - (5./16)*(n**5) )
        A4 =   (315./512)*( n**4 )
        A5 = -(693./1280)*( n**5 )
        return np.array([A0, A1, A2, A3, A4, A5])

    # Helper: coefficients alpha_1..alpha_5 (index 0 is an unused dummy).
    def alpha_coefficients(n):
        a0 = np.nan # dummy
        a1 = (1./2)*n - (2./3)*(n**2) + (5./16)*(n**3) + (41./180)*(n**4) - (127./288)*(n**5)
        a2 = (13./48)*(n**2) - (3./5)*(n**3) + (557./1440)*(n**4) + (281./630)*(n**5)
        a3 = (61./240)*(n**3) - (103./140)*(n**4) + (15061./26880)*(n**5)
        a4 = (49561./161280)*(n**4) - (179./168)*(n**5)
        a5 = (34729./80640)*(n**5)
        return np.array([a0, a1, a2, a3, a4, a5])

    # Constants (a, F: GRS80 ellipsoid); m0 is the scale factor.
    m0 = 0.9999
    a = 6378137.
    F = 298.257222101

    # (1) n, A_i, alpha_i. The results get their own names instead of
    # rebinding the helper function names.
    n = 1. / (2*F - 1)
    A_coef = meridian_coefficients(n)
    alpha_coef = alpha_coefficients(n)
    orders = np.arange(1, 6)  # series orders 1..5

    # (2) S, A
    A_ = ( (m0*a)/(1.+n) )*A_coef[0] # [m]
    S_ = ( (m0*a)/(1.+n) )*( A_coef[0]*phi0_rad + np.dot(A_coef[1:], np.sin(2*phi0_rad*orders)) ) # [m]

    # (3) lambda_c, lambda_s
    lambda_c = np.cos(lambda_rad - lambda0_rad)
    lambda_s = np.sin(lambda_rad - lambda0_rad)

    # (4) t, t_
    t = np.sinh( np.arctanh(np.sin(phi_rad)) - ((2*np.sqrt(n)) / (1+n))*np.arctanh(((2*np.sqrt(n)) / (1+n)) * np.sin(phi_rad)) )
    t_ = np.sqrt(1 + t*t)

    # (5) xi', eta'
    xi2  = np.arctan(t / lambda_c) # [rad]
    eta2 = np.arctanh(lambda_s / t_)

    # (6) x, y. A trailing axis of length 5 holds the series terms so the
    # sum runs per point (axis=-1) instead of over the whole input.
    xi_terms = 2*np.expand_dims(xi2, -1)*orders
    eta_terms = 2*np.expand_dims(eta2, -1)*orders
    x = A_ * (xi2 + np.sum(alpha_coef[1:] * (np.sin(xi_terms) * np.cosh(eta_terms)), axis=-1)) - S_ # [m]
    y = A_ * (eta2 + np.sum(alpha_coef[1:] * (np.cos(xi_terms) * np.sinh(eta_terms)), axis=-1)) # [m]
    return x, y # [m]

In [35]:
# Project every row's (latitude, longitude) with calc_xy, using (0, 0) as the
# origin, and keep the x and y results in two parallel lists.
projected = [
    calc_xy(df['latitude'][row], df['longitude'][row], 0, 0)
    for row in range(len(df))
]
xrange = [point[0] for point in projected]
yrange = [point[1] for point in projected]

In [36]:
# Overwrite the geographic columns with their projected plane coordinates.
df['latitude']=xrange
df['longitude']=yrange
# 'kyori' (Japanese for "distance") is the squared distance from the
# projection origin: x^2 + y^2.
df['kyori']=df['latitude']*df['latitude']+df['longitude']*df['longitude']
# Register the feature only once so re-running this cell does not add a
# duplicate name to the feature list.
if 'kyori' not in use_num_columns:
    use_num_columns.append('kyori')
df[['latitude','longitude','kyori']].head(10)

Unnamed: 0,latitude,longitude,kyori
0,-6087261.0,-5965946.0,72647250000000.0
1,8268435.0,-6324322.0,108364100000000.0
2,8015575.0,-5908035.0,99154310000000.0
3,-6143851.0,-5143114.0,64198520000000.0
4,8026880.0,-5897109.0,99206700000000.0
5,8022109.0,-5920610.0,99407850000000.0
6,8013223.0,-5910235.0,99142620000000.0
7,-6099232.0,-5914527.0,72182260000000.0
8,-6144404.0,-5142660.0,64200650000000.0
9,8013929.0,-5897267.0,99000810000000.0


### 日付変換
https://qiita.com/shimopino/items/4ef78aa589e43f315113

In [37]:
df['first_review']

0        2016-07-27
1        2016-09-12
2        2016-06-15
3        2014-03-15
4        2015-08-05
            ...    
74106           NaN
74107    2017-01-16
74108           NaN
74109    2016-04-15
74110    2015-08-26
Name: first_review, Length: 74111, dtype: object

In [38]:
def _ymd_parts(value):
    """Split a 'YYYY-MM-DD' string into (year, month, day) integers.

    A value whose type is float (how missing dates appear in these columns)
    yields three NaNs instead.
    """
    if type(value) == float:
        missing = float('nan')
        return missing, missing, missing
    pieces = value.split('-')
    return int(pieces[0]), int(pieces[1]), int(pieces[2])


# One (year, month, day) triple of lists per date column:
#   first_review -> year, month, day
#   last_review  -> year2, month2, day2
#   host_since   -> year3, month3, day3
year, month, day = [], [], []
year2, month2, day2 = [], [], []
year3, month3, day3 = [], [], []
date_targets = (
    ('first_review', (year, month, day)),
    ('last_review', (year2, month2, day2)),
    ('host_since', (year3, month3, day3)),
)
for i in range(len(df)):
    for source_col, buckets in date_targets:
        for bucket, part in zip(buckets, _ymd_parts(df[source_col][i])):
            bucket.append(part)

In [39]:
# Store the split date parts as new columns (insertion order is preserved).
date_part_columns = {
    'first-year': year,
    'first-month': month,
    'first-day': day,
    'last-year': year2,
    'last-month': month2,
    'last-day': day2,
    'host-year': year3,
    'host-month': month3,
    'host-day': day3,
}
for column_name, values in date_part_columns.items():
    df[column_name] = values

In [40]:
def encode(df, col, max_val=None):
    """Add cyclical cos/sin encodings of ``df[col]`` to ``df``.

    Two columns, ``col + '_cos'`` and ``col + '_sin'``, are written into
    ``df`` in place, and the same frame is returned.

    Args:
        df: frame holding the column to encode (modified in place).
        col: name of a numeric, periodic column (e.g. month or day).
        max_val: length of one full cycle (e.g. 12 for months). When None
            (the default) the largest value observed in ``df[col]`` is used.
            That is only correct if the data actually contains the end of
            the cycle, and it cannot be exact for cycles whose length varies
            (months have 28-31 days).

    Returns:
        ``df`` with the two new columns.
    """
    period = df[col].max() if max_val is None else max_val
    angle = 2 * np.pi * df[col] / period
    df[col + '_cos'] = np.cos(angle)
    df[col + '_sin'] = np.sin(angle)
    return df

In [41]:
# Add cos/sin encodings for the day and month parts of each date column.
for date_part in ('first-day', 'first-month',
                  'last-day', 'last-month',
                  'host-day', 'host-month'):
    df = encode(df, date_part)

In [42]:
new=['first-year','first-day_cos','first-day_sin','first-month_cos','first-month_sin','last-year','last-day_cos','last-day_sin','last-month_cos','last-month_sin','host-year','host-day_cos','host-day_sin','host-month_cos','host-month_sin']
df[new].head(10)

Unnamed: 0,first-year,first-day_cos,first-day_sin,first-month_cos,first-month_sin,last-year,last-day_cos,last-day_sin,last-month_cos,last-month_sin,host-year,host-day_cos,host-day_sin,host-month_cos,host-month_sin
0,2016.0,0.688967,-0.724793,-0.8660254,-0.5,2016.0,0.688967,-0.7247928,-0.8660254,-0.5,2016.0,-0.874347,0.485302,-0.8660254,-0.5
1,2016.0,-0.758758,0.651372,-1.83697e-16,-1.0,2017.0,1.0,-2.449294e-16,6.123234000000001e-17,1.0,2015.0,0.97953,-0.2012985,1.0,-2.449294e-16
2,2016.0,-0.994869,0.101168,-1.0,1.224647e-16,2017.0,-0.874347,0.485302,-0.5,-0.866025,2016.0,-0.440394,-0.8978045,-0.8660254,0.5
3,2014.0,-0.994869,0.101168,6.123234000000001e-17,1.0,2017.0,0.820763,0.5712682,-1.83697e-16,-1.0,2012.0,-0.758758,-0.6513725,-1.0,1.224647e-16
4,2015.0,0.528964,0.848644,-0.5,-0.8660254,2017.0,-0.440394,0.8978045,-1.83697e-16,-1.0,2015.0,0.347305,-0.9377521,6.123234000000001e-17,1.0
5,2017.0,-0.440394,0.897805,0.5,0.8660254,2017.0,-0.994869,0.1011683,-1.83697e-16,-1.0,2016.0,0.918958,-0.3943559,0.8660254,-0.5
6,2015.0,-0.994869,0.101168,-0.8660254,0.5,2017.0,0.528964,0.8486443,-1.83697e-16,-1.0,2014.0,0.918958,-0.3943559,-0.8660254,0.5
7,,,,,,,,,,,2010.0,1.0,-2.449294e-16,-0.5,-0.8660254
8,2017.0,0.347305,0.937752,-1.0,1.224647e-16,2017.0,-0.994869,-0.1011683,-1.83697e-16,-1.0,2016.0,-0.250653,0.9680771,-0.8660254,-0.5
9,2012.0,-0.050649,0.998717,1.0,-2.449294e-16,2016.0,-0.954139,-0.2993631,-1.83697e-16,-1.0,2011.0,-0.250653,0.9680771,0.5,-0.8660254


In [43]:
# Append the date-derived feature names to the numeric feature list. Names
# already present are skipped, so re-running this cell does not duplicate
# them (the original slice assignment appended unconditionally).
use_num_columns.extend([name for name in new if name not in use_num_columns])

### amenityの特徴量選択

In [44]:
# Keep only the amenity indicator columns that are set in more than 10% of
# the rows.
ame_new = [
    column_name
    for column_name in ame_col
    if sum(df[column_name]) / len(df) > 0.1
]
df[ame_new]

#ame_new=ame_col

Unnamed: 0,ame 0,ame 1,ame 2,ame 3,ame 4,ame 5,ame 6,ame 7,ame 8,ame 9,...,ame 24,ame 25,ame 26,ame 27,ame 32,ame 33,ame 35,ame 40,ame 43,ame 59
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,1,1,0,1,0,...,1,1,1,1,0,0,0,0,0,0
3,1,1,1,0,1,1,1,1,1,1,...,1,0,0,1,0,0,0,0,0,0
4,1,1,1,0,1,1,1,0,1,1,...,1,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74106,1,1,1,0,0,0,1,0,1,1,...,0,0,0,0,0,0,1,0,0,0
74107,1,1,1,0,1,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
74108,1,1,1,0,1,1,1,0,0,1,...,0,1,1,0,0,0,1,0,0,1
74109,0,1,1,0,1,1,1,0,1,1,...,0,0,0,0,0,0,1,0,0,0


# 再度EDA


In [138]:
# Raise the column display limit to the frame's width so no column is elided,
# then compute summary statistics for the cells that follow.
pd.options.display.max_columns = df.shape[1]
df2 = df.describe()

In [139]:
# Display the describe() summary table computed in the previous cell.
df2

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,city,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,latitude,longitude,neighbourhood,number_of_reviews,review_scores_rating,y,SalePrice,ame 0,ame 1,ame 2,ame 3,ame 4,ame 5,ame 6,ame 7,ame 8,ame 9,ame 10,ame 11,ame 12,ame 13,ame 14,ame 15,ame 16,ame 17,ame 18,ame 19,ame 20,ame 21,ame 22,ame 23,ame 24,ame 25,ame 26,ame 27,ame 28,ame 29,ame 30,ame 31,ame 32,ame 33,ame 34,ame 35,ame 36,ame 37,ame 38,ame 39,ame 40,ame 41,ame 42,ame 43,ame 44,ame 45,ame 46,ame 47,ame 48,ame 49,ame 50,ame 51,ame 52,ame 53,ame 54,ame 55,ame 56,ame 57,ame 58,ame 59,ame 60,ame 61,ame 62,ame 63,ame 64,ame 65,ame 66,ame 67,ame 68,ame 69,ame 70,ame 71,ame 72,ame 73,ame 74,ame 75,ame 76,ame 77,ame 78,ame 79,ame 80,ame 81,ame 82,ame 83,ame 84,ame 85,ame 86,ame 87,ame 88,ame 89,ame 90,ame 91,ame 92,ame 93,ame 94,ame 95,ame 96,ame 97,ame 98,ame 99,ame 100,ame 101,ame 102,ame 103,ame 104,ame 105,ame 106,ame 107,ame 108,ame 109,ame 110,ame 111,ame 112,ame 113,ame 114,ame 115,ame 116,ame 117,ame 118,ame 119,ame 120,ame 121,ame 122,ame 123,ame 124,ame 125,ame 126,ame 127,ame 128,ame 129,ame 130,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,cleaning_fee_f,cleaning_fee_t,property_type_Apartment,property_type_Bed & Breakfast,property_type_Boat,property_type_Boutique hotel,property_type_Bungalow,property_type_Cabin,property_type_Camper/RV,property_type_Casa particular,property_type_Castle,property_type_Cave,property_type_Chalet,property_type_Condominium,property_type_Dorm,property_type_Earth House,property_type_Guest 
suite,property_type_Guesthouse,property_type_Hostel,property_type_House,property_type_Hut,property_type_In-law,property_type_Island,property_type_Lighthouse,property_type_Loft,property_type_Other,property_type_Parking Space,property_type_Serviced apartment,property_type_Tent,property_type_Timeshare,property_type_Tipi,property_type_Townhouse,property_type_Train,property_type_Treehouse,property_type_Vacation home,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,kyori,first-year,first-month,first-day,last-year,last-month,last-day,host-year,host-month,host-day,first-day_cos,first-day_sin,first-month_cos,first-month_sin,last-day_cos,last-day_sin,last-month_cos,last-month_sin,host-day_cos,host-day_sin,host-month_cos,host-month_sin
count,74111.0,74111.0,73911.0,74020.0,73980.0,74111.0,74111.0,74111.0,55812.0,74111.0,74111.0,74111.0,74111.0,74111.0,57389.0,55583.0,18528.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,58247.0,58247.0,58247.0,58284.0,58284.0,58284.0,73923.0,73923.0,73923.0,58247.0,58247.0,58247.0,58247.0,58284.0,58284.0,58284.0,58284.0,73923.0,73923.0,73923.0,73923.0
mean,23159.062501,3.155146,1.235263,1.265793,1.710868,3.292629,0.999487,0.676337,94.351967,0.262458,2608396.0,-5868083.0,343.508386,20.900568,94.067365,160.163647,-100.0,0.706103,0.961598,0.911147,0.318967,0.582491,0.576311,0.832899,0.327252,0.602448,0.744964,0.905034,0.636748,0.371497,0.863637,0.667445,0.24265,0.256575,0.663505,0.562494,0.589697,0.275627,0.341258,0.125487,0.229831,0.155348,0.414567,0.584664,0.148979,0.040412,0.077424,0.050748,0.020186,0.145997,0.13129,0.048481,0.499602,0.008865,0.065415,0.084778,0.000418,0.101078,0.086586,0.085412,0.137591,0.057589,0.057576,0.056375,0.052786,0.047969,0.062744,0.057414,0.053406,0.053217,0.055093,0.098096,0.015193,0.059357,0.070907,0.049561,0.112075,0.034057,0.040831,0.006112,0.022547,0.016138,0.006018,0.008568,0.015166,0.008568,0.0051,0.010619,0.011172,0.005357,0.005357,0.001376,0.031196,0.000999,0.008298,0.004115,0.002334,0.006463,0.001187,0.008541,0.016462,0.015234,0.022129,0.002942,0.009135,0.001835,0.002928,0.006409,0.00757,0.01039,0.009351,0.000661,0.007003,0.003657,0.005478,0.004709,0.003522,0.00166,0.004075,0.007907,0.000918,0.001552,0.005964,0.000243,0.004034,0.002955,0.000189,0.000445,0.000364,0.001552,0.001795,0.003036,0.001039,0.000459,0.000337,0.000175,0.000459,0.000202,0.000148,1.3e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,2.7e-05,9.4e-05,4e-05,1.3e-05,0.006436,0.003616,0.01016,0.007894,0.971894,0.304206,0.257222,0.436831,0.001511,0.000229,0.265925,0.734075,0.661211,0.006234,0.000877,0.000931,0.004939,0.000972,0.001268,1.3e-05,0.000175,2.7e-05,8.1e-05,0.035865,0.001916,5.4e-05,0.00166,0.00672,0.000945,0.222787,0.000108,0.000958,1.3e-05,1.3e-05,0.016786,0.00819,1.3e-05,0.000283,0.000243,0.001039,4e-05,0.022831,2.7e-05,9.4e-05,0.000148,0.002415,0.000121,0.557407,0.413407,0.029186,89981590000000.0,2015.535513,6.548148,15.43319,2016.690636,6.61818,16.757687,2014.051107,6.54086,15.635621,-0.001874,0.00947,-0.1099776,-0.0755023,0.048111,-0.102078,-0.1736672,-0.144237,-0.02402,0.01551347,-0.06277813,-0.02296353
std,16266.832865,2.153589,0.582044,0.852143,1.254142,1.181851,0.07474,0.473267,16.341817,0.439973,6976555.0,267947.5,198.607368,37.828641,7.836556,168.08714,0.0,0.455548,0.192166,0.284534,0.46608,0.493152,0.494146,0.373068,0.469213,0.489395,0.435885,0.293169,0.48094,0.483208,0.343176,0.471132,0.428688,0.436746,0.472514,0.496082,0.491892,0.446833,0.474135,0.331273,0.420727,0.362239,0.492651,0.492783,0.35607,0.196926,0.267265,0.219485,0.140637,0.353106,0.337719,0.214782,0.500003,0.093737,0.247259,0.278553,0.020448,0.301434,0.28123,0.279496,0.344472,0.232967,0.232941,0.230646,0.223607,0.213701,0.242503,0.232633,0.224844,0.224468,0.228163,0.297447,0.122322,0.236293,0.256671,0.217037,0.315461,0.181377,0.197899,0.077943,0.148456,0.126007,0.077343,0.092168,0.122215,0.092168,0.071236,0.102502,0.105108,0.072995,0.072995,0.037073,0.173849,0.031584,0.090717,0.06402,0.048259,0.080135,0.034439,0.092024,0.127244,0.122483,0.147104,0.054156,0.09514,0.042799,0.054032,0.079802,0.086675,0.1014,0.096247,0.025705,0.083391,0.06036,0.073813,0.068462,0.05924,0.040706,0.063706,0.08857,0.030277,0.039362,0.076997,0.015583,0.06339,0.05428,0.013743,0.021097,0.019084,0.039362,0.042325,0.055016,0.032217,0.021414,0.018364,0.013243,0.021414,0.014225,0.012182,0.003673,0.003673,0.003673,0.003673,0.003673,0.005195,0.009718,0.006362,0.003673,0.079968,0.060026,0.100286,0.088495,0.165278,0.460073,0.437106,0.495997,0.038846,0.015144,0.441828,0.441828,0.473301,0.078709,0.029602,0.030499,0.070101,0.031154,0.035592,0.003673,0.013243,0.005195,0.008997,0.185955,0.043731,0.007346,0.040706,0.081698,0.030719,0.41612,0.010389,0.030937,0.003673,0.003673,0.128468,0.09013,0.003673,0.016831,0.015583,0.032217,0.006362,0.149364,0.005195,0.009718,0.012182,0.049087,0.011019,0.496697,0.492448,0.168329,17448320000000.0,1.393215,3.187603,8.953425,0.645022,2.850486,9.179894,1.838333,3.32294,8.785544,0.712475,0.701643,0.6859238,0.7153528,0.720556,0.684165,0.528593,0.818316,0.704399,0.7092375,0.7125185,0.6984715
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6218443.0,-6343591.0,0.0,0.0,20.0,1.0,-100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64004430000000.0,2008.0,1.0,1.0,2009.0,1.0,1.0,2008.0,1.0,1.0,-0.994869,-0.998717,-1.0,-1.0,-0.994869,-0.998717,-1.0,-1.0,-0.994869,-0.9987165,-1.0,-1.0
25%,9263.5,2.0,1.0,1.0,1.0,3.0,1.0,0.0,100.0,0.0,-6099448.0,-5919013.0,170.0,1.0,92.0,74.0,-100.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72207470000000.0,2015.0,4.0,8.0,2017.0,4.0,9.0,2013.0,4.0,8.0,-0.758758,-0.724793,-0.8660254,-0.8660254,-0.758758,-0.790776,-0.5,-1.0,-0.758758,-0.7247928,-0.8660254,-0.8660254
50%,18527.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0,100.0,0.0,8015792.0,-5908124.0,332.0,6.0,96.0,111.0,-100.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,99106810000000.0,2016.0,7.0,15.0,2017.0,7.0,17.0,2014.0,7.0,16.0,-0.050649,0.101168,-1.83697e-16,-2.449294e-16,0.151428,-0.201299,-1.83697e-16,-0.5,-0.050649,-2.449294e-16,-1.83697e-16,-2.449294e-16
75%,37054.5,4.0,1.0,1.0,2.0,4.0,1.0,1.0,100.0,1.0,8025096.0,-5897110.0,555.0,23.0,100.0,185.0,-100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,99263680000000.0,2017.0,9.0,23.0,2017.0,9.0,25.0,2015.0,9.0,23.0,0.688967,0.724793,0.5,0.5,0.820763,0.485302,-1.83697e-16,0.866025,0.688967,0.7247928,0.5,0.5
max,55582.0,16.0,8.0,10.0,18.0,5.0,2.0,2.0,100.0,1.0,9734661.0,-5136229.0,619.0,605.0,100.0,1999.0,-100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,132261400000000.0,2017.0,12.0,31.0,2017.0,12.0,31.0,2017.0,12.0,31.0,1.0,0.998717,1.0,1.0,1.0,0.998717,1.0,1.0,1.0,0.9987165,1.0,1.0


In [143]:
# For every column summarised in df2, report whether the absolute value of its
# mean exceeds 0.1. "True" lines print the mean itself; "False" lines print the
# column name.
for i in df2.columns:
    # Compute the mean once. The original called df[i].describe() twice per
    # column, which also computed quantiles, std, min and max only to read 'mean'.
    column_mean = df[i].mean()
    if abs(column_mean) > 0.1:
        print('True '+str(column_mean))
    else:
        print('False '+str(i))

True 23159.06250084333
True 3.1551456598885457
True 1.235262680791763
True 1.265793028911105
True 1.710867802108678
True 3.2926286246306216
True 0.9994872556030818
True 0.676336846082228
True 94.35196731885615
True 0.26245766485407024
True 2608395.804708948
True -5868083.353100022
True 343.5083860695443
True 20.900568066818693
True 94.06736482601195
True 160.1636471583038
True -100.0
True 0.7061030076506861
True 0.9615981433255522
True 0.9111467933235282
True 0.31896749470389013
True 0.582491128172606
True 0.5763112088623821
True 0.8328992996991
True 0.32725236469619895
True 0.602447679831604
True 0.7449636356276397
True 0.9050343403813199
True 0.6367475813307066
True 0.3714968088407929
True 0.8636369769669819
True 0.6674447787777793
True 0.24264953920470644
True 0.2565745975631148
True 0.6635047428856715
True 0.5624940966927986
True 0.5896965362766661
True 0.2756270998907045
True 0.34125838269622594
True 0.1254874445089123
True 0.22983092928175305
True 0.1553480589925922
True 0.414567

In [130]:
# Show every column of the combined frame: set the column display limit to the
# frame's width, then render the frame.
pd.options.display.max_columns = df.shape[1]
df

Unnamed: 0,id,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,description,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode,y,SalePrice,ame 0,ame 1,ame 2,ame 3,ame 4,ame 5,ame 6,ame 7,ame 8,ame 9,ame 10,ame 11,ame 12,ame 13,ame 14,ame 15,ame 16,ame 17,ame 18,ame 19,ame 20,ame 21,ame 22,ame 23,ame 24,ame 25,ame 26,ame 27,ame 28,ame 29,ame 30,ame 31,ame 32,ame 33,ame 34,ame 35,ame 36,ame 37,ame 38,ame 39,ame 40,ame 41,ame 42,ame 43,ame 44,ame 45,ame 46,ame 47,ame 48,ame 49,ame 50,ame 51,ame 52,ame 53,ame 54,ame 55,ame 56,ame 57,ame 58,ame 59,ame 60,ame 61,ame 62,ame 63,ame 64,ame 65,ame 66,ame 67,ame 68,ame 69,ame 70,ame 71,ame 72,ame 73,ame 74,ame 75,ame 76,ame 77,ame 78,ame 79,ame 80,ame 81,ame 82,ame 83,ame 84,ame 85,ame 86,ame 87,ame 88,ame 89,ame 90,ame 91,ame 92,ame 93,ame 94,ame 95,ame 96,ame 97,ame 98,ame 99,ame 100,ame 101,ame 102,ame 103,ame 104,ame 105,ame 106,ame 107,ame 108,ame 109,ame 110,ame 111,ame 112,ame 113,ame 114,ame 115,ame 116,ame 117,ame 118,ame 119,ame 120,ame 121,ame 122,ame 123,ame 124,ame 125,ame 126,ame 127,ame 128,ame 129,ame 130,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,cleaning_fee_f,cleaning_fee_t,property_type_Apartment,property_type_Bed & Breakfast,property_type_Boat,property_type_Boutique hotel,property_type_Bungalow,property_type_Cabin,property_type_Camper/RV,property_type_Casa particular,property_type_Castle,property_type_Cave,property_type_Chalet,property_type_Condominium,property_type_Dorm,property_type_Earth House,property_type_Guest 
suite,property_type_Guesthouse,property_type_Hostel,property_type_House,property_type_Hut,property_type_In-law,property_type_Island,property_type_Lighthouse,property_type_Loft,property_type_Other,property_type_Parking Space,property_type_Serviced apartment,property_type_Tent,property_type_Timeshare,property_type_Tipi,property_type_Townhouse,property_type_Train,property_type_Treehouse,property_type_Vacation home,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,kyori,first-year,first-month,first-day,last-year,last-month,last-day,host-year,host-month,host-day,first-day_cos,first-day_sin,first-month_cos,first-month_sin,last-day_cos,last-day_sin,last-month_cos,last-month_sin,host-day_cos,host-day_sin,host-month_cos,host-month_sin
0,0,6,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",2.0,Real Bed,1.0,4.0,flexible,3,t,My place is meant for family and a few friends...,2016-07-27,1,0,,2016-07-13,0,2016-07-27,-6.087261e+06,-5.965946e+06,The Penthouse,619,1,Apartment,60.0,Private room,,90804,138.0,,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,7.264725e+13,2016.0,7.0,27.0,2016.0,7.0,27.0,2016.0,7.0,13.0,0.688967,-0.724793,-8.660254e-01,-5.000000e-01,0.688967,-7.247928e-01,-8.660254e-01,-0.500000,-0.874347,0.485302,-8.660254e-01,-5.000000e-01
1,1,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,2,t,This is a new listing for a lovely guest bedro...,2016-09-12,1,1,100.0,2015-12-30,0,2017-03-31,8.268435e+06,-6.324322e+06,Guest Bedroom in Brookland,74,9,House,100.0,Private room,https://a0.muscache.com/im/pictures/e4d8b51f-6...,20018,42.0,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.083641e+14,2016.0,9.0,12.0,2017.0,3.0,31.0,2015.0,12.0,30.0,-0.758758,0.651372,-1.836970e-16,-1.000000e+00,1.000000,-2.449294e-16,6.123234e-17,1.000000,0.979530,-0.201299,1.000000e+00,-2.449294e-16
2,2,2,"{TV,Internet,""Wireless Internet"",Kitchen,""Indo...",2.0,Real Bed,1.0,1.0,strict,4,t,We're looking forward to your stay at our apt....,2016-06-15,1,0,100.0,2016-05-21,1,2017-08-13,8.015575e+06,-5.908035e+06,Clean Modern Room in Lux Apt 1 Block From J Train,84,27,Apartment,83.0,Private room,https://a0.muscache.com/im/pictures/5ffecc9b-d...,,65.0,,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,9.915431e+13,2016.0,6.0,15.0,2017.0,8.0,13.0,2016.0,5.0,21.0,-0.994869,0.101168,-1.000000e+00,1.224647e-16,-0.874347,4.853020e-01,-5.000000e-01,-0.866025,-0.440394,-0.897805,-8.660254e-01,5.000000e-01
3,3,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,5,t,BEST CITY VIEWS - - ROOF DECK W/ BBQ & WiFi - ...,2014-03-15,1,1,100.0,2012-06-19,1,2017-09-03,-6.143851e+06,-5.143114e+06,BEST views + reviews! 5/5 stars*****,390,38,Apartment,95.0,Private room,,94133,166.0,,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,6.419852e+13,2014.0,3.0,15.0,2017.0,9.0,3.0,2012.0,6.0,19.0,-0.994869,0.101168,6.123234e-17,1.000000e+00,0.820763,5.712682e-01,-1.836970e-16,-1.000000,-0.758758,-0.651372,-1.000000e+00,1.224647e-16
4,4,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,1.0,1.0,strict,4,t,Charming Apartment on the upper west side of M...,2015-08-05,1,1,100.0,2015-03-25,0,2017-09-10,8.026880e+06,-5.897109e+06,Charming 1-bedroom - UWS Manhattan,556,5,Apartment,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/92879730/5...,10024,165.0,,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,9.920670e+13,2015.0,8.0,5.0,2017.0,9.0,10.0,2015.0,3.0,25.0,0.528964,0.848644,-5.000000e-01,-8.660254e-01,-0.440394,8.978045e-01,-1.836970e-16,-1.000000,0.347305,-0.937752,6.123234e-17,1.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74106,18523,4,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,2.0,2.0,strict,4,t,"The Greenhouse, located on Green Street, is a ...",,1,1,100.0,2009-11-16,0,,8.021372e+06,-5.903611e+06,Spacious 2BR Greenpoint Getaway,238,0,Apartment,,Entire home/apt,https://a0.muscache.com/im/pictures/57338613/6...,11222,,-100.0,1,1,1,0,0,0,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,9.919504e+13,,,,,,,2009.0,11.0,16.0,,,,,,,,,-0.994869,-0.101168,8.660254e-01,-5.000000e-01
74107,18524,2,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.0,Real Bed,2.0,1.0,flexible,1,f,"Two bedroom, one bathroom with large dining/li...",2017-01-16,1,0,100.0,2017-01-08,0,2017-04-11,9.713132e+06,-6.110296e+06,Walk up Apartment in Lakeview/Wrigleyville,299,9,Apartment,90.0,Entire home/apt,,60657,,-100.0,1,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.316806e+14,2017.0,1.0,16.0,2017.0,4.0,11.0,2017.0,1.0,8.0,-0.994869,-0.101168,8.660254e-01,5.000000e-01,-0.612106,7.907757e-01,-5.000000e-01,0.866025,-0.050649,0.998717,8.660254e-01,5.000000e-01
74108,18525,5,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.5,Real Bed,2.0,2.0,flexible,1,t,Happy Holidays! If you're looking for a big op...,,1,0,100.0,2014-09-02,0,,9.717457e+06,-6.112683e+06,Beautiful Logan Square Home,27,0,House,,Entire home/apt,https://a0.muscache.com/im/pictures/361642af-e...,60618,,-100.0,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.317939e+14,,,,,,,2014.0,9.0,2.0,,,,,,,,,0.918958,0.394356,-1.836970e-16,-1.000000e+00
74109,18526,2,"{Internet,""Wireless Internet"",""Air conditionin...",1.0,Real Bed,1.0,2.0,strict,4,t,This is a cozy one-bedroom apartment a few blo...,2016-04-15,1,0,100.0,2014-03-17,0,2017-05-08,8.024041e+06,-5.895221e+06,Charming 1 BR apartment east of Central Park,165,4,Apartment,95.0,Entire home/apt,https://a0.muscache.com/im/pictures/49c8e83f-d...,10029.0,,-100.0,0,1,1,0,1,1,1,0,1,1,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,9.913887e+13,2016.0,4.0,15.0,2017.0,5.0,8.0,2014.0,3.0,17.0,-0.994869,0.101168,-5.000000e-01,8.660254e-01,-0.050649,9.987165e-01,-8.660254e-01,0.500000,-0.954139,-0.299363,6.123234e-17,1.000000e+00


# モデルの制作

In [144]:
# Regression target: the "y" column of the training frame.
# NOTE(review): df_train is rebound by split_train_test(df) in the next cell, so
# `target` is taken from whichever df_train exists when this cell runs; the CV
# loop needs it to be row-aligned with the df_train used there. Confirm on re-run.
target = df_train["y"]

In [145]:
# `cols` is bound to the same list object as use_num_columns (no copy is made),
# so later in-place changes to either name are visible through both.
cols=use_num_columns
# Rebind df_train / df_test from the combined, feature-engineered frame `df`.
# NOTE(review): split_train_test is defined outside this section; presumably it
# separates the training rows from the test rows. Verify against its definition.
df_train, df_test = split_train_test(df)

In [197]:
# Training hyper-parameters shared by the LightGBM runs below.
SEED = 42
learning_rate = 0.05

lgb_params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=learning_rate,
    num_leaves=8,
    min_data_in_leaf=40,
    # Sampling options intentionally left unset (disabled in the original cell):
    #   colsample_bytree=1.0, feature_fraction=1.0,
    #   bagging_freq=0, bagging_fraction=1.0
    verbosity=0,
    seed=SEED,
)

In [198]:
from sklearn.model_selection import KFold, LeaveOneOut, StratifiedKFold

# --- cross-validation configuration -----------------------------------------
FOLD = 10
SEED_SKF = SEED
np.random.seed(SEED)

# Two shuffled splitters are built from the same fold count: a stratified one
# (skf) and a plain one (kf).
skf = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=SEED_SKF)
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)

# --- accumulators filled by the training loop --------------------------------
oof = np.zeros(df_train.shape[0])          # one slot per training row
predictions = np.zeros(df_test.shape[0])   # one slot per test row
df_feature_importance = pd.DataFrame()

# --- feature lists -------------------------------------------------------------
# features_numerical is the same list object as `cols` (no copy is made).
features_numerical = cols
features_categorical = categorical + categorical2 + ame_new
features = features_numerical + features_categorical

N_CLASSES = 1

In [199]:
# Print the number of numerical features, then the number of categorical
# features, one per line.
print(len(features_numerical), len(features_categorical), sep="\n")

24
89


In [200]:
# NOTE: everything in this cell is disabled (commented-out) code. If enabled, it
# would keep only the features whose absolute correlation with `target` exceeds
# `rate`, and rebuild features_numerical / features_categorical / features from
# the survivors.
# colname= features_numerical 
# tmp=[]
# rate=0.005

# for x in list(colname):
#     res=target.corr(df[x])
#     if abs(res)>rate:
#         tmp.append(x)
# features_numerical = tmp
# colname= features_categorical

# tmp2=[]
# for x in list(colname):
#     res=target.corr(df[x])
#     if abs(res)>rate:
#         tmp2.append(x)
# features_categorical = tmp2

# features = tmp + tmp2
# print(len(features_numerical))
# print(len(features_categorical))

In [201]:
# K-fold cross-validated LightGBM training.
# For each fold: fit on the training split with early stopping monitored on the
# held-out split, write the held-out predictions into `oof`, add 1/FOLD of the
# test-set prediction to `predictions`, and record the model's feature importances.

# Start from clean accumulators so that re-running this cell does not stack a
# second round of test predictions and importances on top of an earlier run.
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
fold_importances = []

# The splitter only needs the number of rows, so the frame is passed directly
# instead of materialising df_train.values.
for fold_, (indexes_trn, indexes_val) in enumerate(kf.split(df_train, target)):
    print(f"------------------------------ fold {fold_} ------------------------------")

    # KFold.split yields positional row numbers, so rows are selected with
    # .iloc. Label-based .loc is only equivalent when the index is exactly
    # 0..n-1 and silently picks wrong (or missing) rows otherwise.
    df_trn = df_train.iloc[indexes_trn].reset_index(drop=True)
    df_val = df_train.iloc[indexes_val].reset_index(drop=True)
    target_trn = target.iloc[indexes_trn].reset_index(drop=True)
    target_val = target.iloc[indexes_val].reset_index(drop=True)

    lgb_train = lgb.Dataset(
        df_trn.loc[:, features],
        label=target_trn,
        feature_name=features,
        categorical_feature=features_categorical
    )
    lgb_valid = lgb.Dataset(
        df_val.loc[:, features],
        label=target_val,
        feature_name=features,
        categorical_feature=features_categorical
    )

    # Per-fold jitter of up to 0.001 on the learning rate (marked as a
    # "good-luck charm" by the original author). This mutates the shared
    # lgb_params dict.
    lgb_params["learning_rate"] = learning_rate + np.random.random() * 0.001
    # Effectively unbounded; training is ended by early stopping.
    num_round = 999999999
    model = lgb.train(
        lgb_params,
        lgb_train,
        num_round,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=300,
        early_stopping_rounds=300 if num_round >= 1e8 else None,
        fobj=None,
        #feval=lgb_metric,
    )

    # Out-of-fold prediction. Predicts with 150 iterations beyond the
    # early-stopping optimum (also marked as a "charm" by the original author).
    prediction_round = model.best_iteration+150 if num_round >= 1e8 else num_round
    oof[indexes_val] = model.predict(df_val[features], num_iteration=prediction_round)

    # Feature importance for this fold. df_fold_importance keeps its name
    # because a later cell displays the last fold's table.
    df_fold_importance = pd.DataFrame()
    df_fold_importance["feature"] = features
    df_fold_importance["importance"] = model.feature_importance()
    df_fold_importance["fold"] = fold_
    fold_importances.append(df_fold_importance)

    # Test-set prediction, averaged over the folds.
    predictions += model.predict(df_test[features], num_iteration=prediction_round) / FOLD
    print()

# Concatenate the per-fold tables once instead of growing a frame inside the loop.
df_feature_importance = pd.concat(fold_importances, axis=0)

------------------------------ fold 0 ------------------------------




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds




[300]	training's rmse: 99.1664	valid_1's rmse: 108.405
[600]	training's rmse: 93.0383	valid_1's rmse: 106.958
[900]	training's rmse: 89.1754	valid_1's rmse: 106.011
[1200]	training's rmse: 85.7571	valid_1's rmse: 105.712
[1500]	training's rmse: 83.1033	valid_1's rmse: 105.502
Early stopping, best iteration is:
[1410]	training's rmse: 83.857	valid_1's rmse: 105.438

------------------------------ fold 1 ------------------------------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds
[300]	training's rmse: 99.3311	valid_1's rmse: 104.884
[600]	training's rmse: 93.0432	valid_1's rmse: 104.605
[900]	training's rmse: 88.9048	valid_1's rmse: 104.722
Early stopping, best iteration is:
[791]	training's rmse: 90.3715	valid_1's rmse: 104.469

------------------------------ fold 2 ------------------------------
You can set `force_row_wise=true` to remove the overhe




In [202]:
# Allow up to 500 rows so the importance table is not truncated.
pd.options.display.max_rows = 500
# df_fold_importance is re-created on every iteration of the training loop, so
# this shows the table of the last fold processed only.
df_fold_importance

Unnamed: 0,feature,importance,fold
0,accommodates,898,9
1,bathrooms,662,9
2,beds,299,9
3,host_response_rate,303,9
4,latitude,167,9
5,longitude,361,9
6,number_of_reviews,229,9
7,review_scores_rating,250,9
8,kyori,259,9
9,first-year,109,9


In [203]:
from sklearn.metrics import mean_squared_error

# Store the fold-averaged test predictions of the first run next to the test rows.
df_test["prediction"] = predictions.tolist()

# Out-of-fold RMSE over the whole training set: every row of `oof` was predicted
# by the model of the fold in which that row was held out.
oof_mse = mean_squared_error(target, oof)
score = np.sqrt(oof_mse)

print("CV score")
print(f"  rmse: {score:8.5f}")

CV score
  rmse: 106.32447


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.tolist()


In [204]:
# K-fold LightGBM run with SEED=58.
# Produces:
#   oof2                    - out-of-fold predictions for df_train
#   predictions2            - test predictions averaged over the FOLD models
#   df_feature_importance2  - per-fold feature importances of THIS run
SEED=58
SEED_SKF = SEED
np.random.seed(SEED)

FOLD = 10
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)

oof2 = np.zeros(len(df_train))
predictions2 = np.zeros(len(df_test))
# Per-fold importance frames are collected here and concatenated once after the
# loop. The previous code concatenated `df_feature_importance` (the first run's
# frame) on every fold, so this run's importances were never accumulated.
fold_importances2 = []

for fold_, (indexes_trn, indexes_val) in enumerate(kf.split(df_train.values, target.values)):
    print(f"------------------------------ fold {fold_} ------------------------------")

    # NOTE(review): kf.split yields positional indices while .loc is label based;
    # this assumes df_train/target carry a default 0..n-1 index - confirm.
    df_trn = df_train.loc[indexes_trn].reset_index(drop=True)
    df_val = df_train.loc[indexes_val].reset_index(drop=True)
    target_trn = target.loc[indexes_trn].reset_index(drop=True)
    target_val = target.loc[indexes_val].reset_index(drop=True)

    lgb_train = lgb.Dataset(
        df_trn.loc[:, features],
        label=target_trn,
        feature_name=features,
        categorical_feature=features_categorical
    )
    lgb_valid = lgb.Dataset(
        df_val.loc[:, features],
        label=target_val,
        feature_name=features,
        categorical_feature=features_categorical
    )

    # Small random jitter on the learning rate per fold (a "good-luck charm" heuristic).
    lgb_params["learning_rate"] = learning_rate + np.random.random() * 0.001
    # Effectively unbounded number of rounds; training is ended by early stopping.
    num_round = 999999999
    model = lgb.train(
        lgb_params,
        lgb_train,
        num_round,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=300,
        early_stopping_rounds=300 if num_round >= 1e8 else None,
        fobj=None,
        #feval=lgb_metric,
    )

    # cv: predict with 150 iterations more than the early-stopping best iteration
    # (heuristic kept from the original author).
    prediction_round = model.best_iteration+150 if num_round >= 1e8 else num_round
    oof2[indexes_val] = model.predict(df_val[features], num_iteration=prediction_round)

    # feature importance of this fold
    df_fold_importance = pd.DataFrame()
    df_fold_importance["feature"] = features
    df_fold_importance["importance"] = model.feature_importance()
    df_fold_importance["fold"] = fold_
    fold_importances2.append(df_fold_importance)

    # prediction for test data: each fold model contributes 1/FOLD of the average
    predictions2 += model.predict(df_test[features], num_iteration=prediction_round) / FOLD
    print()

# Stack the per-fold frames of this run (row labels repeat per fold, as before).
df_feature_importance2 = pd.concat(fold_importances2, axis=0)
    

------------------------------ fold 0 ------------------------------




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds




[300]	training's rmse: 98.8945	valid_1's rmse: 113.531
[600]	training's rmse: 92.7246	valid_1's rmse: 112.666
[900]	training's rmse: 88.5289	valid_1's rmse: 112.315
[1200]	training's rmse: 85.3414	valid_1's rmse: 112.406
Early stopping, best iteration is:
[946]	training's rmse: 88.0024	valid_1's rmse: 112.245

------------------------------ fold 1 ------------------------------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds
[300]	training's rmse: 99.3309	valid_1's rmse: 107.093
[600]	training's rmse: 93.2367	valid_1's rmse: 105.996
[900]	training's rmse: 89.3412	valid_1's rmse: 105.479
[1200]	training's rmse: 86.0425	valid_1's rmse: 105.111
[1500]	training's rmse: 83.3715	valid_1's rmse: 104.954
[1800]	training's rmse: 80.9392	valid_1's rmse: 104.791
[2100]	training's rmse: 78.7345	valid_1's rmse: 104.808
Early stopping, best iteration is:
[1988]	trai

[1200]	training's rmse: 86.1337	valid_1's rmse: 102.632
[1500]	training's rmse: 83.4431	valid_1's rmse: 102.601
Early stopping, best iteration is:
[1453]	training's rmse: 83.8297	valid_1's rmse: 102.528



In [205]:
# K-fold LightGBM run with SEED=94.
# Produces:
#   oof3                    - out-of-fold predictions for df_train
#   predictions3            - test predictions averaged over the FOLD models
#   df_feature_importance3  - per-fold feature importances of THIS run
SEED=94
SEED_SKF = SEED
np.random.seed(SEED)

FOLD = 10
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)

oof3 = np.zeros(len(df_train))
predictions3 = np.zeros(len(df_test))
# Per-fold importance frames are collected here and concatenated once after the
# loop. The previous code concatenated `df_feature_importance` (the first run's
# frame) on every fold, so this run's importances were never accumulated.
fold_importances3 = []

for fold_, (indexes_trn, indexes_val) in enumerate(kf.split(df_train.values, target.values)):
    print(f"------------------------------ fold {fold_} ------------------------------")

    # NOTE(review): kf.split yields positional indices while .loc is label based;
    # this assumes df_train/target carry a default 0..n-1 index - confirm.
    df_trn = df_train.loc[indexes_trn].reset_index(drop=True)
    df_val = df_train.loc[indexes_val].reset_index(drop=True)
    target_trn = target.loc[indexes_trn].reset_index(drop=True)
    target_val = target.loc[indexes_val].reset_index(drop=True)

    lgb_train = lgb.Dataset(
        df_trn.loc[:, features],
        label=target_trn,
        feature_name=features,
        categorical_feature=features_categorical
    )
    lgb_valid = lgb.Dataset(
        df_val.loc[:, features],
        label=target_val,
        feature_name=features,
        categorical_feature=features_categorical
    )

    # Small random jitter on the learning rate per fold (a "good-luck charm" heuristic).
    lgb_params["learning_rate"] = learning_rate + np.random.random() * 0.001
    # Effectively unbounded number of rounds; training is ended by early stopping.
    num_round = 999999999
    model = lgb.train(
        lgb_params,
        lgb_train,
        num_round,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=300,
        early_stopping_rounds=300 if num_round >= 1e8 else None,
        fobj=None,
        #feval=lgb_metric,
    )

    # cv: predict with 150 iterations more than the early-stopping best iteration
    # (heuristic kept from the original author).
    prediction_round = model.best_iteration+150 if num_round >= 1e8 else num_round
    oof3[indexes_val] = model.predict(df_val[features], num_iteration=prediction_round)

    # feature importance of this fold
    df_fold_importance = pd.DataFrame()
    df_fold_importance["feature"] = features
    df_fold_importance["importance"] = model.feature_importance()
    df_fold_importance["fold"] = fold_
    fold_importances3.append(df_fold_importance)

    # prediction for test data: each fold model contributes 1/FOLD of the average
    predictions3 += model.predict(df_test[features], num_iteration=prediction_round) / FOLD
    print()

# Stack the per-fold frames of this run (row labels repeat per fold, as before).
df_feature_importance3 = pd.concat(fold_importances3, axis=0)
    

------------------------------ fold 0 ------------------------------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds
[300]	training's rmse: 99.2737	valid_1's rmse: 106.972
[600]	training's rmse: 93.151	valid_1's rmse: 106.589
Early stopping, best iteration is:
[462]	training's rmse: 95.4856	valid_1's rmse: 106.449

------------------------------ fold 1 ------------------------------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds
[300]	training's rmse: 99.2092	valid_1's rmse: 108.054
[600]	training's rmse: 93.455	valid_1's rmse: 107.043
[900]	training's rmse: 89.2876	valid_1's rmse: 106.66
[1200]	training's rmse: 86.0788	valid_1's rmse: 106.492
Early stopping, best iteration is:
[1177]	training's rmse: 86.3477	valid_1'

[300]	training's rmse: 98.7285	valid_1's rmse: 111.23
[600]	training's rmse: 92.8217	valid_1's rmse: 109.103
[900]	training's rmse: 88.8675	valid_1's rmse: 108.43
[1200]	training's rmse: 85.8554	valid_1's rmse: 107.978
[1500]	training's rmse: 83.2568	valid_1's rmse: 107.805
Early stopping, best iteration is:
[1496]	training's rmse: 83.291	valid_1's rmse: 107.793



In [206]:
# Average the test predictions of the three seeded runs.
#
# "prediction" is both an input (first run) and the output (ensemble mean) of this
# cell. It is reset from the first run's `predictions` array here so that
# re-running the cell does not average an already averaged column.
df_test["prediction"] = predictions.tolist()
df_test["prediction2"] = predictions2.tolist()
df_test["prediction3"] = predictions3.tolist()

# Show each run's predictions for a quick visual comparison.
print()
print(df_test["prediction"])
print()
print(df_test["prediction2"])
print()
print(df_test["prediction3"])
print()

# Equal-weight mean of the three runs; overwrites "prediction" with the ensemble.
df_test["prediction"]=(df_test["prediction2"]+df_test["prediction"]+df_test["prediction3"])/3
print(df_test["prediction"])


0        249.873433
1        146.010588
2        110.316994
3        173.766665
4        184.256868
            ...    
18523    178.674653
18524    109.543500
18525    192.262941
18526    120.586024
18527     37.564720
Name: prediction, Length: 18528, dtype: float64

0        249.487840
1        148.053483
2        110.204246
3        171.281735
4        188.365312
            ...    
18523    178.401925
18524    108.847431
18525    190.526816
18526    121.243330
18527     36.450599
Name: prediction2, Length: 18528, dtype: float64

0        253.731342
1        145.633189
2        109.165148
3        175.036207
4        187.031162
            ...    
18523    185.125447
18524    112.038326
18525    190.311536
18526    121.310496
18527     36.942786
Name: prediction3, Length: 18528, dtype: float64

0        251.030871
1        146.565753
2        109.895463
3        173.361536
4        186.551114
            ...    
18523    180.734009
18524    110.143085
18525    191.033764
18526    1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction2"] = predictions2.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction3"] = predictions3.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"]=(df_test["prediction2"]+df_test["prediction"]+df_test["prediction3"])/3


In [207]:
# Start from the sample submission so the row order and columns match its layout.
df_submission = df_sample_sub.copy()

# Fill the target column by looking up each submission row's id in the test predictions.
id_to_prediction = dict(df_test[["id", "prediction"]].values)
df_submission["SalePrice"] = df_submission["index"].map(id_to_prediction)

# Every submission row must have received a prediction.
assert df_submission["SalePrice"].notna().all()
display(df_submission.head(10))

# make submission file (no header row, no index column)
submission_name = f"fn_selection_fold{FOLD}_SEED_{SEED}_learning rate_{learning_rate}.csv"
df_submission.to_csv(submission_name, header=None, index=False)

Unnamed: 0,index,SalePrice
0,0,251.030871
1,1,146.565753
2,2,109.895463
3,3,173.361536
4,4,186.551114
5,5,115.265932
6,6,703.25725
7,7,66.177108
8,8,79.016196
9,9,856.730433
