In [1]:
# coding:utf-8
import pandas as pd
import numpy as np
import os
import re
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
from sklearn.decomposition import TruncatedSVD
import catboost as ctb
import gc
# Shared configuration: SVD sizes, reusable transformers and pandas display limits.
SEED = 42  # fixed so the randomized SVD features are reproducible between runs
n_compo = 10
tsvd = TruncatedSVD(n_components=n_compo,n_iter=45,random_state=SEED)
n_compo2 = 5
tsvd2 = TruncatedSVD(n_components=n_compo2,n_iter=45,random_state=SEED)
lbl = LabelEncoder()
# Use the fully-qualified option names: the short forms 'max_column'/'max_row' also match
# the styler.render.* options on newer pandas and raise OptionError there.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',500)

In [2]:
# Directory that holds the input CSV files (relative to the working directory).
input_dir = "../input/"

In [3]:
# Show which files are available in the input directory.
os.listdir(input_dir)

['sample_submit.csv', 'test.csv', 'train.csv']

In [4]:
# Load the sample submission and the train/test tables, then apply manual row fixes.
sub = pd.read_csv(input_dir+"sample_submit.csv")
train_data = pd.read_csv(input_dir+"train.csv")
test_data = pd.read_csv(input_dir+"test.csv")

# Two training rows are excluded by id.
dropped_ids = [5776, 7492]
train_data = train_data[~train_data['id'].isin(dropped_ids)].reset_index(drop = True)

# Hand-patched cells: one training area and one test layout.
train_data.loc[train_data['id'] == 20927,'面積'] = '43.01m2'
test_data.loc[test_data['id'] == 39954,'間取り'] = '1R'

# Accumulators filled in by later cells.
cate_cols = []
useless_columns = []

In [5]:
# 一些先验的数据清洗
# train_data.loc[train_data['id'] == 7492,'面積'] = '51.83m2'
# train_data.loc[train_data['id'] == 5776,'賃料'] = 128000
# train_data.loc[train_data['id'] == 20927,'面積'] = '43.01m2'

In [6]:
# Stack train and test rows so every feature transformation is applied to both at once.
dataset = pd.concat([train_data,test_data],axis = 0).reset_index(drop = True)

In [7]:
# Ward feature: the address text up to and including the first '区'.
dataset['区'] = dataset['所在地'].apply(lambda x:x.split('区')[0]+'区')
cate_cols += ['区']

In [8]:
# Town
def town(x):
    """Return the alphabetic characters found between the first '区' and the first '丁目'.

    The text after the first '区' is cut at '丁目' (when present) and every character
    that is not alphabetic (digits, hyphens, ...) is discarded.
    Raises IndexError when the address contains no '区'.
    """
    after_ward = x.split('区')[1]
    before_chome = after_ward.split('丁目')[0]
    return "".join(ch for ch in before_chome if ch.isalpha())
# Town name per listing; registered as a categorical column.
dataset['城镇'] = dataset['所在地'].apply(town)
cate_cols += ['城镇']

In [9]:
# 无用
# dataset['丁目'] = dataset['所在地'].apply(lambda x:x.split('区')[1].split('丁目')[0]+'丁目')
# cate_cols += ['丁目']

In [10]:
# 无用
# dataset['番-号'] = dataset['所在地'].apply(lambda x:np.nan if x[-1]=='目' else x.split('丁目')[-1])
# cate_cols += ['番-号']

In [11]:
# Number of distinct ward values.
dataset['区'].nunique()

23

In [12]:
# Number of access entries per listing (entries in アクセス are separated by a double tab).
dataset['最近路线-count'] = dataset['アクセス'].apply(lambda x:len(x.split('\t\t')))

In [13]:
# Rail lines: bag-of-words over the first field of every access entry, compressed with SVD.
dataset['最近地铁线路'] = dataset['アクセス'].apply(lambda x:[i.split('\t')[0] if '\t' in i else 'nan' for i in x.split('\t\t')])
dataset['最近地铁线路'] = dataset['最近地铁线路'].apply(lambda x:",".join(x))
countvectorizer = CountVectorizer()
# TruncatedSVD consumes the sparse count matrix directly, so the matrix is neither
# densified nor labelled with the vocabulary (the labels were discarded anyway, and
# CountVectorizer.get_feature_names() no longer exists in scikit-learn >= 1.2).
line_counts = countvectorizer.fit_transform(dataset['最近地铁线路']).astype(np.float64)
tmp = pd.DataFrame(tsvd.fit_transform(line_counts),columns = [str(i)+'_线路' for i in range(n_compo)])
dataset = pd.concat([dataset,tmp],axis = 1)
del tmp,dataset['最近地铁线路']

In [14]:
# Stations: bag-of-words over the second field of every access entry, compressed with SVD.
dataset['最近地铁站'] = dataset['アクセス'].apply(lambda x:[i.split('\t')[1] if '\t' in i else 'nan' for i in x.split('\t\t')])
dataset['最近地铁站'] = dataset['最近地铁站'].apply(lambda x:",".join(x))
countvectorizer = CountVectorizer()
# TruncatedSVD consumes the sparse count matrix directly, so the matrix is neither
# densified nor labelled with the vocabulary (the labels were discarded anyway, and
# CountVectorizer.get_feature_names() no longer exists in scikit-learn >= 1.2).
station_counts = countvectorizer.fit_transform(dataset['最近地铁站']).astype(np.float64)
tmp = pd.DataFrame(tsvd.fit_transform(station_counts),columns = [str(i)+'_地铁站' for i in range(n_compo)])
dataset = pd.concat([dataset,tmp],axis = 1)
del tmp,dataset['最近地铁站']

In [15]:
# Weight each station by the walking time to it
def railwayStaion(x):
    """Return one station label per access entry of ``x``.

    ``x`` is split on double tabs into entries. For each entry the text up to the
    last '駅' is used when the entry contains '駅'; otherwise the entry's second
    tab-separated field is used, or 'nan' when the entry has no second field.
    """
    result = []
    for i in x.split('\t\t'):
        tmp = re.findall('.*駅',i)
        if len(tmp)!=0:
            result.append(tmp[0])
        else:
            # Fall back to the *current* entry. The previous code indexed the whole
            # access string, which returned the first entry's station for every
            # unmatched entry and raised IndexError when the string had no tab.
            fields = i.split('\t')
            result.append(fields[1] if len(fields) > 1 else 'nan')
    return result
# Station label of every access entry.
dataset['最近地铁站'] = dataset['アクセス'].apply(railwayStaion)
# Walking minutes: every '徒歩<n>' occurrence in the access text, converted to int.
dataset['最短时间'] = dataset['アクセス'].apply(
    lambda x:[int(re.findall(r'\d+',hit)[0]) for hit in re.findall(r'徒歩\d+',x)]
)
# Per-listing summary statistics of those walking times.
for stat_name, stat_func in (('min',np.min),('mean',np.mean),('max',np.max),('std',np.std)):
    dataset[stat_name+'-时间'] = dataset['最短时间'].apply(lambda walks, stat_func=stat_func:stat_func(walks))
def countStation(row):
    """Repeat each station label once per walking minute.

    Reads ``row['最近地铁站']`` (labels) and ``row['最短时间']`` (minutes), pairs them
    positionally up to the shorter of the two, strips tabs from the label and returns
    a list of strings in which "<label>," is repeated ``minutes`` times.
    """
    stations = row['最近地铁站']
    minutes = row['最短时间']
    return [(name.replace('\t','')+",")*mins for name, mins in zip(stations, minutes)]
# Time-weighted station tokens -> bag-of-words -> SVD components.
dataset['最近地铁站加权'] = dataset.apply(countStation,axis = 1)
dataset['最近地铁站加权'] = dataset['最近地铁站加权'].apply(lambda x:"".join(x))
countvectorizer = CountVectorizer()
# TruncatedSVD consumes the sparse count matrix directly, so the matrix is neither
# densified nor labelled with the vocabulary (the labels were discarded anyway, and
# CountVectorizer.get_feature_names() no longer exists in scikit-learn >= 1.2).
weighted_counts = countvectorizer.fit_transform(dataset['最近地铁站加权']).astype(np.float64)
tmp = pd.DataFrame(tsvd.fit_transform(weighted_counts),columns = [str(i)+'_最近地铁站加权' for i in range(n_compo)])
dataset = pd.concat([dataset,tmp],axis = 1)
# The helper columns hold Python lists/strings and are not model features.
del tmp,dataset['最近地铁站加权'],dataset['最近地铁站'],dataset['最短时间']
gc.collect()

42

In [16]:
# Floor plan
def houseType(x):
    """Decode a floor-plan code such as '2LDK' or '1LDK+S(納戸)' into five counts.

    Returns ``[R, K, D, L, S]``: the leading digit of the code is added to the R/K/D/L
    slots whose letter appears before the first 'S'; the S slot is 1 when the code
    contains 'S' anywhere and 0 otherwise.
    Raises ValueError/IndexError when the code does not start with a digit.
    """
    # [room, kitchen, dining, living, storage]
    room = [0,0,0,0,0]
    base = x.split('S')[0]
    num = int(base[0])
    if 'R' in base:
        room[0]+=num
    if 'K' in base:
        room[1]+=num
    if 'D' in base:
        room[2]+=num
    if 'L' in base:
        room[3]+=num
    # Test the full code: `base` is the text before the first 'S' and therefore can
    # never contain 'S' itself, which left this flag permanently 0.
    if 'S' in x:
        room[4]+=1
    return room
# Expand the decoded floor plan into one column per slot, in the order houseType returns them.
layout_counts = dataset['間取り'].apply(houseType)
for position, layout_column in enumerate(['卧室','厨房','桌椅','客厅','杂物']):
    dataset[layout_column] = layout_counts.apply(lambda counts, position=position:counts[position])

In [17]:
# Building age. Newly built listings ('新築') are currently encoded as 0 years 0 months.
# Years are the text before '年'; months are the text between '年' and 'ヶ'.
dataset['建筑时间-年'] = dataset['築年数'].apply(lambda x:int(x.split('年')[0]) if x !='新築' else 0)
dataset['建筑时间-月'] = dataset['築年数'].apply(lambda x:int(x.split('年')[1].split('ヶ')[0]) if x !='新築' else 0)
# Total age in months.
dataset['建筑时间-年月'] = dataset['建筑时间-年']*12+dataset['建筑时间-月']

In [18]:
# The 方角 column is used as-is and registered as categorical.
cate_cols += ['方角']

In [19]:
# Numeric area: drop the last two characters of 面積 (e.g. the 'm2' in '43.01m2') and parse as float.
dataset['面积'] = dataset['面積'].apply(lambda x:float(x[:-2]))

In [20]:
# Floors
def allStage(x):
    """Return the integer that precedes the first '階建' in ``x``, or NaN when there is none."""
    match = re.search(r'\d+階建',x)
    if match is None:
        return np.nan
    return int(re.search(r'\d+',match.group(0)).group(0))
def thisStage(x):
    """Return the first integer in ``x`` (sign included), or NaN when there is none.

    The caller rewrites '地下' as '-' before calling, so the optional leading minus
    keeps basement floors negative; the previous pattern matched digits only and
    silently dropped that sign.
    """
    tmp = re.findall(r'-?\d+',x)
    if len(tmp) == 0:
        return np.nan
    return int(tmp[0])
    
# '地下' is rewritten as '-' before parsing; missing values become the string 'nan'
# through astype(str) and therefore parse to NaN in both helpers.
dataset['地上总层'] = dataset['所在階'].astype(str).apply(lambda x:x.replace('地下','-')).apply(allStage)
dataset['所在层'] = dataset['所在階'].astype(str).apply(lambda x:x.replace('地下','-')).apply(thisStage)
# The raw floor string is additionally kept as a categorical column.
cate_cols += ['所在階']

In [21]:
# Bathroom / toilet equipment (バス・トイレ): one count column per token.
dataset['浴室-厕所'] = dataset['バス・トイレ'].astype(str).apply(lambda x:x.replace('／','').replace('\t',','))
countvectorizer = CountVectorizer()
token_counts = countvectorizer.fit_transform(dataset['浴室-厕所'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(countvectorizer,'get_feature_names_out'):
    vocabulary = list(countvectorizer.get_feature_names_out())
else:
    vocabulary = countvectorizer.get_feature_names()
tmp = pd.DataFrame(token_counts.toarray(),columns = vocabulary)
dataset = pd.concat([dataset,tmp],axis = 1)
print(tmp.shape)
del tmp,dataset['浴室-厕所']

(62730, 15)


In [22]:
# Kitchen equipment (キッチン)
def stove(x):
    """Return the integer inside the first 'ロ<n>口' fragment of ``x``, or 0 when absent."""
    match = re.search(r'ロ\d+口',x)
    if match is None:
        return 0
    return int(re.search(r'\d+',match.group(0)).group(0))
# Burner count: parsed from the kitchen column, the same one this cell tokenises below.
# The previous code parsed 室内設備 (room equipment) instead.
# NOTE(review): assumes the 'コンロ<n>口' fragment lives in キッチン — confirm against the data.
dataset['锅灶数'] = dataset['キッチン'].astype(str).apply(stove)
dataset['厨房设备'] = dataset['キッチン'].astype(str).apply(lambda x:x.replace('／','').replace('\t',','))
countvectorizer = CountVectorizer()
token_counts = countvectorizer.fit_transform(dataset['厨房设备'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(countvectorizer,'get_feature_names_out'):
    vocabulary = list(countvectorizer.get_feature_names_out())
else:
    vocabulary = countvectorizer.get_feature_names()
tmp = pd.DataFrame(token_counts.toarray(),columns = vocabulary)
# tmp = pd.DataFrame(tsvd2.fit_transform(tmp),columns = [str(i)+'_厨房设备' for i in range(n_compo2)])
dataset = pd.concat([dataset,tmp],axis = 1)
print(tmp.shape)
del tmp,dataset['厨房设备']

(62730, 16)


In [23]:
# Broadcast / internet services (放送・通信): one count column per token.
dataset['互联网通信'] = dataset['放送・通信'].astype(str).apply(lambda x:x.replace('／','').replace('\t',','))
countvectorizer = CountVectorizer()
token_counts = countvectorizer.fit_transform(dataset['互联网通信'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(countvectorizer,'get_feature_names_out'):
    vocabulary = list(countvectorizer.get_feature_names_out())
else:
    vocabulary = countvectorizer.get_feature_names()
tmp = pd.DataFrame(token_counts.toarray(),columns = vocabulary)
# tmp = pd.DataFrame(tsvd2.fit_transform(tmp),columns = [str(i)+'_互联网通信' for i in range(n_compo2)])
dataset = pd.concat([dataset,tmp],axis = 1)
print(tmp.shape)
del tmp,dataset['互联网通信']

(62730, 9)


In [24]:
# Room equipment (室内設備): one count column per token.
dataset['房间设施'] = dataset['室内設備'].astype(str).apply(lambda x:x.replace('／','').replace('\t',','))
countvectorizer = CountVectorizer()
token_counts = countvectorizer.fit_transform(dataset['房间设施'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(countvectorizer,'get_feature_names_out'):
    vocabulary = list(countvectorizer.get_feature_names_out())
else:
    vocabulary = countvectorizer.get_feature_names()
tmp = pd.DataFrame(token_counts.toarray(),columns = vocabulary)
# tmp = pd.DataFrame(tsvd2.fit_transform(tmp),columns = [str(i)+'_房间设施' for i in range(n_compo2)])
dataset = pd.concat([dataset,tmp],axis = 1)
print(tmp.shape)
del tmp,dataset['房间设施']

(62730, 43)


In [25]:
# Surroundings (周辺環境): keep the text before the first space of every tab-separated
# entry, then build one count column per token.
dataset['周边环境'] = dataset['周辺環境'].astype(str).apply(lambda x:x.split('\t'))
dataset['周边环境'] = dataset['周边环境'].apply(lambda x:[i.split(' ')[0] for i in x])
dataset['周边环境'] = dataset['周边环境'].apply(lambda x:",".join(x))
countvectorizer = CountVectorizer()
token_counts = countvectorizer.fit_transform(dataset['周边环境'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(countvectorizer,'get_feature_names_out'):
    vocabulary = list(countvectorizer.get_feature_names_out())
else:
    vocabulary = countvectorizer.get_feature_names()
tmp = pd.DataFrame(token_counts.toarray(),columns = vocabulary)
# tmp = pd.DataFrame(tsvd2.fit_transform(tmp),columns = [str(i)+'_周边环境' for i in range(n_compo2)])
dataset = pd.concat([dataset,tmp],axis = 1)
print(tmp.shape)
del tmp,dataset['周边环境']

(62730, 21)


In [26]:
# Building structure (建物構造) is used as-is and registered as categorical.
cate_cols += ['建物構造']

In [27]:
# Contract period
def contract(x):
    """Decode a contract-period string into seven values.

    Returns ``[has_deadline, deadline_year, deadline_month, duration_years,
    duration_months, has_property_note, is_fixed_term]``. The string 'nan' yields
    seven NaNs. Duration fields come from '<n>年間' / '<n>ヶ月間'; the deadline fields
    are filled when the text contains '月ま' and use the first '<n>年' and '<n>月'.
    The last two flags mark the presence of '※この物件は' and '定期借家'.
    Raises IndexError when a keyword is present without the number it expects.
    """
    if x == 'nan':
        return [np.nan]*7

    has_deadline = 0
    deadline_year = 0
    deadline_month = 0
    duration_years = 0
    duration_months = 0

    if '年間' in x:
        duration_years += int(re.findall(r"\d+年間",x)[0].split('年間')[0])
    if 'ヶ月間' in x:
        duration_months += int(re.findall(r"\d+ヶ月間",x)[0].split('ヶ月間')[0])
    if '月ま' in x:
        year_text = re.findall(r"\d+年",x)[0].split('年')[0]
        has_deadline = 1
        deadline_year += int(year_text)
        deadline_month += int(re.findall(r"\d+月",x)[0].split('月')[0])

    has_property_note = 1 if '※この物件は' in x else 0
    is_fixed_term = 1 if '定期借家' in x else 0
    return [has_deadline, deadline_year, deadline_month,
            duration_years, duration_months, has_property_note, is_fixed_term]
# Expand the decoded contract period into one column per slot, in the order contract() returns them.
contract_parts = dataset['契約期間'].astype(str).apply(contract)
contract_columns = ['按截止日期或者按时间','年','月','年間','月間','是否物件','是否定期借家']
for position, contract_column in enumerate(contract_columns):
    dataset[contract_column] = contract_parts.apply(lambda parts, position=position:parts[position])
# Total contract duration in months: years * 12 + months, the same conversion used for
# the building age. The previous expression (years + 12 * months) had the factor on the wrong term.
dataset['年月間'] = 12*dataset['年間']+dataset['月間']

In [28]:
# Keep only the first access entry (text before the first double tab) and treat it as categorical.
dataset['アクセス'] = dataset['アクセス'].astype(str).apply(lambda x:x.split('\t\t')[0])
cate_cols += ['アクセス']
# cate_cols += ['所在地']

In [29]:
# train   5776--1203500(删除)   7492(删除)
# test    30313
# 做数据清洗减缓平均价格
# dataset = dataset[(dataset['id']!=5776)&(dataset['id']!=7492)].reset_index(drop = True)

In [30]:
# Parking features
def carPark(x):
    """Return three 0/1 flags for [駐輪場, 駐車場, バイク置き場].

    A flag is 1 when the corresponding keyword does NOT occur in ``x`` and 0 when it does.
    """
    keywords = ('駐輪場', '駐車場', 'バイク置き場')
    return [0 if keyword in x else 1 for keyword in keywords]
# One integer column per parking flag, in the order carPark returns them.
parking_flags = dataset['駐車場'].astype(str).apply(carPark)
for position, flag_column in enumerate(['駐輪場','停車場','バイク置き場']):
    dataset[flag_column] = parking_flags.apply(lambda flags, position=position:flags[position]).astype(int)

In [36]:
# Parking fee
def park_fee(x):
    """Return the first '<a>,<b>円' amount in ``x`` as ``a*1000 + b`` (float), or 0 when absent."""
    match = re.search(r'\d+,\d+円',x)
    if match is None:
        return 0
    thousands, remainder = match.group(0).split(',')
    return float(thousands)*1000+float(remainder.split('円')[0])
# Parking fee parsed from the 駐車場 text; listings without a comma-formatted amount get 0.
dataset['park_fee'] = dataset['駐車場'].astype(str).apply(park_fee)

In [None]:
# Rent-per-m² statistics for every (address, floor) pair, computed from the training labels.

# train_data['所在地'] = train_data['所在地'].apply(split)
# test_data['所在地'] = test_data['所在地'].apply(split)
group_keys = ['所在地','所在階']
stat_columns = ['diff平均价格层层次-mean','diff平均价格层层次-min','diff平均价格层层次-max',
                'diff平均价格层层次-std','diff平均价格层层次-count']
train_data['平均价格'] = train_data['賃料'] / train_data['面積'].apply(lambda x:float(x[:-2]))
# Dict-based renaming inside SeriesGroupBy.agg was removed in pandas 1.0, so aggregate by
# function name and rename the columns afterwards (works on old and new pandas alike).
tmp = train_data.groupby(group_keys)['平均价格'].agg(['mean','min','max','std','count'])
tmp.columns = stat_columns
train_data = train_data.merge(tmp,on = group_keys,how = 'left')
new_train = train_data.drop_duplicates(group_keys)
new_test = test_data.drop_duplicates(group_keys)
# Statistics of the groups that occur in both train and test (one row per group).
mean_features = new_test[group_keys].merge(new_train[group_keys+stat_columns],
                                           on = group_keys,how = 'inner')
# Training rows of those shared groups together with the group statistics.
tmp = train_data[group_keys+['平均价格']].merge(mean_features,on = group_keys,how = 'inner')
# Absolute gap between a row's own price per m² and its group mean.
tmp['误差'] = abs(tmp['平均价格'] - tmp['diff平均价格层层次-mean'])
tmp = tmp[tmp['误差']<=100]
# Keep the first remaining row of each group, then discard groups whose kept row has a
# gap of exactly zero. Reassigning avoids an in-place edit of a filtered slice.
tmp = tmp.drop_duplicates(group_keys)
tmp = tmp[tmp['误差']!=0]

## Newly added section (disabled)
# tmp2 = pd.DataFrame((tmp['所在地']+"_"+tmp['所在階']).value_counts(),columns = ['count'])
# tmp2['所在地所在階'] = tmp2.index
# tmp2 = tmp2.reset_index(drop = True)
# tmp2 = tmp2[tmp2['count']>1]
# tmp2.drop_duplicates(['所在地所在階'],inplace = True)
# tmp2['所在地'] = tmp2['所在地所在階'].apply(lambda x:x.split('_')[0])
# tmp2['所在階'] = tmp2['所在地所在階'].apply(lambda x:x.split('_')[1])
# del tmp2['所在地所在階']
# tmp = tmp.merge(tmp2,on = ['所在階','所在地'],how = 'right')
# tmp.drop_duplicates(['所在地','所在階'],inplace = True)
#####
print(mean_features.shape)

# Attach the group statistics to every train/test row of the retained groups.
dataset = dataset.merge(tmp.drop(['平均价格','误差'],axis = 1),on = group_keys,how = 'left')
# dataset = dataset.merge(mean_features,on = ['所在地','所在階'],how = 'left')
# Rent estimates: group price per m² multiplied by the listing's own area.
dataset['平均估计层层次-mean-label'] = dataset['diff平均价格层层次-mean']*dataset['面积']
dataset['平均估计层层次-min-label'] = dataset['diff平均价格层层次-min']*dataset['面积']
dataset['平均估计层层次-max-label'] = dataset['diff平均价格层层次-max']*dataset['面积']

# Remove every helper column added to train_data above. The count column was previously
# left behind, which produced suffixed duplicate columns if this cell was executed again.
train_data = train_data.drop(['平均价格']+stat_columns,axis = 1)
gc.collect()

In [None]:
# tmp = dataset[['所在地','所在階','面积','賃料']]

In [None]:
# train_data = pd.read_csv(input_dir+"train.csv")
# test_data = pd.read_csv(input_dir+"test.csv")
# train_data['面积'] = train_data['面積'].apply(lambda x:float(x[:-2]))
# test_data['面积'] = test_data['面積'].apply(lambda x:float(x[:-2]))

In [None]:
# tmp = train_data[['所在地','所在階','面积','賃料','方角','築年数']]

In [None]:
# tmp.sort_values(['所在地','平均价格'])

In [None]:
# tmp.sort_values(['面积','所在地','賃料'])

In [None]:
# a = tmp.groupby(['面积','所在地','所在階','方角'])['賃料'].nunique()

In [None]:
# a[a>1]

In [None]:
# ((tmp['面积'].astype(str)+tmp['所在地']+tmp['所在階']).value_counts()>1).sum()

In [None]:
# ((tmp['面积'].astype(str)+tmp['所在地']+tmp['所在階']+tmp['賃料'].astype(str)).value_counts()>1).sum()

In [None]:
# Integer-encode every categorical column; the shared LabelEncoder is refit for each column.
for cate_col in cate_cols:
    dataset[cate_col] = lbl.fit_transform(dataset[cate_col].astype(str))
# アクセス stays encoded in the frame but is taken off the categorical list.
# Removing it is not repeatable: running this cell a second time raises ValueError.
cate_cols.remove('アクセス')

In [None]:
# Group statistics features
def _merge_group_stats(frame, keys, value_col, named_stats):
    """Left-merge per-group statistics of ``value_col`` onto ``frame``.

    ``named_stats`` is an ordered list of ``(output_column, aggregation_name)`` pairs.
    Aggregating by function name and renaming afterwards replaces the dict-renaming
    form of ``SeriesGroupBy.agg``, which was removed in pandas 1.0.
    """
    stats = frame.groupby(keys)[value_col].agg([func for _, func in named_stats])
    stats.columns = [name for name, _ in named_stats]
    return frame.merge(stats,on = keys,how = 'left')

# (group keys, aggregated column, [(output column, aggregation), ...]) in merge order.
group_stat_specs = [
    (['アクセス'],'面积',[
        ('アクセス_面积_min','min'),
        ('アクセス_面积_mean','mean'),
        ('アクセス_面积_max','max'),
        ('アクセス_面积_std','std'),
        ('アクセス_count','count'),
    ]),
    (['区'],'面积',[
        ('区_面积_min','min'),
        ('区_面积_mean','mean'),
        ('区_面积_max','max'),
        ('区_面积_std','std'),
        ('区_count','count'),
    ]),
    (['城镇'],'面积',[
        ('城镇_面积_min','min'),
        ('城镇_面积_mean','mean'),
        ('城镇_面积_max','max'),
        ('城镇_面积_std','std'),
        ('城镇_count','count'),
    ]),
    (['所在地','所在階'],'面积',[
        ('所在地_所在階_面积_mean','mean'),
        ('所在地_所在階_面积_min','min'),
        ('所在地_所在階_面积_max','max'),
        ('所在地_所在階_面积_std','std'),
        ('所在地_所在階_面积_count','count'),
    ]),
    (['所在层'],'面积',[
        ('所在层_面积_mean','mean'),
        ('所在层_面积_min','min'),
        ('所在层_面积_max','max'),
        ('所在层_面积_std','std'),
        ('所在层_面积_count','count'),
    ]),
    (['所在层'],'平均估计层层次-mean-label',[
        ('所在层_平均估计层层次-label_mean','mean'),
        ('所在层_平均估计层层次-label_min','min'),
        ('所在层_平均估计层层次-label_max','max'),
        ('所在层_平均估计层层次-label_std','std'),
    ]),
]
for stat_keys, stat_value_col, stat_named in group_stat_specs:
    dataset = _merge_group_stats(dataset,stat_keys,stat_value_col,stat_named)

In [None]:
# Columns excluded from the model matrices: the id, the raw text columns that were
# parsed into features above, and any column literally named 'nan'.
# NOTE(review): the 'nan' column presumably comes from the bag-of-words cells tokenising
# stringified missing values — confirm.
useless_columns += ['id','キッチン','バス・トイレ','周辺環境','契約期間','室内設備',
                  '放送・通信','築年数','間取り','面積','駐車場','所在地','nan']

In [None]:
# Rows with a rent value form the training set; rows without one form the test set.
trainset = dataset[~dataset['賃料'].isna()].reset_index(drop = True).drop(useless_columns,axis = 1)
testset = dataset[dataset['賃料'].isna()].reset_index(drop = True).drop(useless_columns,axis = 1)

In [None]:
# 根据Label做统计特征
# tmp = trainset.groupby(['区'])['賃料'].agg({
#     '区_money_min':'min',
#     '区_money_mean':'mean',
#     '区_money_max':'max',
#     '区_money_std':'std',
# })
# trainset = trainset.merge(tmp,on = ['区'],how = 'left')
# testset = testset.merge(tmp,on = ['区'],how = 'left')

In [None]:
# Separate the target (賃料) from the feature matrix; the test target column is dropped as well.
features = trainset.drop('賃料',axis = 1)
labels = trainset['賃料']
test_features = testset.drop('賃料',axis = 1)

In [None]:
# Number of non-missing values of the price-based rent estimate in train and in test.
features['平均估计层层次-mean-label'].count(),test_features['平均估计层层次-mean-label'].count()

In [None]:
# for column in features.columns:
#     sns.distplot(features[column].replace(np.nan,features[column].mean()),color = 'g')
#     sns.distplot(test_features[column].replace(np.nan,test_features[column].mean()),color = 'r')
#     plt.show()

In [None]:
# sns.distplot(features['平均估计层层次-mean-label'].replace(np.nan,-1),color = 'g')
# sns.distplot(test_features['平均估计层层次-mean-label'].replace(np.nan,-1),color = 'r')

In [None]:
# Shapes of the training features, training labels and test features.
features.shape,labels.shape,test_features.shape

In [None]:
# LightGBM training parameters: gradient-boosted trees on a regression objective,
# evaluated with RMSE; max_depth of -1 leaves tree depth unlimited.
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'num_leaves': 15,
    'max_depth':-1,
    'metric':'rmse'
}

In [None]:
# Train on the full training set (no hold-out split).
test_pred = np.zeros([test_features.shape[0],])
dtrain = lgb.Dataset(features,labels,
                     categorical_feature=cate_cols
                    )
# The only validation set is the training data itself, so the metric logged every 100
# rounds is training RMSE. No early stopping is passed, so all 100000 rounds are requested.
# NOTE(review): verbose_eval is not accepted by newer LightGBM releases — confirm the pinned version.
model_lgb = lgb.train(
    params,
    dtrain,
    num_boost_round = 100000,
    valid_sets = [dtrain],
    verbose_eval = 100,
    categorical_feature=cate_cols,
)
test_pred += model_lgb.predict(test_features)
train_pred = model_lgb.predict(features)
# Catboost
# ctb_model = ctb.CatBoostRegressor(
#     iterations=35000,learning_rate=0.01,max_depth=7,l2_leaf_reg=1,verbose=50,
#     early_stopping_rounds=200,eval_metric='RMSE',task_type='GPU'
# )
# ctb_model.fit(features,labels,cat_features=cate_cols)
# test_pred += ctb_model.predict(test_features)

# Overlay the label distribution (green) and the in-sample prediction distribution (red).
sns.distplot(labels,color = 'g')
sns.distplot(train_pred,color ='r')
plt.show()

In [None]:
# 五折训练
# test_pred = np.zeros([test_features.shape[0],])
# cv_score = []
# kf = KFold(n_splits=5,shuffle=True,random_state=42)
# for i,(train_idx,val_idx) in enumerate(kf.split(features,labels)):
#     print("======================  fold "+str(i+1)+" start training=====================")
    
#     ################ LGB
#     dtrain = lgb.Dataset(features.iloc[train_idx],labels[train_idx],categorical_feature=cate_cols)
#     dval = lgb.Dataset(features.iloc[val_idx],labels[val_idx],categorical_feature=cate_cols)
#     model_lgb = lgb.train(
#         params,
#         dtrain,
#         num_boost_round = 100000,
#         valid_sets = [dtrain,dval],
#         verbose_eval = 100,
#         early_stopping_rounds = 1000,
#         categorical_feature=cate_cols,
#     )
#     cv_score.append(np.sqrt(mean_squared_error(labels[val_idx],model_lgb.predict(features.iloc[val_idx]))))
#     test_pred += model_lgb.predict(test_features)
# print("Mean RMSE Score: ",np.mean(cv_score))
# test_pred /= 5

In [None]:
# Re-read the raw CSV files for post-processing. Because the files are loaded afresh,
# the id-based row removal and manual patches applied after the first load do not carry over.
train_data = pd.read_csv(input_dir+"train.csv")
test_data = pd.read_csv(input_dir+"test.csv")
# Numeric area, parsed the same way as for the feature matrix.
train_data['面积'] = train_data['面積'].apply(lambda x:float(x[:-2]))
test_data['面积'] = test_data['面積'].apply(lambda x:float(x[:-2]))

train_data['城镇'] = train_data['所在地'].apply(town)
test_data['城镇'] = test_data['所在地'].apply(town)

In [None]:
def split(x):
    """Return ``x`` truncated at the first '丁目', '-' or '－', whichever comes first."""
    for separator in ('丁目', '-', '－'):
        x = x.partition(separator)[0]
    return x
# Coarsen the matching keys: addresses are truncated by split(), and floor strings keep
# only the text before the first '建'.
train_data['所在地'] = train_data['所在地'].apply(split)
test_data['所在地'] = test_data['所在地'].apply(split)
train_data['所在階'] = train_data['所在階'].astype(str).apply(lambda x:x.split('建')[0])
test_data['所在階'] = test_data['所在階'].astype(str).apply(lambda x:x.split('建')[0])

In [None]:
# `sub` now refers to the same object as test_data (no copy), replacing the sample-submission frame.
sub = test_data

In [None]:
# Attach the model predictions as the rent column.
sub['賃料'] = test_pred

In [None]:
# Post-processing: when training rows share a test row's (address, area, floor), replace
# the model prediction with the mean training rent of those rows.
# mean().to_frame(...) replaces the dict-renaming form of SeriesGroupBy.agg, which was
# removed in pandas 1.0.
tmp = train_data.groupby(['所在地','面积','所在階'])['賃料'].mean().to_frame('房租-mean-2')
sub = sub.merge(tmp,on = ['所在地','面积','所在階'],how = 'left')
# Vectorised form of "use the matched mean when present, otherwise keep the prediction".
sub['賃料'] = sub['房租-mean-2'].fillna(sub['賃料'])
# Manual overrides for individual test ids.
# NOTE(review): the source does not say where these values come from — confirm before reuse.
# sub.loc[sub['id'] ==61784,'賃料'] = 120350
sub.loc[(sub['id'] == 55780)|(sub['id'] == 61634),'賃料'] = 1660000
sub.loc[(sub['id'] == 34294),'賃料'] = 1800000

In [None]:
# Number of test rows that found a matching training group.
sub['房租-mean-2'].count()

In [None]:
# Write the submission (id, rent) without an index column or a header row.
output_path = "../output/baseline.csv"
# to_csv does not create missing directories, so make sure the target directory exists.
os.makedirs(os.path.dirname(output_path),exist_ok = True)
sub[['id','賃料']].to_csv(output_path,index = 0,header = 0)

In [None]:
# Compare the range of the submitted rents with the range of the training labels.
sub['賃料'].max(),labels.max(),sub['賃料'].min(),labels.min()

In [None]:
# Preview the first ten submission rows.
sub[['id','賃料']][:10]

In [None]:
# Horizontal bar chart of the LightGBM feature importances, largest first.
importance = model_lgb.feature_importance()
columns = features.columns
feature_importance_df = pd.DataFrame({'importance':importance,'feature':columns})

# Names of the (at most) 1000 features with the highest mean importance.
mean_importance = feature_importance_df[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False)[:1000].index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
ranked_features = best_features.sort_values(by="importance", ascending=False)

# Figure height grows with the number of features so that the labels stay readable.
plt.figure(figsize=(15, features.shape[1]*0.23))
sns.barplot(x="importance", y="feature", data=ranked_features)
plt.title('LightGBM Features (One Folds)')
plt.show()