In [12]:
import glob
import pandas as pd
import numpy as np
import xgboost as xgb
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

# Load every training CSV in the directory and stack them into one frame.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to give the concatenated frame the same row order on every
# run.
files = sorted(glob.glob("/tmp/working/dataset/nishika/old_apartment_2020/train/*"))
# The first CSV column is used as the row index of each frame.
data_list = [pd.read_csv(path, index_col=0) for path in files]
df = pd.concat(data_list)

# Era name -> era year that corresponds to the reference year used for the
# building age (Heisei 33 = Reiwa 3 = Showa 96, i.e. the same calendar year).
_ERA_AGE_OFFSETS = (
    ("平成", 33),
    ("令和", 3),
    ("昭和", 96),
)

# Quarter suffix of a "取引時点" label -> decimal fraction appended to the year.
_QUARTER_FRACTIONS = {
    "年第１四半期": ".25",
    "年第２四半期": ".50",
    "年第３四半期": ".75",
    "年第４四半期": ".99",
}


def _building_age(label):
    """Convert one "建築年" label (e.g. "平成5年") to a building age in years."""
    if not isinstance(label, str):
        # Not a text label; leave it for the float cast in the caller.
        return label
    if label == "戦前":
        # Pre-war buildings carry no year; they get a fixed age.
        return 76
    for era, offset in _ERA_AGE_OFFSETS:
        if era in label:
            digits = label.split(era)[1].split("年")[0]
            # "元" is the conventional way to write the first year of an era.
            era_year = 1.0 if digits == "元" else float(digits)
            return offset - era_year
    # Unrecognised label: report it as missing instead of reusing the value
    # computed for some other label.
    return float("nan")


def _quarter_to_decimal(label):
    """Rewrite one "取引時点" label (e.g. "2017年第４四半期") as "2017.99"."""
    if not isinstance(label, str):
        return label
    for suffix, fraction in _QUARTER_FRACTIONS.items():
        if suffix in label:
            return label.replace(suffix, fraction)
    # No quarter suffix: return unchanged so the float cast in the caller
    # either parses it or raises a clear ValueError.
    return label


def data_pre(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw transaction frame into an all-numeric feature frame.

    Steps, in order:
      1. drop columns that contain no non-null value,
      2. drop the "市区町村名" and "種類" columns,
      3. convert the station-distance and floor-area columns to float,
         replacing their textual range labels with fixed numbers,
      4. convert "建築年" (era-based year label) to a building age in years,
      5. convert "取引時点" (year + quarter label) to a decimal year,
      6. ordinal-encode the categorical columns.

    Args:
        df: Raw frame. It is not modified; a new frame is returned.

    Returns:
        The preprocessed frame, with the same rows in the same order.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a value cannot be converted to float.

    NOTE(review): a new OrdinalEncoder is fitted on every call, so the
    integer codes produced by two separate calls (e.g. one for train, one for
    test) are not guaranteed to agree. Preprocess such frames together.
    """
    # Columns without a single non-null value carry no information.
    df = df.dropna(axis=1, how="all")
    df = df.drop(columns=["市区町村名", "種類"])

    # Station distance: textual ranges are replaced by a representative
    # number of minutes.
    dis = {
        "30分?60分": 45,
        "1H?1H30": 75,
        "2H?": 120,
        "1H30?2H": 105,
    }
    df["最寄駅：距離（分）"] = df["最寄駅：距離（分）"].replace(dis).astype(float)

    # Floor area: the open-ended top bucket is capped at 2000.
    df["面積（㎡）"] = df["面積（㎡）"].replace("2000㎡以上", 2000).astype(float)

    # Construction year -> building age. The conversion is computed once per
    # distinct label and then mapped onto the column; missing values stay NaN.
    age_by_label = {
        label: _building_age(label) for label in df["建築年"].dropna().unique()
    }
    # Cast explicitly so the column is numeric regardless of how pandas
    # infers the dtype of the mapped values.
    df["建築年"] = df["建築年"].map(age_by_label).astype(float)

    # Transaction quarter -> decimal year, again computed per distinct label.
    period_by_label = {
        label: _quarter_to_decimal(label)
        for label in df["取引時点"].dropna().unique()
    }
    df["取引時点"] = df["取引時点"].map(period_by_label).astype(float)

    cols = ["都道府県名", "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]
    ce_df = ce.OrdinalEncoder(cols=cols, handle_unknown='impute')
    df = ce_df.fit_transform(df)

    return df
    
col = "取引価格（総額）_log"

# Preprocess train and test in a single data_pre() call. data_pre() fits a new
# ordinal encoder and decides which all-null columns to drop on every call, so
# preprocessing the two frames separately would give the same category
# different integer codes (and possibly different columns) in train and test.
df_test = pd.read_csv("/tmp/working/dataset/nishika/old_apartment_2020/test.csv", index_col=0)
test_index = df_test.index
n_train = len(df)
df_all = data_pre(pd.concat([df, df_test], sort=False))

# data_pre() keeps the row order, so the two parts are recovered by position.
df = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:].drop(col, axis=1)
# Restore the original test index (including its name) for the submission.
df_test.index = test_index

# Fixed seed so the validation score is comparable between runs.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

# CatBoost
from catboost import CatBoostRegressor, FeaturesData, Pool
import numpy as np

# NOTE(review): learning_rate=1 is kept from the original run; confirm it is
# intentional.
model = CatBoostRegressor(learning_rate=1, depth=6, loss_function='MAE')
rg = model.fit(train_x, train_y)

# Hold-out score; printed because a bare expression in the middle of a cell
# is not displayed.
vals = rg.predict(val_x)
print("validation MAE:", mae(val_y, vals))

# Predict the test set and write the submission (index + predicted target).
predict = rg.predict(df_test)
df_test = df_test.assign(**{col: predict})
df_test[[col]].to_csv("submit_test.csv")

0:	learn: 0.1684516	total: 70.1ms	remaining: 1m 9s
1:	learn: 0.1530440	total: 135ms	remaining: 1m 7s
2:	learn: 0.1416860	total: 198ms	remaining: 1m 5s
3:	learn: 0.1352639	total: 259ms	remaining: 1m 4s
4:	learn: 0.1306984	total: 330ms	remaining: 1m 5s
5:	learn: 0.1283641	total: 390ms	remaining: 1m 4s
6:	learn: 0.1261008	total: 450ms	remaining: 1m 3s
7:	learn: 0.1242346	total: 510ms	remaining: 1m 3s
8:	learn: 0.1227896	total: 568ms	remaining: 1m 2s
9:	learn: 0.1209261	total: 626ms	remaining: 1m 1s
10:	learn: 0.1200101	total: 682ms	remaining: 1m 1s
11:	learn: 0.1181909	total: 739ms	remaining: 1m
12:	learn: 0.1169676	total: 799ms	remaining: 1m
13:	learn: 0.1162008	total: 856ms	remaining: 1m
14:	learn: 0.1149362	total: 920ms	remaining: 1m
15:	learn: 0.1140398	total: 984ms	remaining: 1m
16:	learn: 0.1132901	total: 1.05s	remaining: 1m
17:	learn: 0.1124394	total: 1.11s	remaining: 1m
18:	learn: 0.1116876	total: 1.17s	remaining: 1m
19:	learn: 0.1112206	total: 1.23s	remaining: 1m
20:	learn: 0.110

163:	learn: 0.0887553	total: 9.94s	remaining: 50.7s
164:	learn: 0.0887167	total: 10s	remaining: 50.6s
165:	learn: 0.0886293	total: 10.1s	remaining: 50.5s
166:	learn: 0.0885903	total: 10.1s	remaining: 50.5s
167:	learn: 0.0885623	total: 10.2s	remaining: 50.5s
168:	learn: 0.0885179	total: 10.3s	remaining: 50.4s
169:	learn: 0.0884892	total: 10.3s	remaining: 50.4s
170:	learn: 0.0884073	total: 10.4s	remaining: 50.3s
171:	learn: 0.0883714	total: 10.4s	remaining: 50.2s
172:	learn: 0.0883178	total: 10.5s	remaining: 50.2s
173:	learn: 0.0883021	total: 10.5s	remaining: 50.1s
174:	learn: 0.0882717	total: 10.6s	remaining: 50s
175:	learn: 0.0882367	total: 10.7s	remaining: 49.9s
176:	learn: 0.0882084	total: 10.7s	remaining: 49.8s
177:	learn: 0.0881638	total: 10.8s	remaining: 49.8s
178:	learn: 0.0881346	total: 10.8s	remaining: 49.7s
179:	learn: 0.0881029	total: 10.9s	remaining: 49.7s
180:	learn: 0.0880748	total: 11s	remaining: 49.6s
181:	learn: 0.0880392	total: 11s	remaining: 49.5s
182:	learn: 0.088004

325:	learn: 0.0836713	total: 19.9s	remaining: 41.2s
326:	learn: 0.0836539	total: 20s	remaining: 41.1s
327:	learn: 0.0836445	total: 20.1s	remaining: 41.1s
328:	learn: 0.0836199	total: 20.1s	remaining: 41s
329:	learn: 0.0836076	total: 20.2s	remaining: 40.9s
330:	learn: 0.0835964	total: 20.2s	remaining: 40.9s
331:	learn: 0.0835799	total: 20.3s	remaining: 40.8s
332:	learn: 0.0835437	total: 20.3s	remaining: 40.7s
333:	learn: 0.0835263	total: 20.4s	remaining: 40.7s
334:	learn: 0.0835136	total: 20.5s	remaining: 40.6s
335:	learn: 0.0834993	total: 20.5s	remaining: 40.6s
336:	learn: 0.0834887	total: 20.6s	remaining: 40.5s
337:	learn: 0.0834635	total: 20.6s	remaining: 40.4s
338:	learn: 0.0834492	total: 20.7s	remaining: 40.3s
339:	learn: 0.0834330	total: 20.7s	remaining: 40.3s
340:	learn: 0.0834189	total: 20.8s	remaining: 40.2s
341:	learn: 0.0834042	total: 20.9s	remaining: 40.1s
342:	learn: 0.0833956	total: 20.9s	remaining: 40.1s
343:	learn: 0.0833812	total: 21s	remaining: 40s
344:	learn: 0.083368

485:	learn: 0.0809461	total: 29.4s	remaining: 31.1s
486:	learn: 0.0809310	total: 29.5s	remaining: 31.1s
487:	learn: 0.0809126	total: 29.5s	remaining: 31s
488:	learn: 0.0809014	total: 29.6s	remaining: 30.9s
489:	learn: 0.0808915	total: 29.7s	remaining: 30.9s
490:	learn: 0.0808801	total: 29.7s	remaining: 30.8s
491:	learn: 0.0808651	total: 29.8s	remaining: 30.8s
492:	learn: 0.0808511	total: 29.8s	remaining: 30.7s
493:	learn: 0.0808392	total: 29.9s	remaining: 30.6s
494:	learn: 0.0808299	total: 30s	remaining: 30.6s
495:	learn: 0.0808213	total: 30s	remaining: 30.5s
496:	learn: 0.0808136	total: 30.1s	remaining: 30.4s
497:	learn: 0.0808021	total: 30.1s	remaining: 30.4s
498:	learn: 0.0807908	total: 30.2s	remaining: 30.3s
499:	learn: 0.0807824	total: 30.3s	remaining: 30.3s
500:	learn: 0.0807751	total: 30.3s	remaining: 30.2s
501:	learn: 0.0807597	total: 30.4s	remaining: 30.1s
502:	learn: 0.0807416	total: 30.4s	remaining: 30.1s
503:	learn: 0.0807312	total: 30.5s	remaining: 30s
504:	learn: 0.080716

645:	learn: 0.0792380	total: 38.8s	remaining: 21.3s
646:	learn: 0.0792303	total: 38.9s	remaining: 21.2s
647:	learn: 0.0792247	total: 38.9s	remaining: 21.1s
648:	learn: 0.0792131	total: 39s	remaining: 21.1s
649:	learn: 0.0792082	total: 39.1s	remaining: 21s
650:	learn: 0.0791986	total: 39.1s	remaining: 21s
651:	learn: 0.0791883	total: 39.2s	remaining: 20.9s
652:	learn: 0.0791748	total: 39.2s	remaining: 20.9s
653:	learn: 0.0791709	total: 39.3s	remaining: 20.8s
654:	learn: 0.0791656	total: 39.4s	remaining: 20.7s
655:	learn: 0.0791560	total: 39.4s	remaining: 20.7s
656:	learn: 0.0791457	total: 39.5s	remaining: 20.6s
657:	learn: 0.0791339	total: 39.6s	remaining: 20.6s
658:	learn: 0.0791296	total: 39.6s	remaining: 20.5s
659:	learn: 0.0791269	total: 39.7s	remaining: 20.4s
660:	learn: 0.0791189	total: 39.7s	remaining: 20.4s
661:	learn: 0.0791108	total: 39.8s	remaining: 20.3s
662:	learn: 0.0790994	total: 39.9s	remaining: 20.3s
663:	learn: 0.0790940	total: 39.9s	remaining: 20.2s
664:	learn: 0.0790

805:	learn: 0.0779644	total: 48.3s	remaining: 11.6s
806:	learn: 0.0779523	total: 48.4s	remaining: 11.6s
807:	learn: 0.0779482	total: 48.4s	remaining: 11.5s
808:	learn: 0.0779405	total: 48.5s	remaining: 11.4s
809:	learn: 0.0779310	total: 48.5s	remaining: 11.4s
810:	learn: 0.0779273	total: 48.6s	remaining: 11.3s
811:	learn: 0.0779059	total: 48.7s	remaining: 11.3s
812:	learn: 0.0778989	total: 48.7s	remaining: 11.2s
813:	learn: 0.0778929	total: 48.8s	remaining: 11.1s
814:	learn: 0.0778783	total: 48.8s	remaining: 11.1s
815:	learn: 0.0778760	total: 48.9s	remaining: 11s
816:	learn: 0.0778675	total: 49s	remaining: 11s
817:	learn: 0.0778647	total: 49s	remaining: 10.9s
818:	learn: 0.0778615	total: 49.1s	remaining: 10.8s
819:	learn: 0.0778560	total: 49.1s	remaining: 10.8s
820:	learn: 0.0778526	total: 49.2s	remaining: 10.7s
821:	learn: 0.0778517	total: 49.3s	remaining: 10.7s
822:	learn: 0.0778473	total: 49.3s	remaining: 10.6s
823:	learn: 0.0778440	total: 49.4s	remaining: 10.5s
824:	learn: 0.077841

965:	learn: 0.0770627	total: 57.6s	remaining: 2.03s
966:	learn: 0.0770570	total: 57.7s	remaining: 1.97s
967:	learn: 0.0770506	total: 57.8s	remaining: 1.91s
968:	learn: 0.0770448	total: 57.8s	remaining: 1.85s
969:	learn: 0.0770430	total: 57.9s	remaining: 1.79s
970:	learn: 0.0770414	total: 57.9s	remaining: 1.73s
971:	learn: 0.0770349	total: 58s	remaining: 1.67s
972:	learn: 0.0770329	total: 58s	remaining: 1.61s
973:	learn: 0.0770253	total: 58.1s	remaining: 1.55s
974:	learn: 0.0770173	total: 58.2s	remaining: 1.49s
975:	learn: 0.0770132	total: 58.2s	remaining: 1.43s
976:	learn: 0.0770106	total: 58.3s	remaining: 1.37s
977:	learn: 0.0770061	total: 58.3s	remaining: 1.31s
978:	learn: 0.0769971	total: 58.4s	remaining: 1.25s
979:	learn: 0.0769930	total: 58.5s	remaining: 1.19s
980:	learn: 0.0769888	total: 58.5s	remaining: 1.13s
981:	learn: 0.0769810	total: 58.6s	remaining: 1.07s
982:	learn: 0.0769745	total: 58.6s	remaining: 1.01s
983:	learn: 0.0769696	total: 58.7s	remaining: 954ms
984:	learn: 0.07

  elif pd.api.types.is_categorical(cols):


In [None]:
import glob
import pandas as pd
import numpy as np
import xgboost as xgb
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

# Load every training CSV in the directory and stack them into one frame.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to give the concatenated frame the same row order on every
# run.
files = sorted(glob.glob("/tmp/working/dataset/nishika/old_apartment_2020/train/*"))
# The first CSV column is used as the row index of each frame.
data_list = [pd.read_csv(path, index_col=0) for path in files]
df = pd.concat(data_list)

# Era name -> era year that corresponds to the reference year used for the
# building age (Heisei 33 = Reiwa 3 = Showa 96, i.e. the same calendar year).
_ERA_AGE_OFFSETS = (
    ("平成", 33),
    ("令和", 3),
    ("昭和", 96),
)

# Quarter suffix of a "取引時点" label -> decimal fraction appended to the year.
_QUARTER_FRACTIONS = {
    "年第１四半期": ".25",
    "年第２四半期": ".50",
    "年第３四半期": ".75",
    "年第４四半期": ".99",
}


def _building_age(label):
    """Convert one "建築年" label (e.g. "平成5年") to a building age in years."""
    if not isinstance(label, str):
        # Not a text label; leave it for the float cast in the caller.
        return label
    if label == "戦前":
        # Pre-war buildings carry no year; they get a fixed age.
        return 76
    for era, offset in _ERA_AGE_OFFSETS:
        if era in label:
            digits = label.split(era)[1].split("年")[0]
            # "元" is the conventional way to write the first year of an era.
            era_year = 1.0 if digits == "元" else float(digits)
            return offset - era_year
    # Unrecognised label: report it as missing instead of reusing the value
    # computed for some other label.
    return float("nan")


def _quarter_to_decimal(label):
    """Rewrite one "取引時点" label (e.g. "2017年第４四半期") as "2017.99"."""
    if not isinstance(label, str):
        return label
    for suffix, fraction in _QUARTER_FRACTIONS.items():
        if suffix in label:
            return label.replace(suffix, fraction)
    # No quarter suffix: return unchanged so the float cast in the caller
    # either parses it or raises a clear ValueError.
    return label


def data_pre(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw transaction frame into an all-numeric feature frame.

    Steps, in order:
      1. drop columns that contain no non-null value,
      2. drop the "市区町村名" and "種類" columns,
      3. convert the station-distance and floor-area columns to float,
         replacing their textual range labels with fixed numbers,
      4. convert "建築年" (era-based year label) to a building age in years,
      5. convert "取引時点" (year + quarter label) to a decimal year,
      6. ordinal-encode the categorical columns.

    Args:
        df: Raw frame. It is not modified; a new frame is returned.

    Returns:
        The preprocessed frame, with the same rows in the same order.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a value cannot be converted to float.

    NOTE(review): a new OrdinalEncoder is fitted on every call, so the
    integer codes produced by two separate calls (e.g. one for train, one for
    test) are not guaranteed to agree. Preprocess such frames together.
    """
    # Columns without a single non-null value carry no information.
    df = df.dropna(axis=1, how="all")
    df = df.drop(columns=["市区町村名", "種類"])

    # Station distance: textual ranges are replaced by a representative
    # number of minutes.
    dis = {
        "30分?60分": 45,
        "1H?1H30": 75,
        "2H?": 120,
        "1H30?2H": 105,
    }
    df["最寄駅：距離（分）"] = df["最寄駅：距離（分）"].replace(dis).astype(float)

    # Floor area: the open-ended top bucket is capped at 2000.
    df["面積（㎡）"] = df["面積（㎡）"].replace("2000㎡以上", 2000).astype(float)

    # Construction year -> building age. The conversion is computed once per
    # distinct label and then mapped onto the column; missing values stay NaN.
    age_by_label = {
        label: _building_age(label) for label in df["建築年"].dropna().unique()
    }
    # Cast explicitly so the column is numeric regardless of how pandas
    # infers the dtype of the mapped values.
    df["建築年"] = df["建築年"].map(age_by_label).astype(float)

    # Transaction quarter -> decimal year, again computed per distinct label.
    period_by_label = {
        label: _quarter_to_decimal(label)
        for label in df["取引時点"].dropna().unique()
    }
    df["取引時点"] = df["取引時点"].map(period_by_label).astype(float)

    cols = ["都道府県名", "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]
    ce_df = ce.OrdinalEncoder(cols=cols, handle_unknown='impute')
    df = ce_df.fit_transform(df)

    return df
    
col = "取引価格（総額）_log"

# Preprocess train and test in a single data_pre() call. data_pre() fits a new
# ordinal encoder and decides which all-null columns to drop on every call, so
# preprocessing the two frames separately would give the same category
# different integer codes (and possibly different columns) in train and test.
df_test = pd.read_csv("/tmp/working/dataset/nishika/old_apartment_2020/test.csv", index_col=0)
test_index = df_test.index
n_train = len(df)
df_all = data_pre(pd.concat([df, df_test], sort=False))

# data_pre() keeps the row order, so the two parts are recovered by position.
df = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:].drop(col, axis=1)
# Restore the original test index (including its name) for the submission.
df_test.index = test_index

# Fixed seed so the validation score is comparable between runs.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

# Uses the scikit-learn style estimator with library defaults; the earlier
# commented-out xgb.train()/DMatrix variant was dead code and is dropped.
model = xgb.XGBRegressor()
rg = model.fit(train_x, train_y)

# Hold-out score; printed because a bare expression in the middle of a cell
# is not displayed.
vals = rg.predict(val_x)
print("validation MAE:", mae(val_y, vals))

# Predict the test set and write the submission (index + predicted target).
predict = rg.predict(df_test)
df_test = df_test.assign(**{col: predict})
df_test[[col]].to_csv("submit_test.csv")