### 学習データの加工
- 欠損率が40%を超えるデータを削除
- 国土交通省のデータを追加


### 0.import library and set configuration

In [28]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd

from scipy.spatial import cKDTree

import pickle
import unicodedata
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides diagnostics raised by the
# libraries used below — confirm which warnings were meant to be silenced.
warnings.filterwarnings("ignore")

In [29]:
#import_original_function
from function.data_processing import test2
from function.data_processing import addtional_process
from function.modeling import objective

In [30]:
# pandas display settings
# Raise the column/row display limits.
pd.options.display.max_columns = 300
pd.options.display.max_rows = 500
# Show floats with two fixed decimals instead of exponent notation.
pd.set_option('display.float_format', '{:.2f}'.format)

In [31]:
def convert_gep_df(df):
    """Convert the ``lat``/``lon`` columns of ``df`` into a point geometry.

    Args:
        df: Frame that holds ``lat`` and ``lon`` columns.

    Returns:
        A GeoDataFrame with ``lat``/``lon`` removed, a geometry column built
        from them and the CRS set to "EPSG:6668". If anything fails, the
        error is printed and ``df`` itself is returned unchanged.
    """
    try:
        attributes = df.drop(columns=["lat","lon"])
        # points_from_xy takes x (longitude) first, then y (latitude).
        points = gpd.points_from_xy(df.lon, df.lat)
        geo_frame = gpd.GeoDataFrame(attributes, geometry=points, crs="EPSG:6668")
    except Exception as e:
        # Best-effort: report the problem and hand back the input as-is.
        print(f"エラーが発生しました: {e}")
        return df
    return geo_frame

In [32]:
def add_geo_parameter(original_df, geo_df, geo_dic,mode):
    """Add one polygon-derived column to a point GeoDataFrame.

    Every row of ``original_df`` is matched with the polygons of ``geo_df``
    that contain its point. The columns named in
    ``geo_dic[mode]["reference_col"]`` are summed per match (missing values
    count as 0) and stored in the column ``geo_dic[mode]["new_col"]``.
    Rows that lie in no polygon get 0.

    Args:
        original_df: GeoDataFrame of points. It is not modified.
        geo_df: GeoDataFrame of polygons holding the reference columns.
        geo_dic: Mapping ``mode -> {"new_col": str, "reference_col": list}``.
        mode: Key of ``geo_dic`` selecting which definition to apply.

    Returns:
        A copy of ``original_df`` (same rows, order and index) with
        ``new_col`` added.

    Raises:
        KeyError: If ``mode`` (or one of its two keys) is not in ``geo_dic``.
        ValueError: If a reference column is missing from ``geo_df``.
    """
    new_col = geo_dic[mode]["new_col"]
    reference_col = list(geo_dic[mode]["reference_col"])

    # Validate before the spatial join so a wrong column name fails at once
    # instead of after the expensive join.
    for col in reference_col:
        if col not in geo_df.columns:
            raise ValueError(f"Geo DataFrameに'{col}'列がありません")

    df = original_df.copy()

    # Join only what is needed: the point geometries (on a positional index)
    # against the reference columns plus the polygon geometry. Keeping the
    # other columns out of the join avoids the _left/_right renaming that
    # sjoin applies to column names present on both sides.
    points = df[[df.geometry.name]].reset_index(drop=True)
    polygons = geo_df[reference_col + [geo_df.geometry.name]]

    # sjoin compares raw coordinates, so bring the polygons into the CRS of
    # the points when both are known and differ.
    if (
        points.crs is not None
        and polygons.crs is not None
        and points.crs != polygons.crs
    ):
        polygons = polygons.to_crs(points.crs)

    joined = gpd.sjoin(points, polygons, how="left", predicate="within")

    per_match_total = joined[reference_col].fillna(0).sum(axis=1)
    # A point inside several overlapping polygons appears once per match.
    # Collapse to one value per input row (the largest) so the row count of
    # the result always equals that of the input.
    per_row_total = per_match_total.groupby(level=0).max()

    # groupby sorts by the positional index, so the values line up with df.
    df[new_col] = per_row_total.to_numpy()
    return df

In [33]:
# Load the configuration file.
import configparser
config_ini = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open without raising, so check
# the list of files it actually parsed and stop here rather than failing
# with a KeyError when the first setting is looked up.
loaded_config_files = config_ini.read('./config/config.ini', encoding='utf-8')
if not loaded_config_files:
    raise FileNotFoundError("設定ファイルを読み込めませんでした: ./config/config.ini")
loaded_config_files

['./config/config.ini']

In [34]:
# File paths read from the configuration.
_input_files = config_ini["INPUT_FILE"]
test_file = _input_files["test"]
train_file = _input_files["train"]
test_edited_file = _input_files["test_edited"]
train_edited_file = _input_files["train_edited"]

deginition_file = config_ini["CONFIG_FILE"]["deginition"]

_output_files = config_ini["OUTPUT_FILE"]
submit_file = _output_files["submit"]
shap_plot_file = _output_files["shap_plot"]
model_file = _output_files["model"]
dbscan_model_file = _output_files["dbscan_model"]

# Variable settings read from the configuration.
response_var = config_ini["VAR"]["response_var"]


### 1.import_train/test_data


In [35]:
# Load the sheet that defines how the data is to be shaped.
df_deginition = pd.read_excel(
    deginition_file,
    sheet_name="data_deginition",
    index_col=0,
)

In [36]:
# Read the raw train/test CSV files; every column is loaded as object dtype.
_csv_options = {"header": 0, "sep": ",", "dtype": "object"}
df_train_raw = pd.read_csv(train_file, **_csv_options)
df_test_raw = pd.read_csv(test_file, **_csv_options)

### 2.data_processing

#### 2-1.select_variables
- 学習データの欠損率が40％を超える変数を削除

In [37]:
# Keep only the definitions whose missing rate is at most 40% and that are
# not marked "〇" in the 国土交通省データ column.
# The two conditions are combined into a single mask: chaining two [...]
# selections applies a mask built on the unfiltered frame to the already
# filtered one, which pandas has to re-align to the new index.
keep_definition = (df_deginition["データ欠損率(%)"] <= 40) & (
    df_deginition["国土交通省データ"] != "〇"
)
df_deginition = df_deginition[keep_definition]
# Columns used for training.
target_columns = df_deginition["本番データ特徴量名"].values.tolist()
# Restrict both raw frames to those columns; the response variable is
# excluded from the test-frame selection.
df_train = df_train_raw[target_columns]
df_test = df_test_raw[[col for col in target_columns if col != response_var]]

In [38]:
# Report the number of rows in each frame.
for _label, _frame in (("df_train", df_train), ("df_test", df_test)):
    print(_label, len(_frame))

df_train 584507
df_test 384540


#### 2-2.add variables
- 以下の国土交通省のデータを追記する（すでに存在している場合はスキップされる）
  - XX


In [39]:
# Convert the latitude/longitude columns of both frames into geometries.
df_train, df_test = map(convert_gep_df, (df_train, df_test))

In [40]:
# How each external geo dataset is read: the column to create ("new_col")
# and the source columns that are summed into it ("reference_col").
geo_dic = dict(
    population=dict(
        new_col="population_prediction",
        reference_col=["PT5_2030", "PT6_2030"],
    ),
    flood=dict(
        new_col="flood_depth_rank",
        reference_col=["A31b_101"],
    ),
)


##### 2-2-1.人口推移カラム追加

In [41]:
# Load the population shapefile; its attribute table is decoded as cp932.
shp_path = "./input/国土数値情報データ/人口推移/500m_mesh_2018_14.shp"
gdf_population = gpd.read_file(shp_path, encoding='cp932')  # read the shapefile

In [42]:
# Add the column defined under geo_dic["population"] to both frames.
df_train = add_geo_parameter(
    original_df=df_train, geo_df=gdf_population, geo_dic=geo_dic, mode="population"
)
df_test = add_geo_parameter(
    original_df=df_test, geo_df=gdf_population, geo_dic=geo_dic, mode="population"
)

##### 2-2-2.ハザードマップ情報追加

In [43]:
# Load the flood hazard shapefile; its attribute table is decoded as cp932.
# (A plain string is enough here: the path has no placeholders.)
shp_path = "./input/国土数値情報データ/洪水浸水想定区域_計画規模/A31b-10-23_10_5339.shp"
gdf_flood_hazard = gpd.read_file(shp_path, encoding="cp932")

In [44]:
# Add the column defined under geo_dic["flood"] to both frames.
_flood_args = (gdf_flood_hazard, geo_dic, "flood")
df_train = add_geo_parameter(df_train, *_flood_args)
df_test = add_geo_parameter(df_test, *_flood_args)

In [45]:
# Row counts after the spatial joins.
print(f"df_train {len(df_train)}")
print(f"df_test {len(df_test)}")

df_train 584507
df_test 384540


##### 2-2-3.交通利便性情報追加

In [46]:
# Load the per-station passenger shapefile; attributes are decoded as cp932.
# (A plain string is enough here: the path has no placeholders.)
shp_path = "./input/国土数値情報データ/駅別乗降客数/S12-23_NumberOfPassengers.shp"
gdf_npass = gpd.read_file(shp_path, encoding="cp932")

In [47]:
# Columns for 2022, renamed to the names used in this notebook.
_station_columns = {
    "S12_001": "eki_name1",
    "S12_050": "重複コード",
    "S12_051": "データ有無",
    "S12_053": "number_of_passenger",
}
gdf_npass_2022 = gdf_npass[list(_station_columns)].rename(columns=_station_columns)

# Keep rows whose two flag columns equal 1 and whose passenger count is not 0.
_usable = (
    (gdf_npass_2022["重複コード"] == 1)
    & (gdf_npass_2022["データ有無"] == 1)
    & (gdf_npass_2022["number_of_passenger"] != 0)
)

# One row per station name (the first occurrence is kept).
# NOTE(review): stations that share a name are collapsed into a single row
# here — confirm that matching on the name alone is intended.
gdf_npass_2022 = gdf_npass_2022.loc[
    _usable, ["eki_name1", "number_of_passenger"]
].drop_duplicates("eki_name1")


In [48]:
# Attach number_of_passenger to every row via its eki_name1 value.
# validate="many_to_one" makes pandas raise a MergeError if the station
# table ever holds the same eki_name1 twice; without it such duplicates
# would silently multiply the rows of the train/test frames.
df_train = pd.merge(
    df_train, gdf_npass_2022, on="eki_name1", how="left", validate="many_to_one"
)
df_test = pd.merge(
    df_test, gdf_npass_2022, on="eki_name1", how="left", validate="many_to_one"
)

In [53]:
# Row counts after the station merge.
print("df_train", df_train.shape[0])
print("df_test", df_test.shape[0])

df_train 584507
df_test 384540


##### 2-2-4.駅と地価情報追加

In [50]:
def add_nearest_point_info(A,B,new_col,reference_col):
    """Copy a column from the nearest point of ``B`` onto every row of ``A``.

    "Nearest" is the Euclidean distance between the raw x/y values of the
    two geometry columns; no reprojection is done here.

    Args:
        A: Frame with point geometries. It is modified in place.
        B: Frame with point geometries that holds ``reference_col``.
        new_col: Name of the column written to ``A``.
        reference_col: Column of ``B`` whose value is copied.

    Returns:
        ``A`` itself, with ``new_col`` added.
    """
    # (n, 2) coordinate arrays of both frames.
    A_coords = np.column_stack((A.geometry.x, A.geometry.y))
    B_coords = np.column_stack((B.geometry.x, B.geometry.y))

    # Nearest neighbour of every point of A among the points of B.
    tree = cKDTree(B_coords)
    _, indices = tree.query(A_coords, k=1)

    # query() returns row positions within B_coords, so select by position
    # (iloc) rather than by label, and assign plain values so pandas does
    # not re-align them on A's index. Both matter as soon as either frame
    # has anything other than a default 0..n-1 index.
    A[new_col] = B[reference_col].iloc[indices].to_numpy()

    return A

In [51]:
# Load the land price shapefile; its attribute table is decoded as cp932.
shp_path = "./input/国土数値情報データ/公示地価_神奈川/L01-24_14.shp"
gdf_land_price = gpd.read_file(shp_path, encoding='cp932')  # read the shapefile

In [52]:
# Store column L01_008 of the nearest land-price point as "land_price".
df_train = add_nearest_point_info(
    A=df_train, B=gdf_land_price, new_col="land_price", reference_col="L01_008"
)
df_test = add_nearest_point_info(
    A=df_test, B=gdf_land_price, new_col="land_price", reference_col="L01_008"
)

#### 3.データ出力

In [54]:
# Remove the geometry column before writing the frames out.
df_train = df_train.drop(columns="geometry")
df_test = df_test.drop(columns="geometry")

In [55]:
# Write both processed frames to the configured output paths
# (to_csv defaults are used, so the index is written as the first column).
for _frame, _path in ((df_train, train_edited_file), (df_test, test_edited_file)):
    _frame.to_csv(_path)

In [76]:
# Display the processed test frame.
# NOTE(review): the rendered output stored below looks like it comes from an
# earlier run — its row labels go past the 384540 rows printed above and its
# passenger column has a different name than the one created in this
# notebook. Re-run the cell before relying on it.
df_test

Unnamed: 0,target_ym,building_id,building_status,building_create_date,building_modify_date,building_type,building_name,homes_building_name,homes_building_name_ruby,unit_count,full_address,building_structure,floor_count,year_built,building_tag_id,unit_id,unit_name,room_floor,dwelling_unit_window_angle,room_count,unit_area,floor_plan_code,unit_tag_id,bukken_id,snapshot_create_date,new_date,snapshot_modify_date,timelimit_date,flg_open,flg_own,bukken_type,empty_contents,post1,post2,addr1_1,addr1_2,addr2_name,addr4_name,nl,el,rosen_name1,eki_name1,walk_distance1,rosen_name2,eki_name2,walk_distance2,house_area,flg_new,room_kaisuu,snapshot_window_angle,madori_number_all,madori_kind_all,money_kyoueki,money_kyoueki_tax,parking_money,parking_kubun,genkyo_code,usable_status,convenience_distance,super_distance,statuses,money_hoshou_company,population_prediction,flood_depth_rank,乗降客数,land_price
0,202207,a372446,1,2014-06-27 20:30:14,2024-02-09 00:10:05,3,シャーメゾンエランA,シャーメゾンELAN A棟,シャーメゾンエラン Aトウ,6,山口県山陽小野田市大字西高泊618-12,10,2,201101,210301/321001/210101/210401/320901,23270592,202,2,6,2,62.5099983,250,340201/230103/310501/230201/230501/220501/3401...,37220460000756,2022-04-18 00:00:00,2022-04-18 00:00:00,2022-07-25 14:00:26,2022-08-01 00:00:00,1,1,3102,A0202,756,57,35,216,大字西高泊,,122414490,472240000,JR山陽本線,小野田,160,,,,62,0,2,5,2,50,3000,3,,1,2,1,,,110102/110903/121002/210301/220101/220201/2203...,【個人契約】 初回契約事務手数料：22，000円（税込）、月額保証料：賃料等の1％,0.00,0.00,2324.00,135000
1,202301,a276097,1,2014-06-28 02:47:45,2024-08-15 02:24:34,1,マルコフォート,マルコフォート,マルコフオート,15,東京都調布市菊野台3丁目3-3,4,3,198703,210101/321001/320901/210301/210201,853089,303,3,5,1,19.4400005,120,230801/240104/310501/263101/290101/230201/2605...,1038270025409,2023-01-30 00:00:00,2023-01-30 00:00:00,2023-01-30 16:44:45,2023-02-06 00:00:00,1,1,3101,303,182,7,13,208,菊野台３丁目,3-3,128339891,502460485,京王線,柴崎,240,京王線,つつじヶ丘,800,19,0,3,5,1,20,2000,3,0,4,3,3,190,30,210201/220501/230102/240104/290902/260101/2904...,,0.00,0.00,15817.00,384000
2,202307,a290359,1,2014-06-27 21:18:44,2024-04-01 19:59:13,3,リバーサイドハウス松浪,リバーサイドハウス,リバーサイドハウス,,大阪府泉佐野市下瓦屋1丁目2-48,10,2,198904,210302/210202/320901/321001/210101,23809363,101,1,5,3,63,350,290101/340401/223101/220401/220201/220101/2302...,1398630020406,2023-06-24 00:00:00,2023-06-24 00:00:00,2023-07-27 06:01:56,2023-08-03 00:00:00,1,1,3101,101,598,62,27,213,下瓦屋１丁目,2-48,123895830,487192181,南海線,井原里,240,南海線,鶴原,1440,61,0,1,3,3,50,0,3,0,1,2,1,1256,867,110301/110902/121002/210101/210202/210301/2201...,初回保証委託料は総賃料の50％要,0.00,0.00,2878.00,135000
3,202207,a200797,1,2014-06-27 20:22:54,2023-04-03 00:10:05,3,サンリットA棟,サン リットA,,,愛媛県松山市吉藤5丁目4-23,1,2,200401,210101/321001/210302/210202,6759472,201,2,5,1,46.2799988,150,220301/340102/230501/250301/220201/220101/2401...,1434580009728,2022-06-09 00:00:00,2022-06-09 00:00:00,2022-07-04 22:56:18,2022-07-11 00:00:00,1,0,3102,201,791,8011,38,201,吉藤5丁目,4-23,121939371,477951158,伊予鉄道環状線(JR松山駅経由),本町六丁目,720,,,,46,0,2,5,1,50,3500,,3300,1,2,3,,,121002/210101/210202/220101/220201/220301/2205...,,0.00,0.00,790.00,135000
4,202207,a200797,1,2014-06-27 20:22:54,2023-04-03 00:10:05,3,サンリットA棟,サン リットA,,,愛媛県松山市吉藤5丁目4-23,1,2,200401,210101/321001/210302/210202,6759472,201,2,5,1,46.2799988,150,220301/340102/230501/250301/220201/220101/2401...,1434580009728,2022-06-09 00:00:00,2022-06-09 00:00:00,2022-07-04 22:56:18,2022-07-11 00:00:00,1,0,3102,201,791,8011,38,201,吉藤5丁目,4-23,121939371,477951158,伊予鉄道環状線(JR松山駅経由),本町六丁目,720,,,,46,0,2,5,1,50,3500,,3300,1,2,3,,,121002/210101/210202/220101/220201/220301/2205...,,0.00,0.00,790.00,135000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499820,202201,a342254,1,2014-06-28 01:19:41,2024-08-15 03:48:49,1,ジェイシティ八丁堀,ジェイシティ八丁堀,ジェイシティハッチョウボリ,67,広島県広島市中区八丁堀12-10,5,13,200511,210201/210101/310201/320901/321101/321001/3101...,205130,207,2,1,1,25.5,120,230101/253501/220301/220101/340102/310501/2536...,1231540074731,2021-12-01 00:00:00,2021-12-01 00:00:00,2022-01-30 23:04:38,2022-02-06 00:00:00,1,0,3101,0207,730,13,34,101,八丁堀,12-10,123811615,476876161,広島電鉄1系統,八丁堀,320,広島電鉄1系統,立町,320,25.5,0,2,1,1,20,3000,3,33000,1,2,2,188,315,110902/331001/210201/220201/220301/220501/2301...,初回　総賃料の30％ 月額　総賃料の1.5％,0.00,0.00,2020.00,135000
499821,202201,a342254,1,2014-06-28 01:19:41,2024-08-15 03:48:49,1,ジェイシティ八丁堀,ジェイシティ八丁堀,ジェイシティハッチョウボリ,67,広島県広島市中区八丁堀12-10,5,13,200511,210201/210101/310201/320901/321101/321001/3101...,205130,207,2,1,1,25.5,120,230101/253501/220301/220101/340102/310501/2536...,1231540074731,2021-12-01 00:00:00,2021-12-01 00:00:00,2022-01-30 23:04:38,2022-02-06 00:00:00,1,0,3101,0207,730,13,34,101,八丁堀,12-10,123811615,476876161,広島電鉄1系統,八丁堀,320,広島電鉄1系統,立町,320,25.5,0,2,1,1,20,3000,3,33000,1,2,2,188,315,110902/331001/210201/220201/220301/220501/2301...,初回　総賃料の30％ 月額　総賃料の1.5％,0.00,0.00,8618.00,135000
499822,202201,a342254,1,2014-06-28 01:19:41,2024-08-15 03:48:49,1,ジェイシティ八丁堀,ジェイシティ八丁堀,ジェイシティハッチョウボリ,67,広島県広島市中区八丁堀12-10,5,13,200511,210201/210101/310201/320901/321101/321001/3101...,205130,207,2,1,1,25.5,120,230101/253501/220301/220101/340102/310501/2536...,1231540074731,2021-12-01 00:00:00,2021-12-01 00:00:00,2022-01-30 23:04:38,2022-02-06 00:00:00,1,0,3101,0207,730,13,34,101,八丁堀,12-10,123811615,476876161,広島電鉄1系統,八丁堀,320,広島電鉄1系統,立町,320,25.5,0,2,1,1,20,3000,3,33000,1,2,2,188,315,110902/331001/210201/220201/220301/220501/2301...,初回　総賃料の30％ 月額　総賃料の1.5％,0.00,0.00,89641.00,135000
499823,202201,a342254,1,2014-06-28 01:19:41,2024-08-15 03:48:49,1,ジェイシティ八丁堀,ジェイシティ八丁堀,ジェイシティハッチョウボリ,67,広島県広島市中区八丁堀12-10,5,13,200511,210201/210101/310201/320901/321101/321001/3101...,205130,207,2,1,1,25.5,120,230101/253501/220301/220101/340102/310501/2536...,1231540074731,2021-12-01 00:00:00,2021-12-01 00:00:00,2022-01-30 23:04:38,2022-02-06 00:00:00,1,0,3101,0207,730,13,34,101,八丁堀,12-10,123811615,476876161,広島電鉄1系統,八丁堀,320,広島電鉄1系統,立町,320,25.5,0,2,1,1,20,3000,3,33000,1,2,2,188,315,110902/331001/210201/220201/220301/220501/2301...,初回　総賃料の30％ 月額　総賃料の1.5％,0.00,0.00,54782.00,135000
