In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import codecs

In [2]:
# Read the data
# The CSV is decoded as Shift_JIS (JIS X 0213); errors='ignore' silently drops any
# bytes that cannot be decoded instead of raising.
# newline='' leaves line endings untouched so the CSV parser handles them itself.
with open('fudosan/34_Hiroshima Prefecture_20101_20204_mansion.csv', 'r',
          encoding='shift_jisx0213', errors='ignore', newline='') as csv_file:
    X = pd.read_csv(csv_file)

In [3]:
# Overview of the raw frame: per-column dtype and non-null count (shows which columns have missing values)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4085 entries, 0 to 4084
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   種類         4085 non-null   object 
 1   市区町村名      4085 non-null   object 
 2   地区名        4085 non-null   object 
 3   最寄駅：名称     4084 non-null   object 
 4   最寄駅：距離（分）  4082 non-null   float64
 5   取引価格（総額）   4085 non-null   int64  
 6   間取り        3970 non-null   object 
 7   面積（㎡）      4085 non-null   int64  
 8   築年数        4085 non-null   int64  
 9   建物の構造      3995 non-null   object 
 10  都市計画       4076 non-null   object 
 11  建ぺい率（％）    4057 non-null   float64
 12  容積率（％）     4057 non-null   float64
 13  取引時点       4085 non-null   object 
 14  改装         3814 non-null   object 
 15  取引の事情等     115 non-null    object 
dtypes: float64(3), int64(3), object(10)
memory usage: 510.8+ KB


In [4]:
# Number of distinct values in each column (i.e. the cardinality of every feature)
X.nunique()

種類             1
市区町村名         14
地区名          324
最寄駅：名称        86
最寄駅：距離（分）     34
取引価格（総額）     136
間取り           15
面積（㎡）         30
築年数           52
建物の構造          5
都市計画          12
建ぺい率（％）        4
容積率（％）        10
取引時点          44
改装             2
取引の事情等         3
dtype: int64

In [5]:
# Discard rows that have no sale price, then split the frame:
# y receives the price column, X keeps every remaining column as a predictor.
X = X.dropna(subset=['取引価格（総額）'], axis=0)
y = X['取引価格（総額）']
X = X.drop(columns=['取引価格（総額）'])

In [6]:
# Disabled experiment: the imputation code below is wrapped in a bare string literal,
# so none of it runs; the cell merely echoes the string as its output.
# NOTE(review): cat_col names '用途' and '今後の利用目的', which are not among the
# 16 columns listed by X.info() — selecting them would presumably fail if this were re-enabled.
'''
# Imputation
imputed_X = X.copy()

# Category imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_col = ['最寄駅：名称', '間取り', '建物の構造', '用途', '今後の利用目的', '都市計画', '改装']
imputed_X[cat_col] = pd.DataFrame(cat_imputer.fit_transform(imputed_X[cat_col]))

# Number imputation
num_imputer = SimpleImputer(strategy='median')
num_col = ['最寄駅：距離（分）', '築年数', '建ぺい率（％）', '容積率（％）']
imputed_X[num_col] = pd.DataFrame(num_imputer.fit_transform(imputed_X[num_col]))
'''

"\n# Imputation\nimputed_X = X.copy()\n\n# Category imputation\ncat_imputer = SimpleImputer(strategy='most_frequent')\ncat_col = ['最寄駅：名称', '間取り', '建物の構造', '用途', '今後の利用目的', '都市計画', '改装']\nimputed_X[cat_col] = pd.DataFrame(cat_imputer.fit_transform(imputed_X[cat_col]))\n\n# Number imputation\nnum_imputer = SimpleImputer(strategy='median')\nnum_col = ['最寄駅：距離（分）', '築年数', '建ぺい率（％）', '容積率（％）']\nimputed_X[num_col] = pd.DataFrame(num_imputer.fit_transform(imputed_X[num_col]))\n"

In [7]:
# Hold out 20% of the rows as a validation set; the remaining 80% is used for training.
# random_state is pinned so the same split is produced on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Disabled experiment: the column-selection code below lives inside a bare string literal,
# so low_cardinality_cols / high_cardinality_cols / numeric_cols are never actually defined.
# NOTE(review): the thresholds are "< 20" and "> 20", so an object column with exactly
# 20 unique values would land in neither list if this were re-enabled.
'''
# Select categorical columns
low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 20 and 
                        X_train[cname].dtype == "object"]
high_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() > 20 and 
                        X_train[cname].dtype == "object"]
                       
# Select numeric columns
numeric_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]
'''                        

'\n# Select categorical columns\nlow_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 20 and \n                        X_train[cname].dtype == "object"]\nhigh_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() > 20 and \n                        X_train[cname].dtype == "object"]\n                       \n# Select numeric columns\nnumeric_cols = [cname for cname in X_train.columns if X_train[cname].dtype in [\'int64\', \'float64\']]\n'

In [9]:
# One-hot encode the data (to shorten the code, we use pandas)
# X_train = pd.get_dummies(X_train, columns=low_cardinality_cols)
# X_valid = pd.get_dummies(X_valid, columns=low_cardinality_cols)
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
# Each split is encoded independently, so their dummy columns can differ. Keep exactly
# the training columns (join='left'). Indicator columns for categories that never occur
# in the validation rows are created as 0 ("category absent") rather than NaN.
# fill_value only applies to columns newly created by the alignment, so genuine
# missing values in the numeric features are left as NaN.
X_train, X_valid = X_train.align(X_valid, join='left', axis=1, fill_value=0)

In [10]:
# Label encode the data
# (Disabled: the loops below sit inside a bare string literal and never execute.)
# NOTE(review): high_cardinality_cols is only assigned inside another disabled string cell,
# and factorizing train and valid separately would presumably give integer codes that do
# not correspond between the two frames — confirm before re-enabling.
'''
for cols in high_cardinality_cols:
    X_train[cols] = pd.factorize(X_train[cols])[0]
for cols in high_cardinality_cols:
    X_valid[cols] = pd.factorize(X_valid[cols])[0]
'''

'\nfor cols in high_cardinality_cols:\n    X_train[cols] = pd.factorize(X_train[cols])[0]\nfor cols in high_cardinality_cols:\n    X_valid[cols] = pd.factorize(X_valid[cols])[0]\n'

In [11]:
# Inspect the training frame after one-hot encoding (row count, column count, dtypes, memory)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3268 entries, 3223 to 2732
Columns: 501 entries, 最寄駅：距離（分） to 取引の事情等_関係者間取引
dtypes: float64(3), int64(2), uint8(496)
memory usage: 1.7 MB


In [12]:
# Summary statistics of the training target (total transaction price)
y_train.describe()

count    3.268000e+03
mean     1.805174e+07
std      1.059953e+07
min      3.000000e+06
25%      1.100000e+07
50%      1.600000e+07
75%      2.400000e+07
max      9.500000e+07
Name: 取引価格（総額）, dtype: float64

In [13]:
# Peek at the first row of the encoded training frame
X_train.head(1)

Unnamed: 0,最寄駅：距離（分）,面積（㎡）,築年数,建ぺい率（％）,容積率（％）,種類_中古マンション等,市区町村名_三原市,市区町村名_呉市,市区町村名_尾道市,市区町村名_広島市中区,...,取引時点_2019年第３四半期,取引時点_2019年第４四半期,取引時点_2020年第１四半期,取引時点_2020年第２四半期,取引時点_2020年第３四半期,取引時点_2020年第４四半期,改装_改装済,改装_未改装,取引の事情等_調停・競売等,取引の事情等_関係者間取引
3223,13.0,70,9,80.0,600.0,1,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0


In [14]:
from xgboost import XGBRegressor

# Gradient-boosted regressor left at the library defaults; only the seed is pinned
my_model_1 = XGBRegressor(random_state=0)

# Train on the encoded training split; the fitted estimator returned by fit()
# is the last expression, so it is what the cell displays
my_model_1.fit(X=X_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
# Peek at the first validation row (its columns were aligned to X_train's in the encoding cell)
X_valid.head(1)

Unnamed: 0,最寄駅：距離（分）,面積（㎡）,築年数,建ぺい率（％）,容積率（％）,種類_中古マンション等,市区町村名_三原市,市区町村名_呉市,市区町村名_尾道市,市区町村名_広島市中区,...,取引時点_2019年第３四半期,取引時点_2019年第４四半期,取引時点_2020年第１四半期,取引時点_2020年第２四半期,取引時点_2020年第３四半期,取引時点_2020年第４四半期,改装_改装済,改装_未改装,取引の事情等_調停・競売等,取引の事情等_関係者間取引
2193,5.0,65,20,80.0,300.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
from sklearn.metrics import mean_absolute_error

# Get predictions
# Score the validation rows with the fitted model; the result is compared against y_valid in the MAE cell
predictions_1 = my_model_1.predict(X_valid)

In [16]:
# Calculate MAE
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): ground truth first,
# predictions second. MAE is symmetric, so the value is the same either way, but the
# documented order keeps the call correct if the metric is ever swapped for an asymmetric one.
mae_1 = mean_absolute_error(y_valid, predictions_1)

print("Mean Absolute Error:" , mae_1)

Mean Absolute Error: 2597357.9143206854
