## 직방 아파트 가격 예측 : 결측치 처리 & 전처리

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings(action='ignore')                  # Turn off the warnings.
from sklearn.linear_model import LinearRegression
import os

In [2]:
# Switch to the sibling "data" directory. The path is built with os.path.join so
# it resolves on any OS, not only where "\" is the path separator.
os.chdir(os.path.join(os.pardir, 'data'))

### 1. 데이터 불러와서 살펴보기.

In [3]:
# Load the raw transaction table; column names are inferred from the first row.
df = pd.read_csv(
    'data_zigbang.csv',
    encoding='latin',
    header='infer',
)

In [4]:
# (rows, columns) of the raw table.
df.shape

(1601458, 25)

In [5]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,...,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price
0,0,5584,1,200601,11~20,1999,47.43,6,37.585965,127.000231,...,4.0,individual,gas,91120,65.63,46,1.0,1.0,corridor,215000000
1,1,5584,1,200601,11~20,1999,44.37,8,37.585965,127.000231,...,4.0,individual,gas,91119,61.39,10,2.0,1.0,corridor,200000000
2,2,5059,1,200601,11~20,1992,54.7,8,37.580511,127.014016,...,9.0,individual,gas,8430,72.36,201,2.0,1.0,corridor,168000000


In [6]:
# Snapshot of the raw column names as a plain list.
features = [*df.columns]

In [7]:
# Print per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1601458 entries, 0 to 1601457
Data columns (total 25 columns):
key                                   1601458 non-null int64
apartment_id                          1601458 non-null int64
city                                  1601458 non-null int64
transaction_year_month                1601458 non-null int64
transaction_date                      1601458 non-null object
year_of_completion                    1601458 non-null int64
exclusive_use_area                    1601458 non-null float64
floor                                 1601458 non-null int64
latitude                              1601458 non-null float64
longitude                             1601458 non-null float64
address_by_law                        1601458 non-null int64
total_parking_capacity_in_site        1509645 non-null float64
total_household_count_in_sites        1601458 non-null int64
apartment_building_count_in_sites     1601458 non-null int64
tallest_building_in_site

### 2. 결측치 처리.

In [8]:
# Fraction of missing values in each column (mean of the null mask, column-wise).
df.isnull().mean()

key                                   0.000000
apartment_id                          0.000000
city                                  0.000000
transaction_year_month                0.000000
transaction_date                      0.000000
year_of_completion                    0.000000
exclusive_use_area                    0.000000
floor                                 0.000000
latitude                              0.000000
longitude                             0.000000
address_by_law                        0.000000
total_parking_capacity_in_site        0.057331
total_household_count_in_sites        0.000000
apartment_building_count_in_sites     0.000000
tallest_building_in_sites             0.000006
lowest_building_in_sites              0.000006
heat_type                             0.001259
heat_fuel                             0.006036
room_id                               0.000000
supply_area                           0.000000
total_household_count_of_area_type    0.000000
room_count   

#### total_parking_capacity_in_site 변수의 결측치 처리.

In [9]:
# Correlation between household count and parking capacity, to judge whether
# one can be used to predict the other.
print(
    np.round(
        df['total_household_count_in_sites'].corr(df['total_parking_capacity_in_site']),
        3,
    )
)

0.902


In [10]:
# Linear regression of parking capacity on household count, fitted only on the
# rows where parking capacity is observed.
reg = LinearRegression()
X = df['total_household_count_in_sites']
Y = df['total_parking_capacity_in_site']
# `~` is the supported operator for inverting a boolean mask (unary `-` is not).
X_train = X[~Y.isnull()]
Y_train = Y[~Y.isnull()]
# Both arrays are reshaped to column vectors; the trailing ";" hides the cell output.
reg.fit(np.array(X_train).reshape(-1,1), np.array(Y_train).reshape(-1,1));

In [11]:
# Predict for every row, then overwrite only the rows that were missing.
Y_fill = reg.predict(np.array(X).reshape(-1,1))
# np.ravel flattens the prediction in case it is a column vector, so it can be
# indexed with the 1-D missing-value mask.
# A single .loc assignment writes into df itself; chained indexing
# (df[col][mask] = ...) may write to a temporary copy instead.
df.loc[Y.isnull(), 'total_parking_capacity_in_site'] = np.ravel(Y_fill)[Y.isnull().to_numpy()]
print(df['total_parking_capacity_in_site'].isnull().sum())                # Expect 0 missing values.

0


#### room_count 변수의 결측치 처리.

In [12]:
# Correlation of room_count with each candidate predictor, one value per line.
for other in ('exclusive_use_area', 'supply_area'):
    print(np.round(df['room_count'].corr(df[other]), 3))

0.769
0.758


In [13]:
# Linear regression of room_count on the two area columns, fitted only on the
# rows where room_count is observed.
reg = LinearRegression()
X = df[['exclusive_use_area','supply_area']]
Y = df['room_count']
# `~` is the supported operator for inverting a boolean mask (unary `-` is not).
X_train = X[~Y.isnull()]
Y_train = Y[~Y.isnull()]
# The target is reshaped to a column vector; the trailing ";" hides the cell output.
reg.fit(np.array(X_train), np.array(Y_train).reshape(-1,1));

In [14]:
# Predict for every row, then overwrite only the rows that were missing,
# rounding the prediction to a whole number of rooms.
Y_fill = reg.predict(X)
# np.ravel flattens the prediction in case it is a column vector, so it can be
# indexed with the 1-D missing-value mask.
# A single .loc assignment writes into df itself; chained indexing
# (df[col][mask] = ...) may write to a temporary copy instead.
df.loc[Y.isnull(), 'room_count'] = np.ravel(Y_fill)[Y.isnull().to_numpy()].round()
print(df['room_count'].isnull().sum())                # Expect 0 missing values.

0


In [15]:
# Frequency table of room_count.
df['room_count'].value_counts()

3.0    1085967
2.0     249807
4.0     208101
1.0      34311
5.0      18468
0.0       3231
6.0       1519
7.0         38
8.0         16
Name: room_count, dtype: int64

#### bathroom_count 변수의 결측치 처리.

In [16]:
# Correlation of bathroom_count with each candidate predictor, one value per line.
for other in ('exclusive_use_area', 'supply_area', 'room_count'):
    print(np.round(df['bathroom_count'].corr(df[other]), 3))

0.648
0.654
0.565


In [17]:
# Linear regression of bathroom_count on the two area columns and room_count,
# fitted only on the rows where bathroom_count is observed.
reg = LinearRegression()
X = df[['exclusive_use_area','supply_area','room_count']]
Y = df['bathroom_count']
# `~` is the supported operator for inverting a boolean mask (unary `-` is not).
X_train = X[~Y.isnull()]
Y_train = Y[~Y.isnull()]
# The target is reshaped to a column vector; the trailing ";" hides the cell output.
reg.fit(np.array(X_train), np.array(Y_train).reshape(-1,1));

In [18]:
# Predict for every row, then overwrite only the rows that were missing,
# rounding the prediction to a whole number of bathrooms.
Y_fill = reg.predict(X)
# np.ravel flattens the prediction in case it is a column vector, so it can be
# indexed with the 1-D missing-value mask.
# A single .loc assignment writes into df itself; chained indexing
# (df[col][mask] = ...) may write to a temporary copy instead.
df.loc[Y.isnull(), 'bathroom_count'] = np.ravel(Y_fill)[Y.isnull().to_numpy()].round()
print(df['bathroom_count'].isnull().sum())                # Expect 0 missing values.

0


In [19]:
# Frequency table of bathroom_count.
df['bathroom_count'].value_counts()

2.0    926987
1.0    663278
3.0      7627
0.0      3231
4.0       334
5.0         1
Name: bathroom_count, dtype: int64

#### tallest_building_in_sites 와 lowest_building_in_sites 결측치 처리.

In [20]:
# Replace missing tallest_building_in_sites values with the rounded column mean.
avg = np.round(df.tallest_building_in_sites.mean())
i = df['tallest_building_in_sites'].isnull()
df.loc[i, 'tallest_building_in_sites'] = avg

In [21]:
# Replace missing lowest_building_in_sites values with the rounded column mean.
avg = np.round(df.lowest_building_in_sites.mean())
i = df['lowest_building_in_sites'].isnull()
df.loc[i, 'lowest_building_in_sites'] = avg

### 3. 데이터 간추리기.

In [22]:
# Remove the key column.
df = df.drop(columns=['key'])

In [23]:
# Keep only city == 1 (Seoul per the original note; city == 0, Busan, is removed),
# then drop the city column, which is constant after the filter.
df = df[df.city == 1].drop(columns=['city'])

In [24]:
# Keep rows whose transaction_year_month code is above 201600.
df = df[201600 < df.transaction_year_month]

In [25]:
# Remove the remaining columns that are not needed downstream.
df = df.drop(columns=['heat_type','heat_fuel','front_door_structure'])

In [26]:
# (rows, columns) after filtering and dropping columns.
df.shape

(301400, 20)

In [27]:
# Final tally of missing values per column.
df.isnull().sum(axis=0)

apartment_id                          0
transaction_year_month                0
transaction_date                      0
year_of_completion                    0
exclusive_use_area                    0
floor                                 0
latitude                              0
longitude                             0
address_by_law                        0
total_parking_capacity_in_site        0
total_household_count_in_sites        0
apartment_building_count_in_sites     0
tallest_building_in_sites             0
lowest_building_in_sites              0
room_id                               0
supply_area                           0
total_household_count_of_area_type    0
room_count                            0
bathroom_count                        0
transaction_real_price                0
dtype: int64

In [28]:
# Renumber the rows 0..n-1 now that filtering has left gaps in the index.
df.index = range(len(df))

### 4. 데이터 전처리.

#### transaction_year_month로 파생변수 만들기.

In [29]:
# Derived column transaction_year: the leading four digits of the integer
# transaction_year_month code (e.g. 201603 -> 2016). Vectorized integer
# division replaces a per-row str/int round trip.
df['transaction_year'] = df['transaction_year_month'] // 100

In [30]:
# Derived column transaction_month: the trailing two digits of the integer
# transaction_year_month code (e.g. 201603 -> 3). Vectorized modulo replaces
# a per-row str/int round trip.
df['transaction_month'] = df['transaction_year_month'] % 100

In [31]:
# Boolean masks selecting the rows that fall in each calendar quarter.
i1 = (1 <= df.transaction_month) & (df.transaction_month <= 3)    # Q1: Jan-Mar
i2 = (4 <= df.transaction_month) & (df.transaction_month <= 6)    # Q2: Apr-Jun
i3 = (7 <= df.transaction_month) & (df.transaction_month <= 9)    # Q3: Jul-Sep
# Q4 starts in October. A lower bound of 9 would overlap Q3 and relabel
# September as Q4 when the masks are applied in order.
i4 = (10 <= df.transaction_month) & (df.transaction_month <= 12)  # Q4: Oct-Dec

In [32]:
# Derived column transaction_quarter: start from the month values, then
# overwrite each quarter's rows with the quarter number.
df['transaction_quarter'] = df['transaction_month']
# Single .loc assignments write into df itself; chained indexing
# (df[col][mask] = ...) may write to a temporary copy instead.
df.loc[i1, 'transaction_quarter'] = 1
df.loc[i2, 'transaction_quarter'] = 2
df.loc[i3, 'transaction_quarter'] = 3
df.loc[i4, 'transaction_quarter'] = 4

#### old = transaction_year - year_of_completion 파생변수 만들기.

In [33]:
# Building age at the time of the transaction, in years.
df['old'] = df['transaction_year'] - df['year_of_completion']

#### transaction_date 변환.

In [34]:
# Masks for the day-of-month buckets stored as text in transaction_date.
i1 = df['transaction_date'] == '1~10'
i2 = df['transaction_date'] == '11~20'
# Everything that is neither of the first two buckets.
i3 = ~(i1 | i2)

In [35]:
# Replace the text buckets with one representative day number per bucket.
df['transaction_date'] = 0   # Reset to an integer column first.
# Single .loc assignments write into df itself; chained indexing
# (df[col][mask] = ...) may write to a temporary copy instead.
df.loc[i1, 'transaction_date'] = 5
df.loc[i2, 'transaction_date'] = 15
df.loc[i3, 'transaction_date'] = 25

#### 전용면적 비율.

In [36]:
# Exclusive-use area as a fraction of supply area.
df['area_ratio'] = df.exclusive_use_area / df.supply_area

#### 세대당 주차 대수.

In [37]:
# Parking capacity per household within the site.
df['parking_household'] = (
    df.total_parking_capacity_in_site / df.total_household_count_in_sites
)

#### transaction_real_price 로그로 변환된 변수.

In [38]:
# Target column: natural log of the transaction price.
df['y'] = np.log(df.transaction_real_price)

### 5. 외부 파일로 출력.

In [39]:
# Write the processed table to CSV without the row index.
df.to_csv(
    'data_zigbang_2.csv',
    index=False,
)