## 키워드, Phase 변수 생성

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the preprocessed listings data.
df = pd.read_csv('df_all_apt(2022-02-15).csv')
df.head()

Unnamed: 0,property_ID,address_ID,listed_date,reupload,new,descrip10,description,Floorsize,Floor,House_type,...,g_unit,g_virtual,g_location,g_apt,g_activity,phase_num,highfloor,g_view2,g_unit2,nearMRT
0,1,1,2019-10-26,new,1,excellent unit sale miss 2 bed rom sale go fast,excellent unit sale miss 2 bed rom sale go fas...,930,Middle Floor,Condo,...,0,0,1,0,1,1.0,0,1,0,0.0
1,2,2,2019-10-26,new,1,great deal miss rarely avail 3 bed sale go fast,great deal miss rarely avail 3 bed sale go fas...,1143,Middle Floor,Condo,...,0,0,1,0,0,1.0,0,1,0,0.0
2,3,3,2019-10-26,new,1,canopy exe condo available single spr 21 years...,canopy exe condo available single spr 21 years...,1033,High Floor,Condo,...,1,0,0,1,1,1.0,1,1,1,0.0
3,4,4,2019-10-26,new,1,big 2 bedrooms sale sale want stay high floor ...,big 2 bedrooms sale sale want stay high floor ...,1001,High Floor,Condo,...,0,0,0,0,0,1.0,1,1,1,0.0
4,5,2,2019-10-26,new,1,3br awesome full reservoir view rare full unbl...,3br awesome full reservoir view rare full unbl...,1248,High Floor,Condo,...,1,0,0,0,0,1.0,1,1,1,0.0


In [2]:
# High-floor indicator: 1 when Floor is exactly 'High Floor', otherwise 0.
is_high_floor = df['Floor'] == 'High Floor'
df['highfloor'] = np.where(is_high_floor, 1, 0)

# Indicators for the periods when working from home became common.
# listed_date is compared against date strings.
# NOTE(review): this assumes listed_date holds 'YYYY-MM-DD' text (as in the
# preview rows), so string order equals chronological order - confirm.
listed = df['listed_date']
after_2021_05_16 = listed > '2021-05-16'
within_2020_window = (listed > '2020-06-18') & (listed < '2020-12-27')

df['WFH0516'] = np.where(after_2021_05_16, 1, 0)
df['WFH'] = np.where(within_2020_window, 1, 0)

# df[['listed_date', 'WFH0516']].sample(10)

## keyword 파생변수

In [3]:
# Create one indicator column per keyword: keyword_<kw> is True when the
# listing description contains the keyword (case-insensitive substring match).
keywords = ['park', 'green', 'school', 'pool', 'swim', 'gym', 'view',
            'high floor', 'study', 'spacious', 'mall', 'MRT']

for keyword in keywords:
    df['keyword_' + keyword] = df['description'].str.contains(keyword, case=False)

# Renovation is built separately because it covers several word forms.
# The pattern must NOT end with '|': a trailing '|' adds an empty alternative
# that matches every string, which made this flag True for all rows.
df['keyword_renovated'] = df['description'].str.contains(
    'renovate|renovated|renovation', case=False
)

In [4]:
# df.drop(columns = ['keyword_renovate', 'keyword_renovation'], inplace = True)

- `g_renovated`: renovate, maintain(maintained의 원형)
- `g_view`: unblock, river, lake, view, seaview, skyview, highfloor(condo에서 급증)
- `g_unit`: 3bedroom, balcony, space, renovate, maintain(maintained의 원형), study(condo에서 급증) 
- `g_virtual`: virtual, video

- `g_location (입지적 측면)` : mrt, near, nearby, nearest, location, walk, minute, mins, short, distance, amenities, station, central, school, shop, street, bus, opposite, centre, surround, interchange, go, accessible, supermarket, bank, connectivity
- `g_apt (단지 측면)`: facilities, community, swim, gym
- `g_activity`: swim, gym, pool

In [5]:
# Grouped keyword indicators: each g_* column is True when the description
# contains any of the group's words (case-insensitive regex alternation).

# Words whose mention rate rose in 2020 and 2021 compared with 2019.
df['g_renovated'] = df['description'].str.contains('renovate|renovated|maintain|maintained', case = False)
df['g_view'] = df['description'].str.contains('unblock|river|lake|view|seaview|skyview|highfloor', case = False)
df['g_view2'] = df['description'].str.contains('unblock|river|lake|view|seaview|skyview|highfloor|sea|seaside', case = False)
df['g_unit'] = df['description'].str.contains('3bedroom|balcony|space|renovate|renovated|maintain|maintained|study', case = False)
df['g_unit2'] = df['description'].str.contains('3bedroom|bedroom|balcony|space|renovate|renovated|maintain|maintained|study', case = False)
df['g_virtual'] = df['description'].str.contains('virtual|video', case = False)

# Words whose mention rate fell in 2020 and 2021 compared with 2019.
# 'near' and 'nearby' are separate entries; they were previously fused into the
# single string 'near, nearby', which only matched that literal text.
# NOTE(review): these are substring matches, so short terms such as 'go' also
# hit longer words (e.g. "good") - confirm that this is intended.
location_terms = ['mrt', 'near', 'nearby', 'nearest', 'location', 'walk', 'minute', 'mins', 'short',
                  'distance', 'amenities', 'station', 'central', 'school', 'shop', 'street',
                  'bus', 'opposite', 'centre', 'surround', 'interchange', 'go', 'accessible',
                  'supermarket', 'bank', 'connectivity']

# Join the terms into one alternation pattern: 'mrt|near|nearby|...'.
location = '|'.join(location_terms)
df['g_location'] = df['description'].str.contains(location, case = False)
df['g_apt'] = df['description'].str.contains('facilities|community|swim|gym', case = False)
df['g_activity'] = df['description'].str.contains('swim|gym|pool', case = False)

해당 키워드의 포함 여부를 통해 그룹핑 변수를 생성해주었습니다. 제대로 생성되었는지 확인해보겠습니다.

In [6]:
# Sanity check: rows whose description mentions 'minute' should all have g_location set.
df.loc[df['description'].str.contains('minute'), 'g_location'].value_counts()

True    4760
Name: g_location, dtype: int64

In [7]:
# Sanity check: rows whose description mentions 'unblock' should all have g_view set.
df.loc[df['description'].str.contains('unblock'), 'g_view'].value_counts()

True    28497
Name: g_view, dtype: int64

In [8]:
# Sanity check: rows whose description mentions 'maintain' should all have g_renovated set.
df.loc[df['description'].str.contains('maintain'), 'g_renovated'].value_counts()

True    8978
Name: g_renovated, dtype: int64

확인 완료!

## Phase

In [9]:
# Extract the phase number: the text before the first ')' in the phase label,
# e.g. '5) phase3' -> '5'. The result is kept as a string.
split_labels = df['phase'].str.split(')')
df['phase_num'] = [parts[0] for parts in split_labels]
df[['phase', 'phase_num']].sample(5)

Unnamed: 0,phase,phase_num
33720,5) phase3,5
13032,4) phase2,4
31484,1) before covid,1
134303,4) phase2,4
90061,4) phase2,4


In [10]:
# Thresholds used to build the T_a<threshold> indicator columns.
var = [1, 1.5, 2, 3, 4, 5, 6, 7, 10, 11]


def get_phase(x, num):
    """Return True when the threshold ``num`` is strictly greater than ``x``.

    Both arguments are converted with ``float`` before comparing, so numeric
    strings such as '5' are accepted.

    NOTE(review): the original comment read "1 if the input value is greater
    than the threshold, else 0", but the code returns True when the threshold
    exceeds the phase value - confirm the intended direction.
    """
    return float(num) > float(x)
    
# One indicator column per threshold: T_a<num> is True where num > phase_num.
for num in var:
    column_name = 'T_a' + str(num)
    flags = df['phase_num'].apply(lambda phase, threshold=num: get_phase(phase, threshold))
    df[column_name] = flags

In [11]:
df[['phase_num', 'T_a1', 'T_a1.5', 'T_a2', 'T_a3', 'T_a4', 'T_a5', 'T_a6', 'T_a7', 'T_a10', 'T_a11']].sample(5)

Unnamed: 0,phase_num,T_a1,T_a1.5,T_a2,T_a3,T_a4,T_a5,T_a6,T_a7,T_a10,T_a11
167663,5,False,False,False,False,False,False,True,True,True,True
7199,1,False,True,True,True,True,True,True,True,True,True
179748,4,False,False,False,False,False,True,True,True,True,True
236575,1,False,True,True,True,True,True,True,True,True,True
237563,1,False,True,True,True,True,True,True,True,True,True


In [12]:
# Dummy that is 1 for listings from 2020 onward and 0 for earlier years.
listed_2020_or_later = df['listed_year'] >= 2020
df['listed_year_dummy'] = np.where(listed_2020_or_later, 1, 0)
df[['listed_year', 'listed_year_dummy']].head()

Unnamed: 0,listed_year,listed_year_dummy
0,2019,0
1,2019,0
2,2019,0
3,2019,0
4,2019,0


In [13]:
# Cast every boolean column to integer (True -> 1, False -> 0).

# Names of the columns whose dtype is bool.
cols = [name for name, dtype in df.dtypes.items() if dtype == 'bool']

# Convert all of them in a single astype call.
df = df.astype({name: 'int' for name in cols})

In [None]:
# Save the augmented frame without the index column.
# NOTE: this is the same filename the first cell reads with pd.read_csv, so the
# input file is overwritten by the frame that now includes the new columns.
df.to_csv('df_all_apt(2022-02-15).csv', index = False)

In [None]:
# Keep only Condo listings, renumber the rows from 0, and export them.
is_condo = df['House_type'] == 'Condo'
df_condo = df[is_condo].reset_index(drop=True)
df_condo.to_csv('df_condo(2022-02-15).csv', index=False)

In [None]:
# Keep only HDB listings, renumber the rows from 0, and export them.
is_hdb = df['House_type'] == 'HDB'
df_HDB = df[is_hdb].reset_index(drop=True)
df_HDB.to_csv('df_HDB(2022-02-15).csv', index=False)