In [237]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from matplotlib import font_manager
import matplotlib.ticker as ticker
import seaborn as sns
import scipy as sp
from scipy import stats
import ast
from datetime import datetime, date, timedelta
from pandas.tseries.offsets import DateOffset
from scipy.stats import levene

import warnings
warnings.filterwarnings("ignore")

# Use the Malgun Gothic font so that Korean labels render in matplotlib
# figures. The path below only exists on Windows installations, so a missing
# or unreadable font file falls back to the current default family instead of
# aborting the whole notebook at the first cell.
font_fname = 'C:/Windows/Fonts/malgun.ttf'
try:
    font_family = font_manager.FontProperties(fname=font_fname).get_name()
except OSError:
    # Font file not available on this machine: keep matplotlib's default.
    font_family = plt.rcParams['font.family']
plt.rcParams['font.family'] = font_family
# Draw negative tick labels with an ASCII hyphen rather than the Unicode minus
# sign, which some CJK fonts do not contain.
plt.rcParams['axes.unicode_minus'] = False

%matplotlib inline

In [238]:
# Load the apartment listing snapshot from the working directory and print a
# per-column summary (non-null counts and dtypes).
source_csv = "apartment_20230823.csv"
apartment = pd.read_csv(source_csv)
apartment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3133 entries, 0 to 3132
Data columns (total 41 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   아파트명       3133 non-null   object 
 1   법정동주소      3133 non-null   object 
 2   위도         3133 non-null   float64
 3   경도         3133 non-null   float64
 4   세대수        3133 non-null   int64  
 5   임대세대수      2899 non-null   float64
 6   최고층        3103 non-null   float64
 7   최저층        3090 non-null   float64
 8   최대공급면적     3133 non-null   float64
 9   최소공급면적     3133 non-null   float64
 10  총아파트동수     3109 non-null   float64
 11  용적률        3133 non-null   float64
 12  건폐율        3133 non-null   float64
 13  세대평균_주차대수  3065 non-null   float64
 14  공급면적       3133 non-null   float64
 15  전용면적       3133 non-null   float64
 16  전용율        3133 non-null   float64
 17  방수         3051 non-null   float64
 18  욕실수        3051 non-null   float64
 19  현관구조       3133 non-null   object 
 20  입주예정연도  

결측치(NULL) 및 용적률·건폐율 이상값(-1) 제거

In [239]:
# Columns that must be non-null for a row to be kept; used as the `subset`
# argument of the dropna call in the next cell.
# 용적률, 건폐율, 공급면적 and 전용면적 are not listed here; the info() output above
# shows them fully populated (3133 non-null), so leaving them out drops no
# additional rows.
cols = [
    '아파트명', '법정동주소', '위도', '경도', '세대수', '임대세대수', '최고층', '최저층', '최대공급면적',
    '최소공급면적', '총아파트동수', '세대평균_주차대수', '전용율',
    '방수', '욕실수', '현관구조', '입주예정연도', '공급액(만원)', '대형건설사', '지하철역', '지하철역_거리',
    '역사명', '노선명_리스트', '1차병원', '2차병원', '3차병원', '공원', '대학', '광역', '기초', '소매',
    '음식', '교육', '장례식장', '보건의료', '유원지오락', '총인구수',
]

In [240]:
# Keep only rows that have a value in every required column.
has_required = apartment.dropna(subset=cols, how='any')

# Discard rows whose floor-area ratio (용적률) or building-coverage ratio (건폐율)
# equals -1. NOTE(review): -1 is presumably a "not available" sentinel in the
# source data; confirm with the data provider.
valid_ratio = (has_required['용적률'] != -1) & (has_required['건폐율'] != -1)

# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments in later cells write to this frame itself
# rather than to a slice (the SettingWithCopy situation).
apartment = has_required[valid_ratio].copy()
apartment.tail(3)

Unnamed: 0,아파트명,법정동주소,위도,경도,세대수,임대세대수,최고층,최저층,최대공급면적,최소공급면적,총아파트동수,용적률,건폐율,세대평균_주차대수,공급면적,전용면적,전용율,방수,욕실수,현관구조,입주예정연도,공급액(만원),대형건설사,지하철역,지하철역_거리,역사명,노선명_리스트,1차병원,2차병원,3차병원,공원,대학,광역,기초,소매,음식,교육,장례식장,보건의료,유원지오락,총인구수
3125,군포 당동 그랑시티2차,경기도 군포시 당동 749-1,37.354777,126.947194,32,0.0,18.0,17.0,81.17,64.7,1.0,798.0,58.0,0.96,73.64,62.28,85.0,3.0,2.0,계단식,2020,29300,False,군포역,0.1649,군포역,['1호선'],52,6,0,27,7,경기,군포시,6286,3015,694,4,103,276,265999
3126,군포 당동 그랑시티2차,경기도 군포시 당동 749-1,37.354777,126.947194,32,0.0,18.0,17.0,81.17,64.7,1.0,798.0,58.0,0.96,78.22,65.85,84.0,3.0,2.0,계단식,2020,28800,False,군포역,0.1649,군포역,['1호선'],52,6,0,27,7,경기,군포시,6286,3015,694,4,103,276,265999
3127,군포 당동 그랑시티2차,경기도 군포시 당동 749-1,37.354777,126.947194,32,0.0,18.0,17.0,81.17,64.7,1.0,798.0,58.0,0.96,69.67,58.51,84.0,3.0,2.0,계단식,2020,28000,False,군포역,0.1649,군포역,['1호선'],4,0,0,27,7,경기,군포시,6286,3015,694,4,103,276,265999


지하철 노선명 리스트 원핫인코딩

In [241]:
# '노선명_리스트' stores each row's subway lines as the text of a Python list
# (e.g. "['1호선']"); literal_eval turns that text into a real list.
apartment['노선명_리스트'] = apartment['노선명_리스트'].apply(ast.literal_eval)

# Every distinct line name that occurs anywhere in the data.
all_rail = set(rail for rail_list in apartment['노선명_리스트'] for rail in rail_list)

# One 0/1 indicator column per line. Iterating the set directly would create
# the columns in an order that changes between kernel sessions (string hashing
# is randomized), so the order is fixed by sorting the names first.
for one_rail in sorted(all_rail):
    apartment[f'노선_{one_rail}'] = apartment['노선명_리스트'].apply(
        lambda lines, rail=one_rail: 1 if rail in lines else 0
    )

# The list column has been fully expanded and is no longer needed.
apartment = apartment.drop(columns=['노선명_리스트'])

세대 타입 수 변수 추가

In [242]:
# Each row is one unit type of a complex, so the number of rows sharing a
# complex name is used as that complex's count of unit types ('타입').
# NOTE(review): the key is the name only, so two different complexes that
# happen to share a name would be counted together; confirm this is intended.
rows_per_complex = apartment['아파트명'].value_counts()
type_cnt = rows_per_complex.to_dict()
apartment['타입'] = apartment['아파트명'].map(type_cnt)

In [243]:
# Supply price column (unit: 10,000 KRW). Remove thousands separators from the
# text values, then store the column as integers and display it.
price_col = '공급액(만원)'
price_without_commas = apartment[price_col].replace(r"[,]", "", regex=True)
apartment[price_col] = price_without_commas.astype(int)
apartment[price_col]

0       132040
1       131280
2       131240
3       130910
4       130600
         ...  
3123     30300
3124     29800
3125     29300
3126     28800
3127     28000
Name: 공급액(만원), Length: 2335, dtype: int32

In [244]:
# Prefix each district name with its metropolitan area (e.g. '서울_중구',
# '인천_중구') so that identically named districts in different areas get
# separate dummy columns later on.
# Caution: running this cell twice prefixes the name twice.
metro_name = apartment['광역']
district_name = apartment['기초']
apartment['기초'] = metro_name + '_' + district_name

In [245]:
# Categorical columns to expand into indicator columns: metropolitan area,
# district, and expected move-in year. Each listed column is replaced by one
# column per distinct value, named '<column>_<value>'.
encoding_cols = ['광역', '기초', '입주예정연도']
for_regression = pd.get_dummies(data=apartment, columns=encoding_cols)

In [246]:
# Inspect the column labels produced by the dummy encoding.
for_regression.columns

Index(['아파트명', '법정동주소', '위도', '경도', '세대수', '임대세대수', '최고층', '최저층', '최대공급면적',
       '최소공급면적',
       ...
       '기초_인천_서구', '기초_인천_연수구', '기초_인천_중구', '입주예정연도_2020', '입주예정연도_2021',
       '입주예정연도_2022', '입주예정연도_2023', '입주예정연도_2024', '입주예정연도_2025',
       '입주예정연도_2026'],
      dtype='object', length=135)

In [247]:
# Rename columns whose labels contain parentheses, a leading digit or a space
# (presumably so they can be written as plain terms in the model formulas
# below).
column_aliases = {
    '공급액(만원)': '공급액',
    '1차병원': '병원_1차',
    '2차병원': '병원_2차',
    '3차병원': '병원_3차',
    '노선_인천지하철 1호선': '노선_인천_1호선',
    '노선_인천지하철 2호선': '노선_인천_2호선',
}
for_regression = for_regression.rename(columns=column_aliases)

In [248]:
# Pattern for text to delete from cell values: whitespace, a parenthesized
# group, commas, periods and middle dots.
noise_pattern = r"\s|\n|\t|\([^()]*\)|[,.·]"


def _strip_noise(column):
    """Return `column` with every match of `noise_pattern` removed."""
    return column.replace(noise_pattern, "", regex=True)


# Apply the cleanup column by column over the whole frame.
for_regression = for_regression.apply(_strip_noise)

# Store the count-type columns as integers.
count_cols = ['소매', '음식', '교육', '보건의료', '유원지오락', '총인구수']
for_regression[count_cols] = for_regression[count_cols].astype(int)

In [249]:
# Save the regression-ready frame (without the index column) as UTF-8 CSV.
for_regression.to_csv("reg_data.csv", index=False, encoding='UTF-8')

In [250]:
# Plain Python list of every column name in the regression frame, displayed
# for reference when choosing model terms.
variable_list = [column_name for column_name in for_regression.columns]
variable_list

['아파트명',
 '법정동주소',
 '위도',
 '경도',
 '세대수',
 '임대세대수',
 '최고층',
 '최저층',
 '최대공급면적',
 '최소공급면적',
 '총아파트동수',
 '용적률',
 '건폐율',
 '세대평균_주차대수',
 '공급면적',
 '전용면적',
 '전용율',
 '방수',
 '욕실수',
 '현관구조',
 '공급액',
 '대형건설사',
 '지하철역',
 '지하철역_거리',
 '역사명',
 '병원_1차',
 '병원_2차',
 '병원_3차',
 '공원',
 '대학',
 '소매',
 '음식',
 '교육',
 '장례식장',
 '보건의료',
 '유원지오락',
 '총인구수',
 '노선_진접선',
 '노선_8호선',
 '노선_경강선',
 '노선_경춘선',
 '노선_신분당선',
 '노선_3호선',
 '노선_수인선',
 '노선_경의중앙선',
 '노선_우이신설선',
 '노선_분당선',
 '노선_김포골드라인',
 '노선_인천_1호선',
 '노선_의정부선',
 '노선_에버라인',
 '노선_인천_2호선',
 '노선_신림선',
 '노선_6호선',
 '노선_2호선',
 '노선_1호선',
 '노선_서해선',
 '노선_9호선',
 '노선_4호선',
 '노선_5호선',
 '노선_인천국제공항선',
 '노선_7호선',
 '타입',
 '광역_경기',
 '광역_서울',
 '광역_인천',
 '기초_경기_가평군',
 '기초_경기_고양시',
 '기초_경기_과천시',
 '기초_경기_광명시',
 '기초_경기_광주시',
 '기초_경기_구리시',
 '기초_경기_군포시',
 '기초_경기_김포시',
 '기초_경기_남양주시',
 '기초_경기_동두천시',
 '기초_경기_부천시',
 '기초_경기_성남시',
 '기초_경기_수원시',
 '기초_경기_시흥시',
 '기초_경기_안산시',
 '기초_경기_안성시',
 '기초_경기_안양시',
 '기초_경기_양주시',
 '기초_경기_양평군',
 '기초_경기_여주시',
 '기초_경기_연천군',
 '기초_경기_오산시',
 '기초_경기_용인시',
 '기초_경기_의왕시',
 

In [251]:
# Show all column names joined by '+', i.e. a paste-ready list of candidate
# right-hand-side terms for a model formula.
'+'.join(variable_list)

'아파트명+법정동주소+위도+경도+세대수+임대세대수+최고층+최저층+최대공급면적+최소공급면적+총아파트동수+용적률+건폐율+세대평균_주차대수+공급면적+전용면적+전용율+방수+욕실수+현관구조+공급액+대형건설사+지하철역+지하철역_거리+역사명+병원_1차+병원_2차+병원_3차+공원+대학+소매+음식+교육+장례식장+보건의료+유원지오락+총인구수+노선_진접선+노선_8호선+노선_경강선+노선_경춘선+노선_신분당선+노선_3호선+노선_수인선+노선_경의중앙선+노선_우이신설선+노선_분당선+노선_김포골드라인+노선_인천_1호선+노선_의정부선+노선_에버라인+노선_인천_2호선+노선_신림선+노선_6호선+노선_2호선+노선_1호선+노선_서해선+노선_9호선+노선_4호선+노선_5호선+노선_인천국제공항선+노선_7호선+타입+광역_경기+광역_서울+광역_인천+기초_경기_가평군+기초_경기_고양시+기초_경기_과천시+기초_경기_광명시+기초_경기_광주시+기초_경기_구리시+기초_경기_군포시+기초_경기_김포시+기초_경기_남양주시+기초_경기_동두천시+기초_경기_부천시+기초_경기_성남시+기초_경기_수원시+기초_경기_시흥시+기초_경기_안산시+기초_경기_안성시+기초_경기_안양시+기초_경기_양주시+기초_경기_양평군+기초_경기_여주시+기초_경기_연천군+기초_경기_오산시+기초_경기_용인시+기초_경기_의왕시+기초_경기_의정부시+기초_경기_이천시+기초_경기_파주시+기초_경기_평택시+기초_경기_포천시+기초_경기_하남시+기초_경기_화성시+기초_서울_강남구+기초_서울_강동구+기초_서울_강북구+기초_서울_강서구+기초_서울_관악구+기초_서울_광진구+기초_서울_구로구+기초_서울_노원구+기초_서울_도봉구+기초_서울_동대문구+기초_서울_동작구+기초_서울_서대문구+기초_서울_서초구+기초_서울_성동구+기초_서울_성북구+기초_서울_송파구+기초_서울_양천구+기초_서울_영등포구+기초_서울_은평구+기초_서울_종로구+기초_서울_중구+기초_서울_중랑구+기초_인천_강화군+기초_인천_계양구+기초_인천_남동구+기초_인천_동구+기초_인천_미추홀구+기초_인천_부평구+기초_인천_서

In [252]:
# Print the index range, column count, dtype breakdown and memory usage of the
# regression frame.
for_regression.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2335 entries, 0 to 3127
Columns: 135 entries, 아파트명 to 입주예정연도_2026
dtypes: bool(73), float64(17), int32(7), int64(33), object(5)
memory usage: 1.2+ MB


In [253]:
# First specification: supply price (공급액) regressed on complex attributes,
# subway-line indicators, area/district indicators and move-in-year indicators.
# The backslash continuations sit inside the string literal, so the leading
# indentation of each continued line becomes part of the formula text.
formula = '공급액 ~ 임대세대수+최대공급면적+최소공급면적+건폐율+타입+전용율+방수+욕실수+대형건설사+지하철역_거리+병원_3차+공원+대학+\
    소매+노선_경강선+노선_신림선+노선_1호선+노선_3호선+노선_신분당선+노선_인천_1호선+노선_인천국제공항선+노선_수인선+노선_인천_2호선+노선_진접선\
        +노선_9호선+노선_에버라인+노선_2호선+광역_경기+광역_인천+기초_경기_가평군+기초_경기_과천시+기초_경기_광명시+기초_경기_성남시+기초_경기_안양시+\
            기초_경기_오산시+기초_경기_의왕시+기초_경기_파주시+기초_서울_강남구+기초_서울_노원구+기초_서울_동작구+기초_서울_서초구+기초_서울_송파구+\
                기초_서울_은평구+기초_서울_중랑구+기초_인천_계양구+기초_인천_남동구+기초_인천_부평구+기초_인천_서구+\
                    입주예정연도_2020+입주예정연도_2021+입주예정연도_2022+입주예정연도_2023+입주예정연도_2024'
        
# Fit ordinary least squares through the statsmodels formula API and print the
# full summary table. `model` and `result` are reused by the VIF cell below.
model = smf.ols(formula, for_regression)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                    공급액   R-squared:                       0.727
Model:                            OLS   Adj. R-squared:                  0.721
Method:                 Least Squares   F-statistic:                     114.6
Date:                Wed, 23 Aug 2023   Prob (F-statistic):               0.00
Time:                        14:55:56   Log-Likelihood:                -26015.
No. Observations:                2335   AIC:                         5.214e+04
Df Residuals:                    2281   BIC:                         5.245e+04
Df Model:                          53                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -6.143e+04   8

In [254]:
# Show every column when displaying wide frames from here on.
pd.set_option('display.max_columns', None)

# Variance inflation factor of each regressor in the fitted model. The VIF is
# computed against the full design matrix (intercept column included), but the
# intercept itself is left out of the table.
design_matrix = result.model.exog
vif_records = []
for position, column in enumerate(result.model.exog_names):
    if column == 'Intercept':
        continue
    vif_records.append({'컬럼': column, 'VIF': variance_inflation_factor(design_matrix, position)})

df = pd.DataFrame(vif_records)
df.sort_values(by='VIF')

Unnamed: 0,컬럼,VIF
49,노선_진접선,1.095217
6,기초_경기_성남시[T.True],1.136885
5,기초_경기_광명시[T.True],1.137138
9,기초_경기_의왕시[T.True],1.16476
12,기초_서울_노원구[T.True],1.169612
3,기초_경기_가평군[T.True],1.180985
7,기초_경기_안양시[T.True],1.184581
17,기초_서울_중랑구[T.True],1.187002
44,노선_신분당선,1.189226
41,노선_신림선,1.201013


변수 수정 (설명변수 재선정 후 재적합)

In [256]:
# Revised specification. The regressors are kept as an explicit list so that
# changes between model iterations are easy to see and duplicates can be
# checked. The previous formula string listed 기초_서울_서초구 twice; the repeat
# added no extra regressor (the fitted model reported Df Model = 59, the number
# of distinct terms), so it is listed once here.
# NOTE(review): the repeated entry may have been a typo for another district;
# confirm that no district is missing.
terms = [
    # complex attributes and surroundings
    '세대수', '공급면적', '건폐율', '대형건설사', '지하철역_거리', '병원_3차', '공원', '타입',
    # subway-line indicators
    '노선_1호선', '노선_2호선', '노선_3호선', '노선_6호선', '노선_9호선', '노선_수인선', '노선_신림선',
    '노선_신분당선', '노선_에버라인', '노선_우이신설선', '노선_인천국제공항선', '노선_인천_2호선', '노선_진접선',
    # district indicators
    '기초_경기_가평군', '기초_경기_고양시', '기초_경기_과천시', '기초_경기_광명시', '기초_경기_성남시',
    '기초_경기_안산시', '기초_경기_안양시', '기초_경기_양주시', '기초_경기_여주시', '기초_경기_오산시',
    '기초_경기_의왕시', '기초_경기_의정부시', '기초_경기_이천시', '기초_경기_파주시',
    '기초_서울_강남구', '기초_서울_강동구', '기초_서울_강북구', '기초_서울_강서구', '기초_서울_광진구',
    '기초_서울_구로구', '기초_서울_도봉구', '기초_서울_동대문구', '기초_서울_동작구', '기초_서울_서대문구',
    '기초_서울_서초구', '기초_서울_성동구', '기초_서울_성북구', '기초_서울_영등포구', '기초_서울_은평구',
    '기초_서울_종로구', '기초_인천_남동구', '기초_인천_부평구', '기초_인천_연수구',
    # move-in-year indicators
    '입주예정연도_2020', '입주예정연도_2023', '입주예정연도_2024', '입주예정연도_2025', '입주예정연도_2026',
]

# Fail early if a term is accidentally listed more than once.
repeated_terms = sorted({term for term in terms if terms.count(term) > 1})
assert not repeated_terms, f"duplicate regressors in formula: {repeated_terms}"

formula = '공급액 ~ ' + ' + '.join(terms)

# Fit ordinary least squares and print the full summary table. `model` and
# `result` are reused by the VIF cell below.
model = smf.ols(formula, for_regression)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                    공급액   R-squared:                       0.880
Model:                            OLS   Adj. R-squared:                  0.877
Method:                 Least Squares   F-statistic:                     282.1
Date:                Wed, 23 Aug 2023   Prob (F-statistic):               0.00
Time:                        14:55:57   Log-Likelihood:                -25058.
No. Observations:                2335   AIC:                         5.024e+04
Df Residuals:                    2275   BIC:                         5.058e+04
Df Model:                          59                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -4.025e+04   1

In [257]:
# Variance inflation factor of each regressor in the revised model. The VIF is
# computed against the full design matrix (intercept column included), but the
# intercept itself is left out of the table.
design_matrix = result.model.exog
vif_records = []
for position, column in enumerate(result.model.exog_names):
    if column == 'Intercept':
        continue
    vif_records.append({'컬럼': column, 'VIF': variance_inflation_factor(design_matrix, position)})

df = pd.DataFrame(vif_records)
df.sort_values(by='VIF')

Unnamed: 0,컬럼,VIF
24,기초_서울_서대문구[T.True],1.026986
9,기초_경기_여주시[T.True],1.028852
21,기초_서울_도봉구[T.True],1.051377
1,기초_경기_가평군[T.True],1.057822
53,노선_신분당선,1.061346
4,기초_경기_광명시[T.True],1.068517
52,노선_신림선,1.070877
31,기초_인천_남동구[T.True],1.083443
11,기초_경기_의왕시[T.True],1.085861
5,기초_경기_성남시[T.True],1.088407


최종 모델

In [262]:
# Final specification. Compared with the previous model it adds the Seoul
# indicator (광역_서울) and drops 기초_경기_의정부시, 기초_서울_성북구 and
# 기초_서울_은평구. The regressors are kept as an explicit list so duplicates can
# be checked: the previous formula string listed 기초_서울_서초구 twice, which
# added no extra regressor (the fitted model reported Df Model = 57, the number
# of distinct terms), so it is listed once here.
# NOTE(review): the repeated entry may have been a typo for another district;
# confirm that no district is missing.
terms = [
    # complex attributes, surroundings and metropolitan area
    '세대수', '공급면적', '건폐율', '대형건설사', '지하철역_거리', '병원_3차', '공원', '타입', '광역_서울',
    # subway-line indicators
    '노선_1호선', '노선_2호선', '노선_3호선', '노선_6호선', '노선_9호선', '노선_수인선', '노선_신림선',
    '노선_신분당선', '노선_에버라인', '노선_우이신설선', '노선_인천국제공항선', '노선_인천_2호선', '노선_진접선',
    # district indicators
    '기초_경기_가평군', '기초_경기_고양시', '기초_경기_과천시', '기초_경기_광명시', '기초_경기_성남시',
    '기초_경기_안산시', '기초_경기_안양시', '기초_경기_양주시', '기초_경기_여주시', '기초_경기_오산시',
    '기초_경기_의왕시', '기초_경기_이천시', '기초_경기_파주시',
    '기초_서울_강남구', '기초_서울_강동구', '기초_서울_강북구', '기초_서울_강서구', '기초_서울_광진구',
    '기초_서울_구로구', '기초_서울_도봉구', '기초_서울_동대문구', '기초_서울_동작구', '기초_서울_서대문구',
    '기초_서울_서초구', '기초_서울_성동구', '기초_서울_영등포구',
    '기초_서울_종로구', '기초_인천_남동구', '기초_인천_부평구', '기초_인천_연수구',
    # move-in-year indicators
    '입주예정연도_2020', '입주예정연도_2023', '입주예정연도_2024', '입주예정연도_2025', '입주예정연도_2026',
]

# Fail early if a term is accidentally listed more than once.
repeated_terms = sorted({term for term in terms if terms.count(term) > 1})
assert not repeated_terms, f"duplicate regressors in formula: {repeated_terms}"

formula = '공급액 ~ ' + ' + '.join(terms)

# Fit ordinary least squares and print the full summary table. `result` is
# reused by the VIF cell and by the cell that writes the summary to disk.
model = smf.ols(formula, for_regression)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                    공급액   R-squared:                       0.885
Model:                            OLS   Adj. R-squared:                  0.882
Method:                 Least Squares   F-statistic:                     306.1
Date:                Wed, 23 Aug 2023   Prob (F-statistic):               0.00
Time:                        14:58:56   Log-Likelihood:                -25010.
No. Observations:                2335   AIC:                         5.014e+04
Df Residuals:                    2277   BIC:                         5.047e+04
Df Model:                          57                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -4.195e+04   1

In [263]:
# Variance inflation factor of each regressor in the final model. The VIF is
# computed against the full design matrix (intercept column included), but the
# intercept itself is left out of the table.
design_matrix = result.model.exog
vif_records = []
for position, column in enumerate(result.model.exog_names):
    if column == 'Intercept':
        continue
    vif_records.append({'컬럼': column, 'VIF': variance_inflation_factor(design_matrix, position)})

df = pd.DataFrame(vif_records)
df.sort_values(by='VIF')

Unnamed: 0,컬럼,VIF
10,기초_경기_여주시[T.True],1.027403
24,기초_서울_서대문구[T.True],1.043797
2,기초_경기_가평군[T.True],1.054663
51,노선_신분당선,1.061309
5,기초_경기_광명시[T.True],1.072458
29,기초_인천_남동구[T.True],1.085057
6,기초_경기_성남시[T.True],1.085172
12,기초_경기_의왕시[T.True],1.093425
7,기초_경기_안산시[T.True],1.094484
30,기초_인천_부평구[T.True],1.127554


In [None]:
# Write the plain-text summary of the most recently fitted model to disk.
output_filename = 'regression_result.txt'
# The summary contains Korean variable names. Without an explicit encoding,
# open() uses the platform's locale encoding, which can garble the file or
# raise UnicodeEncodeError; UTF-8 also matches the CSV export earlier in the
# notebook.
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write(result.summary().as_text())