In [145]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import time
import itertools

In [2]:
df_x = pd.read_csv('../../DB/x_변수병합_민주.csv', encoding='CP949')
df_y = pd.read_csv('../../DB/폐기물데이터_행정구역변경_2.csv', encoding='CP949')

# Y값 = 종량제방식 등 혼합배출

In [10]:
y_1 = df_y[df_y['폐기물_종류'] == '종량제방식 등 혼합배출'].groupby(['행정구역','연도'])[['발생량']].sum().reset_index()
y_1

Unnamed: 0,행정구역,연도,발생량
0,강원_강릉시,2019,38945.5
1,강원_강릉시,2020,40579.7
2,강원_강릉시,2021,42722.3
3,강원_강릉시,2022,40274.5
4,강원_강릉시,2023,37047.4
...,...,...,...
1140,충북_충주시,2019,82052.0
1141,충북_충주시,2020,60127.2
1142,충북_충주시,2021,56905.0
1143,충북_충주시,2022,41319.7


In [15]:
y_1 = y_1[~(y_1['발생량'] == 0)]
# 855	전남_담양군	2019	0.0

In [16]:
y_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 0 to 1144
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   행정구역    1144 non-null   object 
 1   연도      1144 non-null   int64  
 2   발생량     1144 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 35.8+ KB


In [27]:
df_merged = pd.merge(df_x, y_1, on=['행정구역', '연도'], how='inner')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144 entries, 0 to 1143
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   행정구역          1144 non-null   object 
 1   시도            1144 non-null   object 
 2   시군구           1144 non-null   object 
 3   연도            1144 non-null   int64  
 4   주민등록세대수       1144 non-null   int64  
 5   총인구수          1144 non-null   int64  
 6   단독주택-계        1144 non-null   int64  
 7   아파트           1144 non-null   int64  
 8   연립주택          1144 non-null   int64  
 9   다세대주택         1144 non-null   int64  
 10  비주거용 건물 내 주택  1144 non-null   int64  
 11  단독주택비율        1144 non-null   float64
 12  아파트비율         1144 non-null   float64
 13  연립주택비율        1144 non-null   float64
 14  다세대주택비율       1144 non-null   float64
 15  비주거용주택비율      1144 non-null   float64
 16  1인가구          1144 non-null   int64  
 17  2인가구          1144 non-null   int64  
 18  3인가구          1144 non-null 

In [21]:
df_merged.isnull().sum()

행정구역            0
시도              0
시군구             0
연도              0
주민등록세대수         0
총인구수            0
단독주택-계          0
아파트             0
연립주택            0
다세대주택           0
비주거용 건물 내 주택    0
단독주택비율          0
아파트비율           0
연립주택비율          0
다세대주택비율         0
비주거용주택비율        0
1인가구            0
2인가구            0
3인가구            0
4인가구            0
5인이상가구          0
총전입             0
총전출             0
순이동             0
발생량             0
dtype: int64

In [30]:
# 비율로 변경
df_merged['1인가구비율'] = df_merged['1인가구'] / (df_merged['1인가구']+ df_merged['2인가구']+ df_merged['3인가구'] + df_merged['4인가구'] + df_merged['5인이상가구'])
df_merged['2인가구비율'] = df_merged['2인가구'] / (df_merged['1인가구']+ df_merged['2인가구']+ df_merged['3인가구'] + df_merged['4인가구'] + df_merged['5인이상가구'])
df_merged['3인가구비율'] = df_merged['3인가구'] / (df_merged['1인가구']+ df_merged['2인가구']+ df_merged['3인가구'] + df_merged['4인가구'] + df_merged['5인이상가구'])
df_merged['4인가구비율'] = df_merged['4인가구'] / (df_merged['1인가구']+ df_merged['2인가구']+ df_merged['3인가구'] + df_merged['4인가구'] + df_merged['5인이상가구'])
df_merged['5인이상가구비율'] = df_merged['5인이상가구'] / (df_merged['1인가구']+ df_merged['2인가구']+ df_merged['3인가구'] + df_merged['4인가구'] + df_merged['5인이상가구'])
df_merged.head(3)

Unnamed: 0,행정구역,시도,시군구,연도,주민등록세대수,총인구수,단독주택-계,아파트,연립주택,다세대주택,...,5인이상가구,총전입,총전출,순이동,발생량,1인가구비율,2인가구비율,3인가구비율,4인가구비율,5인이상가구비율
0,강원_강릉시,강원,강릉시,2019,99086,213442,29481,50175,2919,1470,...,3715.0,31980,30787,1193,38945.5,0.348767,0.310871,0.179677,0.120346,0.04034
1,강원_고성군,강원,고성군,2019,14445,27260,8764,2193,109,121,...,409.0,3096,3835,-739,10402.5,0.343424,0.366152,0.165649,0.088034,0.036741
2,강원_동해시,강원,동해시,2019,41141,90522,10461,24961,1657,1152,...,1580.0,10516,11039,-523,29090.5,0.291131,0.333738,0.198045,0.133457,0.04363


In [49]:
# 선정 피처
print(df_merged.columns)
df_merged_1 = df_merged[['행정구역','연도', '총인구수',
                         '단독주택비율', '아파트비율', '연립주택비율', '다세대주택비율', '비주거용주택비율', 
                         '1인가구비율', '2인가구비율', '3인가구비율', '4인가구비율', '5인이상가구비율',
                         '총전입', '순이동', '발생량']]
print(df_merged_1.info())

Index(['행정구역', '시도', '시군구', '연도', '주민등록세대수', '총인구수', '단독주택-계', '아파트', '연립주택',
       '다세대주택', '비주거용 건물 내 주택', '단독주택비율', '아파트비율', '연립주택비율', '다세대주택비율',
       '비주거용주택비율', '1인가구', '2인가구', '3인가구', '4인가구', '5인이상가구', '총전입', '총전출',
       '순이동', '발생량', '1인가구비율', '2인가구비율', '3인가구비율', '4인가구비율', '5인이상가구비율'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144 entries, 0 to 1143
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   행정구역      1144 non-null   object 
 1   연도        1144 non-null   int64  
 2   총인구수      1144 non-null   int64  
 3   단독주택비율    1144 non-null   float64
 4   아파트비율     1144 non-null   float64
 5   연립주택비율    1144 non-null   float64
 6   다세대주택비율   1144 non-null   float64
 7   비주거용주택비율  1144 non-null   float64
 8   1인가구비율    1144 non-null   float64
 9   2인가구비율    1144 non-null   float64
 10  3인가구비율    1144 non-null   float64
 11  4인가구비율    1144 non-null   float64
 12  5인이상가구비율  1144 non-nu

# 데이터 확인

In [None]:
# 히스토그램이랑, 상관관계 그리기

# 데이터 전처리
- 회귀용 - 로그변환 
- 랜덤포레스트 - 그대로 사용

In [110]:
df_merged_regg= df_merged_1.copy()
df_merged_rf= df_merged_1.copy()

In [111]:
print(df_merged_regg['총인구수'].describe())
df_merged_regg['ln총인구수'] = np.log(df_merged_regg['총인구수'])
print(df_merged_regg['ln총인구수'].describe())

count    1.144000e+03
mean     2.633696e+05
std      3.678272e+05
min      8.867000e+03
25%      5.164625e+04
50%      1.476935e+05
75%      3.415570e+05
max      2.394514e+06
Name: 총인구수, dtype: float64
count    1144.000000
mean       11.850523
std         1.125492
min         9.090092
25%        10.852169
50%        11.902890
75%        12.741270
max        14.688691
Name: ln총인구수, dtype: float64


In [112]:
print(df_merged_regg['총전입'].describe())
df_merged_regg['ln총전입'] = np.log(df_merged_regg['총전입'])
print(df_merged_regg['ln총전입'].describe())

count      1144.000000
mean      30007.634615
std       32178.870045
min         972.000000
25%        4946.250000
50%       18686.500000
75%       44890.500000
max      183258.000000
Name: 총전입, dtype: float64
count    1144.000000
mean        9.681803
std         1.212490
min         6.879356
25%         8.506382
50%         9.835556
75%        10.711981
max        12.118650
Name: ln총전입, dtype: float64


In [114]:
df_merged_regg = df_merged_regg[['행정구역', '연도', 'ln총인구수', '단독주택비율', '아파트비율', '연립주택비율', '다세대주택비율',
       '비주거용주택비율', '1인가구비율', '2인가구비율', '3인가구비율', '4인가구비율', '5인이상가구비율', 'ln총전입',
       '순이동', '발생량']]

# 데이터 split
train, valid, test 비율을 60/20/20 %   
1144(행) * 0.6 = 686.4  (686) -> 2019-2021  
1144(행) * 0.2 = 228.8  (229) -> 2022   
1144(행) * 0.2 = 228.8  (229) -> 2023  

### linear regression split

In [115]:
# df_merged_regg[df_merged_regg['연도']<2022].count()
reg_x_train = df_merged_regg[df_merged_regg['연도'] < 2022].iloc[:, 2:-1]
reg_y_train = df_merged_regg[df_merged_regg['연도'] < 2022][['발생량']]
reg_x_valid = df_merged_regg[df_merged_regg['연도'] == 2022].iloc[:, 2:-1]
reg_y_valid = df_merged_regg[df_merged_regg['연도'] == 2022][['발생량']]
reg_x_test = df_merged_regg[df_merged_regg['연도'] == 2023].iloc[:, 2:-1]
reg_y_test = df_merged_regg[df_merged_regg['연도'] == 2023][['발생량']]

- 표준화

In [130]:
scaler = StandardScaler()
# 훈련데이터로 변환
# scaler.fit(reg_x_train)
reg_x_train_s = scaler.fit_transform(reg_x_train)
reg_x_valid_s = scaler.transform(reg_x_valid)
reg_x_test_s = scaler.transform(reg_x_test)

In [131]:
# 컬럼명 유지
reg_x_train_s = pd.DataFrame(reg_x_train_s, columns = reg_x_train.columns)
reg_x_valid_s = pd.DataFrame(reg_x_valid_s, columns = reg_x_valid.columns)
reg_x_test_s = pd.DataFrame(reg_x_test_s, columns = reg_x_test.columns)

In [None]:
# 히스토그램그리기

### randomforest split

In [117]:
rf_x_train = df_merged_rf[df_merged_rf['연도'] < 2022].iloc[:, 2:-1]
rf_y_train = df_merged_rf[df_merged_rf['연도'] < 2022][['발생량']]
rf_x_valid = df_merged_rf[df_merged_rf['연도'] == 2022].iloc[:, 2:-1]
rf_y_valid = df_merged_rf[df_merged_rf['연도'] == 2022][['발생량']]
rf_x_test = df_merged_rf[df_merged_rf['연도'] == 2023].iloc[:, 2:-1]
rf_y_test = df_merged_rf[df_merged_rf['연도'] == 2023][['발생량']]

# 회귀분석

In [132]:
x_contract = sm.add_constant(reg_x_train_s)

model = sm.OLS(reg_y_train, x_contract)
ols_model = model.fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,발생량,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.805
Method:,Least Squares,F-statistic:,236.0
Date:,"Sun, 22 Jun 2025",Prob (F-statistic):,9.17e-232
Time:,00:55:12,Log-Likelihood:,-7513.2
No. Observations:,686,AIC:,15050.0
Df Residuals:,673,BIC:,15110.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.476e+04,532.408,65.285,0.000,3.37e+04,3.58e+04
ln총인구수,5.277e+04,3012.580,17.515,0.000,4.69e+04,5.87e+04
단독주택비율,4.484e+05,2.32e+06,0.194,0.847,-4.1e+06,5e+06
아파트비율,3.929e+05,2.07e+06,0.190,0.850,-3.67e+06,4.46e+06
연립주택비율,3.353e+04,1.66e+05,0.202,0.840,-2.92e+05,3.6e+05
다세대주택비율,1.517e+05,8.18e+05,0.186,0.853,-1.45e+06,1.76e+06
비주거용주택비율,1.674e+04,5.95e+04,0.282,0.778,-1e+05,1.33e+05
1인가구비율,-1638.4395,973.450,-1.683,0.093,-3549.804,272.925
2인가구비율,-3436.3608,1224.099,-2.807,0.005,-5839.872,-1032.849

0,1,2,3
Omnibus:,116.697,Durbin-Watson:,1.729
Prob(Omnibus):,0.0,Jarque-Bera (JB):,289.097
Skew:,0.888,Prob(JB):,1.6700000000000001e-63
Kurtosis:,5.639,Cond. No.,8570000000000000.0


In [133]:
# Multicollinearity
# VIF
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(reg_x_train_s.values, i) for i in range(reg_x_train_s.shape[1])]
vif['features'] = reg_x_train_s.columns
vif.sort_values(by = 'VIF', ascending = False)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF,features
10,inf,5인이상가구비율
9,inf,4인가구비율
8,inf,3인가구비율
7,inf,2인가구비율
6,inf,1인가구비율
1,18921780.0,단독주택비율
2,15128680.0,아파트비율
4,2358067.0,다세대주택비율
3,97256.4,연립주택비율
5,12469.14,비주거용주택비율


In [138]:
#Feature Selection 전진 선택법

#변수 선택법
def processSubset(x,y, feature_set):
    model = sm.OLS(y,x[list(feature_set)])
    regr = model.fit()
    AIC = regr.aic
    return {"model" : regr , "AIC" : AIC}

def forward(x,y,predictors):
    remainingPredictors = [p for p in x.columns.difference(['const'])
                           if p not in predictors]
    tic=time.time()
    results=[]
    for p in remainingPredictors:
        results.append(processSubset(x=x,y=y,feature_set=predictors+[p]+
 ['const']))
    models = pd.DataFrame(results)

    bestModel = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed ", models.shape[0],"models on", len(predictors)+1,
          "predictors in",(toc-tic))
    print("Selected predictors:",bestModel['model'].model.exog_names,
          'AIC : ',bestModel[0])
    return bestModel

In [142]:
#전진선택법 모델
def forward_model(x,y):
    fModels = pd.DataFrame(columns=["AIC","model"])
    tic = time.time()
    predictors = []
    for i in range(1,len(x.columns.difference(['const']))+1):
        forwardResult= forward(x,y,predictors)
        if i > 1:
            if forwardResult['AIC'] > fmodelBefore:
                break
        fModels.loc[i] = forwardResult
        predictors = fModels.loc[i]["model"].model.exog_names
        fmodelBefore = fModels.loc[i]["AIC"]
        predictors = [k for k in predictors if k != 'const']
    toc = time.time()
    print("Total elapesed time : ", (toc - tic), "seconds.")
    return (fModels['model'][len(fModels['model'])])

forwordBestModel=forward_model(x_contract, reg_y_train)
print(forwordBestModel.summary())

Processed  13 models on 1 predictors in 0.0501408576965332
Selected predictors: ['ln총인구수', 'const'] AIC :  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB03EABA10>
Processed  12 models on 2 predictors in 0.011603832244873047
Selected predictors: ['ln총인구수', '단독주택비율', 'const'] AIC :  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB03F08810>
Processed  11 models on 3 predictors in 0.010717391967773438
Selected predictors: ['ln총인구수', '단독주택비율', '비주거용주택비율', 'const'] AIC :  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB03E596D0>
Processed  10 models on 4 predictors in 0.008002281188964844
Selected predictors: ['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', 'const'] AIC :  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB03FA3E50>
Processed  9 models on 5 predictors in 0.009769916534423828
Selected predictors: ['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', 'ln총전입',

  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])
  'AIC : ',bestModel[0])


In [None]:
# Processed  4 models on 10 predictors in 0.0050008296966552734
# Selected predictors: ['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', 'ln총전입', '2인가구비율', '연립주택비율', '순이동', '4인가구비율', '1인가구비율', 'const']

In [146]:
#Feature Selection 후진 제거법
def backward(x,y,predictors):
    tic = time.time()
    results=[]
    for combo in itertools.combinations(predictors, len(predictors)-1):
        results.append(processSubset(x,y,list(combo)+['const']))
    models = pd.DataFrame(results)
    bestModel = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed",models.shape[0],"models on",len(predictors)-1,
          "predictors in",(toc - tic))
    print("Selected predictors :",bestModel['model'].model.exog_names,
          ' AIC:',bestModel[0])
    return bestModel

def backword_model(x,y):
    BModels = pd.DataFrame(columns=["AIC","model"])
    tic = time.time()
    predictors = x.columns.difference(['const'])
    BmodelBefore = processSubset(x,y,predictors)['AIC']
    while(len(predictors)>1):
        backwardResult=backward(x_contract, reg_y_train, predictors)
        if backwardResult['AIC'] > BmodelBefore:
            break
        BModels.loc[len(predictors)-1] = backwardResult
        predictors = BModels.loc[len(predictors)-1]["model"].model.exog_names
        BmodelBefore = backwardResult["AIC"]
        predictors = [ k for k in predictors if k != 'const']

    toc = time.time()
    print("Total elapsed time :",(toc - tic), "seconds.")
    return (BModels["model"].dropna().iloc[0])

backwardBestModel = backword_model(x_contract, reg_y_train)
print(backwardBestModel.summary())

Processed 13 models on 12 predictors in 0.03453683853149414
Selected predictors : ['1인가구비율', '2인가구비율', '3인가구비율', '4인가구비율', '5인이상가구비율', 'ln총인구수', 'ln총전입', '단독주택비율', '비주거용주택비율', '순이동', '아파트비율', '연립주택비율', 'const']  AIC: <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB0408E790>
Processed 12 models on 11 predictors in 0.011379718780517578
Selected predictors : ['1인가구비율', '2인가구비율', '3인가구비율', '4인가구비율', 'ln총인구수', 'ln총전입', '단독주택비율', '비주거용주택비율', '순이동', '아파트비율', '연립주택비율', 'const']  AIC: <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB040D6250>
Processed 11 models on 10 predictors in 0.009362459182739258
Selected predictors : ['1인가구비율', '2인가구비율', '4인가구비율', 'ln총인구수', 'ln총전입', '단독주택비율', '비주거용주택비율', '순이동', '아파트비율', '연립주택비율', 'const']  AIC: <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000001EB040D5B10>
Processed 10 models on 9 predictors in 0.006914377212524414
Selected predictors : ['1인가구비율', '2인가구비율', '4

  ' AIC:',bestModel[0])
  ' AIC:',bestModel[0])
  ' AIC:',bestModel[0])
  ' AIC:',bestModel[0])


In [149]:
new_reg_x_train_s = reg_x_train_s[['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', '연립주택비율', '1인가구비율', '2인가구비율',  '4인가구비율', 'ln총전입', '순이동']]
new_x_contract = sm.add_constant(new_reg_x_train_s)

model2 = sm.OLS(reg_y_train, new_x_contract)
ols_model2 = model2.fit()
ols_model2.summary()

0,1,2,3
Dep. Variable:,발생량,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.805
Method:,Least Squares,F-statistic:,284.0
Date:,"Sun, 22 Jun 2025",Prob (F-statistic):,3.2999999999999996e-234
Time:,01:19:13,Log-Likelihood:,-7513.2
No. Observations:,686,AIC:,15050.0
Df Residuals:,675,BIC:,15100.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.476e+04,531.635,65.380,0.000,3.37e+04,3.58e+04
ln총인구수,5.277e+04,2985.175,17.676,0.000,4.69e+04,5.86e+04
단독주택비율,9107.9275,1847.740,4.929,0.000,5479.918,1.27e+04
비주거용주택비율,5443.9894,727.073,7.488,0.000,4016.393,6871.585
다세대주택비율,-3392.6817,733.554,-4.625,0.000,-4833.004,-1952.359
연립주택비율,2040.7328,613.894,3.324,0.001,835.361,3246.105
1인가구비율,1.192e+04,3524.604,3.382,0.001,4998.146,1.88e+04
2인가구비율,8283.1041,4158.695,1.992,0.047,117.571,1.64e+04
4인가구비율,2.32e+04,6255.288,3.709,0.000,1.09e+04,3.55e+04

0,1,2,3
Omnibus:,116.836,Durbin-Watson:,1.73
Prob(Omnibus):,0.0,Jarque-Bera (JB):,290.128
Skew:,0.888,Prob(JB):,9.990000000000001e-64
Kurtosis:,5.645,Cond. No.,36.1


In [155]:
# Multicollinearity
# VIF
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(new_reg_x_train_s.values, i) for i in range(new_reg_x_train_s.shape[1])]
vif['features'] = new_reg_x_train_s.columns
vif.sort_values(by = 'VIF', ascending = False)

Unnamed: 0,VIF,features
7,138.441659,4인가구비율
6,61.19082,2인가구비율
8,46.177327,ln총전입
5,43.953434,1인가구비율
0,31.52912,ln총인구수
1,12.079638,단독주택비율
3,1.903868,다세대주택비율
2,1.870371,비주거용주택비율
9,1.399549,순이동
4,1.333396,연립주택비율


In [None]:
'ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', '연립주택비율', '1인가구비율', '2인가구비율', '순이동']]
AIC:	1.509e+04
Df Residuals:	677	BIC:	1.513e+04
0.794  0.792


[['ln총인구수', '비주거용주택비율', '다세대주택비율', '연립주택비율', '1인가구비율', '2인가구비율', '순이동']]
AIC:	1.513e+04
Df Residuals:	678	BIC:	1.517e+04

[['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', '연립주택비율', '2인가구비율', '순이동']]
AIC:	1.510e+04
Df Residuals:	678	BIC:	1.513e+04
0.793 0.791

# 다중공선성은 높고
[['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', '연립주택비율', '1인가구비율', '2인가구비율', 'ln총전입', '순이동']]
AIC:	1.506e+04
Df Residuals:	676	BIC:	1.511e+04
0.804 0.801

[['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', '연립주택비율', '2인가구비율', 'ln총전입', '순이동']]
AIC:	1.506e+04
Df Residuals:	677	BIC:	1.510e+04
0.804 0.801

In [186]:
new2_reg_x_train_s = reg_x_train_s[['ln총인구수', '단독주택비율', '비주거용주택비율', '다세대주택비율', '연립주택비율', '2인가구비율', '순이동']]
new2_x_contract = sm.add_constant(new2_reg_x_train_s)

model2 = sm.OLS(reg_y_train, new2_x_contract)
ols_model2 = model2.fit()
ols_model2.summary()

0,1,2,3
Dep. Variable:,발생량,R-squared:,0.804
Model:,OLS,Adj. R-squared:,0.801
Method:,Least Squares,F-statistic:,346.4
Date:,"Sun, 22 Jun 2025",Prob (F-statistic):,1.69e-233
Time:,01:37:56,Log-Likelihood:,-7520.9
No. Observations:,686,AIC:,15060.0
Df Residuals:,677,BIC:,15100.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.476e+04,536.785,64.753,0.000,3.37e+04,3.58e+04
ln총인구수,5.453e+04,2947.732,18.498,0.000,4.87e+04,6.03e+04
단독주택비율,6568.9309,1589.791,4.132,0.000,3447.417,9690.445
비주거용주택비율,5045.1512,714.770,7.058,0.000,3641.718,6448.584
다세대주택비율,-4397.8457,692.222,-6.353,0.000,-5757.006,-3038.686
연립주택비율,1621.2756,609.536,2.660,0.008,424.467,2818.085
2인가구비율,-5384.5690,1464.474,-3.677,0.000,-8260.026,-2509.112
ln총전입,-2.204e+04,3581.836,-6.154,0.000,-2.91e+04,-1.5e+04
순이동,1558.9337,617.581,2.524,0.012,346.329,2771.539

0,1,2,3
Omnibus:,102.886,Durbin-Watson:,1.715
Prob(Omnibus):,0.0,Jarque-Bera (JB):,230.551
Skew:,0.823,Prob(JB):,8.64e-51
Kurtosis:,5.314,Cond. No.,17.9


In [187]:
# Multicollinearity
# VIF
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(new2_reg_x_train_s.values, i) for i in range(new2_reg_x_train_s.shape[1])]
vif['features'] = new2_reg_x_train_s.columns
vif.sort_values(by = 'VIF', ascending = False)

Unnamed: 0,VIF,features
6,44.525674,ln총전입
0,30.156092,ln총인구수
1,8.771611,단독주택비율
5,7.443249,2인가구비율
2,1.773096,비주거용주택비율
3,1.662992,다세대주택비율
7,1.323694,순이동
4,1.289432,연립주택비율


In [147]:
x_contract

Unnamed: 0,const,ln총인구수,단독주택비율,아파트비율,연립주택비율,다세대주택비율,비주거용주택비율,1인가구비율,2인가구비율,3인가구비율,4인가구비율,5인이상가구비율,ln총전입,순이동
0,1.0,0.368814,-0.137654,0.404129,0.210428,-0.732867,0.762446,0.240563,-0.012730,-0.074850,-0.168015,-0.253872,0.513796,0.198471
1,1.0,-1.467169,1.353909,-1.186331,-1.024177,-0.800144,2.444557,0.138016,1.207859,-0.525859,-0.918651,-0.635139,-1.401580,-0.122959
2,1.0,-0.396443,-0.406819,0.632314,0.642037,-0.605449,0.355936,-0.865709,0.492184,0.515669,0.136570,0.094673,-0.398540,-0.087023
3,1.0,-0.661858,0.194124,-0.079207,1.063610,-0.601371,0.510130,0.628777,0.441881,-0.463016,-0.689811,-0.919298,-0.676779,-0.144421
4,1.0,-0.486983,-0.522690,0.832329,0.586832,-0.770583,0.327901,-0.220234,0.346855,0.221818,-0.137044,-0.554681,-0.222454,0.040917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,1.0,-0.062686,-0.061007,0.312774,0.130128,-0.732867,1.196992,0.723514,0.397013,-0.693676,-0.576458,-0.983755,-0.059157,-0.079203
682,1.0,-1.208570,-0.415456,0.738962,-0.000358,-0.725732,0.426024,0.720696,-0.481111,-0.450343,-0.077601,0.168426,-1.190616,-0.041104
683,1.0,-0.450750,-0.375873,0.420629,0.591850,-0.086600,-0.485119,0.457636,-0.366704,-0.281841,0.027973,0.034747,-0.354354,0.277498
684,1.0,2.218434,-0.645757,1.063332,-0.713016,-0.701267,-0.232802,0.275665,-0.938191,0.189108,0.498895,0.081778,1.621298,0.634532


In [None]:
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(reg_x_train_s.values, i) for i in range(reg_x_train_s.shape[1])]
vif['features'] = reg_x_train_s.columns
vif.sort_values(by = 'VIF', ascending = False)

In [80]:
model_reg = LinearRegression()
model_reg.fit(reg_x_train_s, reg_y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [83]:
model_reg.summary()

AttributeError: 'LinearRegression' object has no attribute 'summary'