In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('./final_eda_train.csv')

In [3]:
X = train.iloc[:, 1:-1]
y = train.iloc[:,-1:]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [6]:
model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(n_jobs=-1, random_state=42)

In [7]:
pred = model.predict(X_test)

In [8]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, pred)

154.9835433070866

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
from patsy import dmatrices
import statsmodels.api as sm;
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline

train = pd.read_csv('./final_eda_train.csv')
train.head()

Unnamed: 0,단지코드,실세대수,예상등록수,세대수/대중교통,단지평균_면적당_임대보증금,단지평균_면적당_임대료,지역_예상보유수,단지내주차면수,등록차량수
0,C2515,528.0,230.966526,5.170484,334865.630634,2871.531739,235.398388,624.0,205.0
1,C1407,1203.0,578.342118,6.399426,661899.226495,4204.613069,492.186001,1285.0,1064.0
2,C1945,749.0,399.601579,5.232445,440695.351782,3678.55283,304.048417,734.0,730.0
3,C1470,682.0,328.956632,5.831882,421573.181435,2794.616107,283.93004,645.0,553.0
4,C1898,557.0,254.463579,4.530806,401493.726676,2800.158596,231.890077,517.0,415.0


In [10]:
train['intercept'] = 1
lm = sm.OLS(train['등록차량수'], train[['실세대수','예상등록수','세대수/대중교통','단지평균_면적당_임대보증금','단지평균_면적당_임대료','지역_예상보유수','단지내주차면수']])
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,등록차량수,R-squared (uncentered):,0.935
Model:,OLS,Adj. R-squared (uncentered):,0.934
Method:,Least Squares,F-statistic:,768.6
Date:,"Sun, 19 Sep 2021",Prob (F-statistic):,5.27e-218
Time:,00:43:01,Log-Likelihood:,-2500.8
No. Observations:,382,AIC:,5016.0
Df Residuals:,375,BIC:,5043.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
실세대수,-0.2917,0.067,-4.378,0.000,-0.423,-0.161
예상등록수,1.1444,0.123,9.284,0.000,0.902,1.387
세대수/대중교통,-5.2843,5.503,-0.960,0.338,-16.105,5.536
단지평균_면적당_임대보증금,7.285e-06,2.91e-05,0.250,0.802,-4.99e-05,6.45e-05
단지평균_면적당_임대료,0.0132,0.006,2.199,0.029,0.001,0.025
지역_예상보유수,0.3429,0.167,2.056,0.040,0.015,0.671
단지내주차면수,0.4510,0.065,6.895,0.000,0.322,0.580

0,1,2,3
Omnibus:,53.622,Durbin-Watson:,1.485
Prob(Omnibus):,0.0,Jarque-Bera (JB):,460.11
Skew:,-0.127,Prob(JB):,1.23e-100
Kurtosis:,8.371,Cond. No.,382000.0


In [11]:
def data_eda(df):
    # 실세대수 변수 생성
    df['실세대수'] = df['총세대수'] - df['공가수']
    
    # 임대건물구분
    # 같은 단지내에 임대건물 구분이 2가지인 경우
    apt_shop_list = []
    apt_only_list = []
    for complex_ in df['단지코드'].unique().tolist():
        if len(df[df['단지코드'] == complex_]['임대건물구분'].unique()) != 1:
            apt_shop_list.append(complex_)
        else:
            apt_only_list.append(complex_)
    
    df = df.set_index(['단지코드'])
    # 임대건물구분이 아파트만 있는 경우 0
    df.loc[apt_only_list,'임대건물구분'] = 0
    # 임대건물구분이 아파트와 상가로 되어있는 경우 1
    df.loc[apt_shop_list,'임대건물구분'] = 1
    df = df.reset_index()
    
    
    # 주차규정
    # 구차규정에 따른 지역을 구분
    special_city = ['서울특별시']
    metropolitan_city = ['세종특별자치시', '광주광역시', '부산광역시', '울산광역시', '대전광역시', '대구광역시']
    else_city = ['경상남도', '경기도', '전라북도', '강원도', '충청남도', '제주특별자치도', '충청북도', '전라남도', '경상북도']
    
    df[['주차장_설치기준']] = df.apply(lambda x: parking_rule(x), axis = 1)
    df['전용면적별_주차기준_예상등록수'] = df['전용면적별세대수'] * df['주차장_설치기준']
    pred_by_rule = df.groupby(['단지코드'])['전용면적별_주차기준_예상등록수'].sum()
    
    df = df.set_index(['단지코드'])
    for complex_ in pred_by_rule.index:
        df.loc[complex_,'예상등록수'] = pred_by_rule[complex_]
    df = df.reset_index(['단지코드'])
    
    
    # 임대보증금과 임대료 결측치 제거
    df['임대보증금'] = df['임대보증금'].fillna(0)
    df['임대료'] = df['임대료'].fillna(0)
    df['임대보증금'] = df['임대보증금'].astype('float64')
    df['임대료'] = df['임대료'].astype('float64')
    
    # 임대보증금 / 전용면적, 임대료 / 전용면적 을 통한 면적당 임대보증금과 임대료 산출
    df['면적당_임대보증금'] = df['임대보증금'] / df['전용면적']
    df['면적당_임대료'] = df['임대료'] / df['전용면적']
    
    # 단지별 대표 임대보증금과 임대료 가중치 평균을 통해 산출
    df = df.set_index(['단지코드'])

    for complex_ in df.index.unique():
        # 단지별 면적당 임대보증금 평균값
        avg_rental_deposit = np.dot(df.loc[complex_]['면적당_임대보증금'], df.loc[complex_]['전용면적별세대수']) / df.loc[complex_]['총세대수']
        # 단지별 면적당 임대료 평균값
        avg_rental_cost = np.dot(df.loc[complex_]['면적당_임대료'], df.loc[complex_]['전용면적별세대수']) / df.loc[complex_]['총세대수']
        df.loc[complex_, '단지평균_면적당_임대보증금'] = avg_rental_deposit
        df.loc[complex_, '단지평균_면적당_임대료'] = avg_rental_cost

    df = df.reset_index()
    
    # 대중교통
    df['지하철'] = df['도보 10분거리 내 지하철역 수(환승노선 수 반영)']
    df.drop(columns = ['도보 10분거리 내 지하철역 수(환승노선 수 반영)'], axis = 1, inplace = True)
    df['버스'] = df['도보 10분거리 내 버스정류장 수']
    df.drop(columns = ['도보 10분거리 내 버스정류장 수'], axis = 1, inplace = True)
    
    df['지하철'] = df['지하철'].fillna(0)
    df['대중교통'] = df['지하철'] + df['버스']
    
    df1 = df[['단지코드', '실세대수', '대중교통']].drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
    df1['세대수/대중교통'] = np.log(df1['실세대수'] / (df1['대중교통'] + 0.0000001))
    
    # 단지를 리스트에 담기
    apt_complex = df1['단지코드'].tolist()
    # 세대수 / 대중교통 값 담기
    public_transform_ratio = df1['세대수/대중교통'].tolist()
    
    df = df.set_index(['단지코드'])
    for complex_, ratio_ in zip(apt_complex, public_transform_ratio):
        df.loc[complex_, '세대수/대중교통'] = ratio_
    df  = df.reset_index()
    
    df1 = df[['단지코드', '지역', '실세대수']].drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
    df1 = pd.merge(df1, prob_area_car, how='inner')
    df1['지역_예상보유수'] = df1['실세대수'] * df1['자동차_보유확률']
    # 단지를 리스트에 담기
    apt_complex = df1['단지코드'].tolist()
    # 세대수 / 대중교통 값 담기
    pred_car_num = df1['지역_예상보유수'].tolist()
    
    df = df.set_index(['단지코드'])
    for complex_, car_num in zip(apt_complex, pred_car_num):
        df.loc[complex_, '지역_예상보유수'] = car_num
    df  = df.reset_index()
    
    df = df[['단지코드','실세대수','예상등록수','세대수/대중교통','단지평균_면적당_임대보증금','단지평균_면적당_임대료','지역_예상보유수','단지내주차면수']].drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
    
    return df

In [14]:
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv')

In [15]:
pred = model.predict(data_eda(test))

NameError: name 'parking_rule' is not defined