# 데이터 살펴보기

In [109]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import os

print(os.getcwd())
train = pd.read_csv("data/train_df_errno.csv")
test = pd.read_csv("data/test_df.csv")
sub = pd.read_csv("data/sample_submission.csv")
age = pd.read_csv("data/age_gender_info.csv")

train.shape, test.shape, sub.shape, age.shape

C:\Users\yanghj\Desktop\dacon_parking_demand_competition


((2896, 15), (1008, 14), (150, 2), (16, 23))

In [110]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896 entries, 0 to 2895
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        2896 non-null   object 
 1   총세대수        2896 non-null   int64  
 2   임대건물구분      2896 non-null   object 
 3   지역          2896 non-null   object 
 4   공급유형        2896 non-null   object 
 5   전용면적        2896 non-null   float64
 6   전용면적별세대수    2896 non-null   int64  
 7   공가수         2896 non-null   float64
 8   자격유형        2896 non-null   object 
 9   임대보증금       2327 non-null   object 
 10  임대료         2327 non-null   object 
 11  10분내지하철수    2685 non-null   float64
 12  10분내버스정류장수  2892 non-null   float64
 13  단지내주차면수     2896 non-null   float64
 14  등록차량수       2896 non-null   float64
dtypes: float64(6), int64(2), object(7)
memory usage: 339.5+ KB


In [111]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        1008 non-null   object 
 1   총세대수        1008 non-null   int64  
 2   임대건물구분      1008 non-null   object 
 3   지역          1008 non-null   object 
 4   공급유형        1008 non-null   object 
 5   전용면적        1008 non-null   float64
 6   전용면적별세대수    1008 non-null   int64  
 7   공가수         1008 non-null   float64
 8   자격유형        1006 non-null   object 
 9   임대보증금       828 non-null    object 
 10  임대료         828 non-null    object 
 11  10분내지하철수    970 non-null    float64
 12  10분내버스정류장수  1008 non-null   float64
 13  단지내주차면수     1008 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 110.4+ KB


# 결측치가 얼마나 될까?

In [112]:
### 결측치를 확인
train.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         569
임대료           569
10분내지하철수      211
10분내버스정류장수      4
단지내주차면수         0
등록차량수           0
dtype: int64

In [113]:
test.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         180
임대료           180
10분내지하철수       38
10분내버스정류장수      0
단지내주차면수         0
dtype: int64

* 임대보증금, 임대료, 지하철역수, 버스정류장수(train only), 자격유형(test only)

In [114]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000,82940,0.0,3.0,624.0,205.0
1,C2515,545,아파트,경상남도,국민임대,39.6,60,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
2,C2515,545,아파트,경상남도,국민임대,39.6,20,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
3,C2515,545,아파트,경상남도,국민임대,46.9,38,17.0,A,18433000,149760,0.0,3.0,624.0,205.0
4,C2515,545,아파트,경상남도,국민임대,46.9,19,17.0,A,18433000,149760,0.0,3.0,624.0,205.0


In [115]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수', '등록차량수'],
      dtype='object')

In [116]:
train['자격유형'].value_counts()

A    1756
D     569
H     155
J     103
C      95
I      49
E      37
L      33
K      33
N      30
B      21
G       9
F       3
M       2
O       1
Name: 자격유형, dtype: int64

In [117]:
mapping = { 'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'H':8, 'I':9, 'J':10, 'K':11, 'L':12, 'M':13, 'N':14, 'O':15  }
train['자격유형']=train['자격유형'].map(mapping).astype(int)

In [118]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,1,9216000,82940,0.0,3.0,624.0,205.0
1,C2515,545,아파트,경상남도,국민임대,39.6,60,17.0,1,12672000,107130,0.0,3.0,624.0,205.0
2,C2515,545,아파트,경상남도,국민임대,39.6,20,17.0,1,12672000,107130,0.0,3.0,624.0,205.0
3,C2515,545,아파트,경상남도,국민임대,46.9,38,17.0,1,18433000,149760,0.0,3.0,624.0,205.0
4,C2515,545,아파트,경상남도,국민임대,46.9,19,17.0,1,18433000,149760,0.0,3.0,624.0,205.0


## LinearRegression 모델

In [124]:
#train dataset의 train과 test로 나누기
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수',  '단지내주차면수', '자격유형']
sel = ['총세대수']
X = train[sel]
y = train['등록차량수']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2172, 1), (724, 1), (2172,), (724,))

# 모델 만들기

In [125]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

model = LinearRegression().fit(X_train, y_train)  #모델 생성 후 훈련
pred = model.predict(X_test) #새로운 데이터로 예측
pred



array([ 700.01760929,  462.72795139,  519.88938484,  472.76605678,
        528.25447267,  611.62651463,  545.54232083,  411.97975194,
        532.43701658,  442.37290436,  735.15097814,  449.9014834 ,
        735.15097814,  413.6527695 ,  503.71688172,  408.35488055,
        798.72564559,  405.28768168,  591.82914012,  527.13912762,
        435.12316158,  588.76194125,  652.89428122,  560.59947891,
        534.94654292,  561.15715144,  556.416935  ,  589.87728629,
        481.96765338,  417.55647715,  459.38191626,  393.85539499,
        567.01271291,  481.96765338,  471.09303921,  922.80778162,
        498.69782903,  476.66976443,  519.88938484,  483.64067095,
        492.56343129,  519.05287606,  452.689846  ,  549.44602848,
        477.50627321,  388.27866977,  334.46327145,  507.62058937,
        492.56343129,  986.38244907,  519.88938484,  473.60256556,
        435.6808341 ,  442.65174062,  855.05057026,  863.69449435,
        985.26710403,  710.33455093,  723.71869145,  535.78305

In [126]:
model.coef_, model.intercept_

(array([0.27883626]), 309.368007980668)

# 모델 평가하기

In [127]:

#평가해보기
'''
MAE(mean absolute error) : 각각의 값에 절대값을 취해서 이를 전부 더한 후, 갯수로 나누어주기
MSE(mean squared error) : (실제값 - 예측값 ) **2 전부 더해서 개수로나누어 준다
RMSE(root mean squared error) : MSE에 루트 씌워준다
'''

######방법 1
print("방법 1")
MAE =  np.mean(abs(y_test - pred))
print("MAE:",MAE)

MSE = mean_squared_error(y_test, pred)
print("MSE:",MSE)
RMSE = mean_squared_error(y_test, pred) ** 0.5
print("RMSE:",RMSE)

dict_dat = {"실제값":y_test, "예측값":pred, "오차":y_test - pred}
dat = pd.DataFrame(dict_dat )
dat['오차절대값'] = abs(dat['오차'])
dat['오차제곱'] = dat['오차'] ** (2)

######방법 2
print("방법 2")
print("MAE:", sum(dat['오차절대값'])/len(dat['오차절대값']))
print("MSE:", sum(dat['오차제곱'])/len(dat['오차제곱']))
print("RMSE:", sum((dat['오차제곱'])/len(dat['오차제곱']))**0.5)

방법 1
MAE: 315.4678535607197
MSE: 177994.0704782866
RMSE: 421.8934349788897
방법 2
MAE: 315.4678535607197
MSE: 177994.07047828645
RMSE: 421.8934349788896


In [123]:
#결정계수 구하기
print("학습(score):", model.score(X_train,y_train)) #결정계수
print("테스트(score):", model.score(X_test,y_test))

학습(score): 0.7841317088048964
테스트(score): 0.7870026023335941
