In [51]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Directory that holds every CSV used in this notebook; change it in one
# place if the dataset is moved.
DATA_DIR = "../data/parking_demand_dataset"

# Load the four tables. The paths are built by plain string formatting, so
# they are identical to the previously hard-coded literals.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
age = pd.read_csv(f"{DATA_DIR}/age_gender_info.csv")

# Show the (rows, columns) shape of each table as the cell output.
train.shape, test.shape, sub.shape, age.shape

((2952, 15), (1022, 14), (150, 2), (16, 23))

In [2]:
### Inspect the column names of each dataset
for frame in (train, test, sub, age):
    # The extra newline leaves a blank line between the Index listings.
    print(frame.columns, end="\n\n")

# Rental deposit, rent, subway-station count; bus-stop count (train only),
# qualification type (test only).
# NOTE(review): presumably the columns that contain missing values - confirm.

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'],
      dtype='object')

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수'],
      dtype='object')

Index(['code', 'num'], dtype='object')

Index(['지역', '10대미만(여자)', '10대미만(남자)', '10대(여자)', '10대(남자)', '20대(여자)',
       '20대(남자)', '30대(여자)', '30대(남자)', '40대(여자)', '40대(남자)', '50대(여자)',
       '50대(남자)', '60대(여자)', '60대(남자)', '70대(여자)', '70대(남자)', '80대(여자)',
       '80대(남자)', '90대(여자)', '90대(남자)', '100대(여자)', '100대(남자)'],
      dtype='object')



In [10]:
### Inspect the column information (dtype, non-null count) of the train dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   단지코드                          2952 non-null   object 
 1   총세대수                          2952 non-null   int64  
 2   임대건물구분                        2952 non-null   object 
 3   지역                            2952 non-null   object 
 4   공급유형                          2952 non-null   object 
 5   전용면적                          2952 non-null   float64
 6   전용면적별세대수                      2952 non-null   int64  
 7   공가수                           2952 non-null   float64
 8   자격유형                          2952 non-null   object 
 9   임대보증금                         2383 non-null   object 
 10  임대료                           2383 non-null   object 
 11  도보 10분거리 내 지하철역 수(환승노선 수 반영)  2741 non-null   float64
 12  도보 10분거리 내 버스정류장 수            2948 non-null   float64
 13  단지내

In [11]:
### Count the missing values in each train column
train.isna().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              0
임대보증금                           569
임대료                             569
도보 10분거리 내 지하철역 수(환승노선 수 반영)    211
도보 10분거리 내 버스정류장 수                4
단지내주차면수                           0
등록차량수                             0
dtype: int64

In [14]:
# Preview the first rows of the train dataset.
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0


In [7]:
# Summary statistics of the train columns, transposed so that each
# described column becomes one row.
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
총세대수,2952.0,886.661247,513.540168,26.0,513.5,779.0,1106.0,2568.0
전용면적,2952.0,44.757215,31.87428,12.62,32.1,39.93,51.5625,583.4
전용면적별세대수,2952.0,102.747967,132.640159,1.0,14.0,60.0,144.0,1865.0
공가수,2952.0,12.92107,10.778831,0.0,4.0,11.0,20.0,55.0
도보 10분거리 내 지하철역 수(환승노선 수 반영),2741.0,0.176578,0.427408,0.0,0.0,0.0,0.0,3.0
도보 10분거리 내 버스정류장 수,2948.0,3.695726,2.644665,0.0,2.0,3.0,4.0,20.0
단지내주차면수,2952.0,601.66836,396.407072,13.0,279.25,517.0,823.0,1798.0
등록차량수,2952.0,559.768293,433.375027,13.0,220.0,487.0,770.0,2550.0


In [12]:

# Print the distinct values of each categorical train column, preceded by
# the column name as a label.
for column in ("임대건물구분", "지역", "자격유형"):
    print(column)
    print(train[column].unique())

임대건물구분
['아파트' '상가']
지역
['경상북도' '경상남도' '대전광역시' '경기도' '전라북도' '강원도' '광주광역시' '충청남도' '부산광역시' '제주특별자치도'
 '울산광역시' '충청북도' '전라남도' '대구광역시' '서울특별시' '세종특별자치시']
자격유형
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O']


In [5]:
# Preview the first rows of the test dataset.
test.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0


In [6]:
# Preview the first rows of the per-region age/gender table.
age.head()

Unnamed: 0,지역,10대미만(여자),10대미만(남자),10대(여자),10대(남자),20대(여자),20대(남자),30대(여자),30대(남자),40대(여자),...,60대(여자),60대(남자),70대(여자),70대(남자),80대(여자),80대(남자),90대(여자),90대(남자),100대(여자),100대(남자)
0,경상북도,0.030158,0.033195,0.056346,0.06136,0.060096,0.067859,0.053433,0.049572,0.08366,...,0.082684,0.063889,0.047717,0.030172,0.029361,0.011211,0.005578,0.001553,0.000234,1.4e-05
1,경상남도,0.0274,0.026902,0.053257,0.055568,0.06492,0.070618,0.056414,0.05755,0.077092,...,0.087201,0.069562,0.048357,0.033277,0.027361,0.011295,0.00491,0.001086,0.000179,1e-05
2,대전광역시,0.028197,0.029092,0.04049,0.042793,0.060834,0.064247,0.068654,0.066848,0.074667,...,0.088468,0.070261,0.05101,0.037143,0.032455,0.013751,0.006494,0.00174,0.000298,6.6e-05
3,경기도,0.03803,0.039507,0.052546,0.05399,0.058484,0.059894,0.072331,0.068704,0.083208,...,0.074237,0.058419,0.042422,0.032725,0.025136,0.012354,0.00539,0.001707,0.00029,6.7e-05
4,전라북도,0.028089,0.029065,0.059685,0.06008,0.066262,0.070322,0.052027,0.046596,0.077005,...,0.076636,0.068042,0.051025,0.035748,0.035049,0.012641,0.007223,0.001898,0.000158,1.3e-05


In [8]:
# Summary statistics of the age/gender columns across regions.
age.describe()

Unnamed: 0,10대미만(여자),10대미만(남자),10대(여자),10대(남자),20대(여자),20대(남자),30대(여자),30대(남자),40대(여자),40대(남자),...,60대(여자),60대(남자),70대(여자),70대(남자),80대(여자),80대(남자),90대(여자),90대(남자),100대(여자),100대(남자)
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,...,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,0.029229,0.030512,0.050039,0.05211,0.063256,0.066395,0.06125,0.06014,0.075738,0.070582,...,0.084382,0.065974,0.051411,0.035639,0.030447,0.012664,0.006226,0.001548,0.000263,7.5e-05
std,0.005938,0.006121,0.010954,0.011193,0.006266,0.005236,0.014111,0.014301,0.007822,0.005248,...,0.014841,0.008851,0.016191,0.009233,0.00832,0.003954,0.001818,0.000603,0.000105,0.000114
min,0.015323,0.015627,0.02521,0.026121,0.052712,0.056782,0.047049,0.046596,0.054216,0.060769,...,0.065279,0.051163,0.026056,0.026223,0.013893,0.007086,0.003459,0.000895,8.4e-05,0.0
25%,0.026548,0.026676,0.04751,0.048234,0.060305,0.063974,0.052629,0.051301,0.075033,0.067843,...,0.074245,0.059294,0.042506,0.030571,0.026762,0.011133,0.00538,0.001086,0.000209,1.3e-05
50%,0.029523,0.031196,0.053214,0.054779,0.063399,0.067367,0.058899,0.055473,0.077725,0.070602,...,0.07966,0.065965,0.04815,0.032907,0.028562,0.011622,0.005772,0.001504,0.000262,3.2e-05
75%,0.032665,0.03482,0.059096,0.061413,0.065703,0.070326,0.065544,0.066962,0.080158,0.073796,...,0.089979,0.070076,0.051497,0.036715,0.03272,0.012737,0.00693,0.001715,0.000303,8.9e-05
max,0.03803,0.039507,0.060094,0.063379,0.079284,0.074689,0.106121,0.104447,0.08366,0.079183,...,0.119639,0.085294,0.097543,0.06105,0.047908,0.023463,0.011344,0.003326,0.000532,0.000464


In [9]:
# Distinct region names present in the age/gender table.
age["지역"].unique()

array(['경상북도', '경상남도', '대전광역시', '경기도', '전라북도', '강원도', '광주광역시', '충청남도',
       '부산광역시', '제주특별자치도', '울산광역시', '충청북도', '전라남도', '대구광역시', '서울특별시',
       '세종특별자치시'], dtype=object)

In [16]:
# Predict the registered-vehicle count from the total household count only.
sel = ["총세대수"]

# The same single-feature selection is applied to both splits; the target
# column is taken from train.
X_train, X_test = train[sel], test[sel]
y_train = train["등록차량수"]


In [17]:
# Build the model, fit it on the training data, and predict.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)  # predictions for the test-set features
pred

array([524.31256846, 524.31256846, 524.31256846, ..., 424.88994317,
       424.88994317, 424.88994317])

### 제출형태 맞춰주기

In [32]:
# The predictions have to be brought into the submission format:
# compare the shape of test with the shape of the sample submission.
test.shape, sub.shape

((1022, 14), (150, 2))

In [46]:
# Compare the column names of test with those of the sample submission.
for frame in (test, sub):
    print(frame.columns, end="\n\n")

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수', '코드별차량수평균'],
      dtype='object')

Index(['code', 'num'], dtype='object')



In [34]:
# Number of distinct complex codes in test. dropna=False keeps the count
# identical to len(unique()), which would also count a missing value.
test['단지코드'].nunique(dropna=False)

150

In [53]:
# Attach the predictions to test as a new column.
test['등록차량수'] = pred

In [49]:
# Print the missing-value count of each test column, then preview the first rows.
print(test.isnull().sum())
test.head()


단지코드                               0
총세대수                               0
임대건물구분                             0
지역                                 0
공급유형                               0
전용면적                               0
전용면적별세대수                           0
공가수                                0
자격유형                               2
임대보증금                            180
임대료                              180
도보 10분거리 내 지하철역 수(환승노선 수 반영)      42
도보 10분거리 내 버스정류장 수                 0
단지내주차면수                            0
등록차량수                              0
코드별차량수평균                        1022
dtype: int64


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수,코드별차량수평균
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0,524.312568,
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0,524.312568,
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0,524.312568,
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0,524.312568,
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0,524.312568,


In [44]:
# Mean registered-vehicle count (here: the prediction) per complex code.
test.groupby('단지코드')['등록차량수'].mean()

단지코드
C1003     451.081925
C1006     725.028675
C1016     494.646140
C1019     408.586771
C1030     342.839551
            ...     
C2653     557.720709
C2675     459.634409
C2676    1010.200560
C2688     362.884435
C2691     527.252485
Name: 등록차량수, Length: 150, dtype: float64

In [56]:
# Store each complex code's mean predicted vehicle count on every row of
# that code. transform() returns a result aligned with test's rows, so it
# can be assigned directly as a column (a bare groupby object cannot).
# The string alias "mean" is used instead of np.mean: newer pandas releases
# emit a FutureWarning when a NumPy callable is passed here and recommend
# the alias, which selects the same groupby mean.
test["코드별차량수평균"] = test.groupby('단지코드')['등록차량수'].transform("mean")
test.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수,코드별차량수평균
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0,524.312568,524.312568
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0,524.312568,524.312568
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0,524.312568,524.312568
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0,524.312568,524.312568
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0,524.312568,524.312568


In [59]:
# Keep only the first row of every complex code.
first_rows = test.drop_duplicates(subset=['단지코드'], keep='first')
# reset_index() renumbers the rows and keeps the old row labels in an
# "index" column.
test_new = first_rows.reset_index()
# Display the result ordered by complex code (test_new itself is not reordered).
test_new.sort_values('단지코드')

Unnamed: 0,index,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수,코드별차량수평균
125,854,C1003,480,아파트,경상남도,행복주택,16.69,128,29.0,J,12000000,61000,0.0,3.0,339.0,451.081925,451.081925
11,73,C1006,1505,아파트,대전광역시,영구임대,26.37,358,27.0,C,5787000,79980,2.0,5.0,428.0,725.028675,725.028675
76,575,C1016,643,아파트,경기도,국민임대,39.64,177,4.0,A,20233000,182630,1.0,4.0,646.0,494.646140,494.646140
71,545,C1019,321,아파트,제주특별자치도,국민임대,39.84,132,7.0,A,13276000,107740,0.0,3.0,259.0,408.586771,408.586771
129,885,C1030,75,아파트,경기도,행복주택,16.39,14,12.0,J,34240000,142660,1.0,2.0,29.0,342.839551,342.839551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,646,C2653,879,아파트,경기도,국민임대,51.86,348,25.0,A,22876000,232570,0.0,16.0,802.0,557.720709,557.720709
77,579,C2675,512,아파트,경기도,국민임대,36.65,130,9.0,A,18476000,154790,0.0,3.0,1016.0,459.634409,459.634409
14,113,C2676,2572,아파트,부산광역시,영구임대,26.37,294,25.0,C,5787000,79980,0.0,2.0,418.0,1010.200560,1010.200560
128,879,C2688,150,아파트,전라남도,국민임대,26.59,44,1.0,H,4831000,86350,0.0,3.0,107.0,362.884435,362.884435


In [61]:
# Build the submission frame: complex code plus its mean predicted vehicle
# count, with the columns renamed to 'code' and 'num'.
sub_df = test_new[['단지코드', '코드별차량수평균']].rename(
    columns={'단지코드': 'code', '코드별차량수평균': 'num'}
)
sub_df

Unnamed: 0,code,num
0,C1072,524.312568
1,C1128,684.671641
2,C1456,488.231777
3,C1840,481.282884
4,C1332,669.437530
...,...,...
145,C2456,416.070194
146,C1266,482.084679
147,C2152,354.866481
148,C1267,503.198624


In [63]:
# Write the submission to CSV in the working directory, without the row index.
sub_df.to_csv('baseline_0713.csv',index=False)


In [65]:
# List the current working directory, e.g. to check that the submission CSV
# is present. `os` is imported in the notebook's import cell at the top.
os.listdir(os.getcwd())

['.git',
 '.ipynb_checkpoints',
 '01_데이터셋_살펴보기.ipynb',
 '02_matplotlib를 활용한 시각화.ipynb',
 'baseline_0713.csv',
 'README.md']