In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [3]:
import pandas as pd

# Read the four input tables from the working directory (roles as suggested by
# the file names: training data, test data, sample submission, age/gender info).
train, test, sub, age = map(
    pd.read_csv,
    ('train_df_errno.csv', 'test_df.csv', 'sample_submission.csv', 'age_gender_info.csv'),
)

# Show (rows, columns) of every table as a quick sanity check.
tuple(frame.shape for frame in (train, test, sub, age))

((2896, 15), (1008, 14), (150, 2), (16, 23))

In [4]:
# Summarize the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896 entries, 0 to 2895
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        2896 non-null   object 
 1   총세대수        2896 non-null   int64  
 2   임대건물구분      2896 non-null   object 
 3   지역          2896 non-null   object 
 4   공급유형        2896 non-null   object 
 5   전용면적        2896 non-null   float64
 6   전용면적별세대수    2896 non-null   int64  
 7   공가수         2896 non-null   float64
 8   자격유형        2896 non-null   object 
 9   임대보증금       2327 non-null   object 
 10  임대료         2327 non-null   object 
 11  10분내지하철수    2685 non-null   float64
 12  10분내버스정류장수  2892 non-null   float64
 13  단지내주차면수     2896 non-null   float64
 14  등록차량수       2896 non-null   float64
dtypes: float64(6), int64(2), object(7)
memory usage: 339.5+ KB


In [5]:
# Summarize the test frame: column names, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        1008 non-null   object 
 1   총세대수        1008 non-null   int64  
 2   임대건물구분      1008 non-null   object 
 3   지역          1008 non-null   object 
 4   공급유형        1008 non-null   object 
 5   전용면적        1008 non-null   float64
 6   전용면적별세대수    1008 non-null   int64  
 7   공가수         1008 non-null   float64
 8   자격유형        1006 non-null   object 
 9   임대보증금       828 non-null    object 
 10  임대료         828 non-null    object 
 11  10분내지하철수    970 non-null    float64
 12  10분내버스정류장수  1008 non-null   float64
 13  단지내주차면수     1008 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 110.4+ KB


In [6]:
# Count missing values per column in the training frame.
train.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         569
임대료           569
10분내지하철수      211
10분내버스정류장수      4
단지내주차면수         0
등록차량수           0
dtype: int64

In [7]:
# Count missing values per column in the test frame.
test.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         180
임대료           180
10분내지하철수       38
10분내버스정류장수      0
단지내주차면수         0
dtype: int64

In [8]:
# Single-feature setup: the only predictor is the total household count ('총세대수').
sel = ['총세대수']
X_train, X_test = train[sel], test[sel]  # selecting with a list keeps these as 2-D DataFrames

# Target: registered vehicle count ('등록차량수'); a single label yields a 1-D Series.
y_train = train['등록차량수']
y_train

0       205.0
1       205.0
2       205.0
3       205.0
4       205.0
        ...  
2891    146.0
2892    146.0
2893    146.0
2894    146.0
2895    146.0
Name: 등록차량수, Length: 2896, dtype: float64

In [9]:
# X_test was already built as test[sel] in the previous cell, so it is only
# displayed here instead of being re-assigned. Because sel is a list, the
# selection is shown as a DataFrame rather than a Series.
X_test

Unnamed: 0,총세대수
0,754
1,754
2,754
3,754
4,754
...,...
1003,675
1004,382
1005,382
1006,382


In [10]:
from sklearn.linear_model import LinearRegression

# Simple linear regression on one feature: fit on the training data, then
# predict the target for every row of the test features.
model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator itself
pred = model.predict(X_test)
pred

array([519.66988273, 519.66988273, 519.66988273, ..., 414.75241208,
       414.75241208, 414.75241208])

In [11]:
# The estimator was already fitted on (X_train, y_train) before pred was computed;
# refitting on the same data would not change it, so just display the fitted model.
model

LinearRegression()

In [12]:
# Number of predictions: one per row of X_test.
len(pred)

1008

In [13]:
# Number of rows in the sample submission table.
len(sub)

150

In [14]:
# Count the distinct complex codes ('단지코드') in the test set.
# dropna=False counts a missing code as its own value, exactly as len(unique()) does.
test['단지코드'].nunique(dropna=False)

147

In [15]:
import numpy as np

# Attach the row-level predictions to the test frame as a new '등록차량수' column.
test['등록차량수'] = pred
# Broadcast each complex code's ('단지코드') mean prediction back onto all of its rows.
# The string alias 'mean' is used instead of np.mean: recent pandas emits a
# FutureWarning when a NumPy reducer is passed to groupby.transform, and the alias
# is the documented way to keep the per-group mean behavior.
test['코드별차량수평균'] = test.groupby('단지코드')['등록차량수'].transform('mean')
test.head(10)

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수,코드별차량수평균
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0,519.669883,519.669883
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0,519.669883,519.669883
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0,519.669883,519.669883
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0,519.669883,519.669883
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0,519.669883,519.669883
5,C1072,754,아파트,경기도,국민임대,51.71,51,14.0,H,43497000,296780,0.0,2.0,683.0,519.669883,519.669883
6,C1072,754,아파트,경기도,국민임대,51.96,198,14.0,H,43497000,296780,0.0,2.0,683.0,519.669883,519.669883
7,C1072,754,아파트,경기도,국민임대,51.96,67,14.0,H,43497000,296780,0.0,2.0,683.0,519.669883,519.669883
8,C1128,1354,아파트,경기도,국민임대,39.79,368,9.0,H,22830000,189840,0.0,3.0,1216.0,688.89161,688.89161
9,C1128,1354,아파트,경기도,국민임대,39.79,30,9.0,H,22830000,189840,0.0,3.0,1216.0,688.89161,688.89161


In [16]:
test.groupby('단지코드')['등록차량수'].mean()  # -- mean of '등록차량수' per group (one row per '단지코드')

단지코드
C1003     442.391961
C1006     731.479078
C1016     488.363863
C1019     397.548203
C1030     328.167295
            ...     
C2646     471.723727
C2653     554.924409
C2676    1032.411715
C2688     349.320011
C2691     522.772281
Name: 등록차량수, Length: 147, dtype: float64

In [17]:
# Distinct values among the per-'단지코드' means of '등록차량수'.
test.groupby('단지코드')['등록차량수'].mean().unique()

array([ 442.3919608 ,  731.47907752,  488.36386327,  397.54820318,
        328.16729517,  463.26264045,  486.95368221,  492.87644265,
        582.56395788,  455.92969895,  519.66988273,  427.4440416 ,
        601.46038405,  688.89160959,  375.8314149 ,  624.02328096,
        456.77580758,  673.94369039,  380.90806671,  408.26557922,
        393.59969622,  549.84775736,  397.26616697,  364.26793023,
        434.77698309,  447.46861261,  583.69210273,  549.56572115,
        475.10816133,  497.38902203,  517.13155683,  442.95603322,
        506.97825322,  433.64883825,  415.03444829,  395.00987728,
        448.87879366,  672.81554554,  438.16141763,  439.57159869,
        530.95133119,  553.23219189,  574.38490775,  481.59499419,
        514.59323093,  461.57042318,  360.60145949,  512.33694124,
        638.40712775,  466.36503877,  705.81378228,  682.40477673,
        863.75406068,  406.85539816,  667.45685752,  400.65060151,
        422.08535358,  476.8003786 ,  457.62191622,  591.30708

In [18]:
test.groupby('단지코드')['등록차량수'].transform('mean')
### Side-by-side comparison of the steps
# pred = model.predict(X_test)  - predict the target for the test features
# test['등록차량수'] = pred  - create a '등록차량수' column on the test set and store the predictions in it
# test.groupby('단지코드')['등록차량수'].transform('mean') - keeps every original row and fills each with its group's ('단지코드') mean of '등록차량수'
# test.groupby('단지코드')['등록차량수'].mean() - one mean of '등록차량수' per group ('단지코드')
# NOTE: the string alias 'mean' replaces np.mean because recent pandas emits a
# FutureWarning when a NumPy reducer is passed to groupby.transform.

0       519.669883
1       519.669883
2       519.669883
3       519.669883
4       519.669883
           ...    
1003    497.389022
1004    414.752412
1005    414.752412
1006    414.752412
1007    414.752412
Name: 등록차량수, Length: 1008, dtype: float64

In [19]:
# 중복 제거
# De-duplicate: keep only the first row for each '단지코드'.
# reset_index() without drop=True moves the old row labels into a leading
# 'index' column and renumbers the rows from 0.
test_new = (
    test
    .drop_duplicates(subset=['단지코드'], keep='first')
    .reset_index()
)
test_new

Unnamed: 0,index,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수,코드별차량수평균
0,0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0,519.669883,519.669883
1,8,C1128,1354,아파트,경기도,국민임대,39.79,368,9.0,H,22830000,189840,0.0,3.0,1216.0,688.891610,688.891610
2,17,C1456,619,아파트,부산광역시,국민임대,33.40,82,18.0,A,19706000,156200,0.0,16.0,547.0,481.594994,481.594994
3,26,C1840,593,아파트,전라북도,국민임대,39.57,253,7.0,A,14418000,108130,0.0,3.0,543.0,474.262053,474.262053
4,30,C1332,1297,아파트,경기도,국민임대,39.99,282,11.0,H,28598000,203050,0.0,2.0,1112.0,672.815546,672.815546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,982,C2456,349,아파트,제주특별자치도,국민임대,26.44,24,17.0,H,6992000,117000,0.0,4.0,270.0,405.445217,405.445217
143,986,C1266,596,아파트,충청북도,국민임대,26.94,164,35.0,H,8084000,149910,0.0,1.0,593.0,475.108161,475.108161
144,991,C2152,120,아파트,강원도,영구임대,24.83,66,9.0,C,-,-,0.0,1.0,40.0,340.858925,340.858925
145,993,C1267,675,아파트,경상남도,국민임대,24.87,28,38.0,H,6882000,104370,0.0,1.0,467.0,497.389022,497.389022


In [None]:
# Bind sub_df to the de-duplicated frame (same object as test_new, not a copy).
sub_df = test_new