In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import pandas as pd

In [3]:
# Read the four input tables in one pass: training rows, test rows,
# the sample submission and the age/gender table.
train, test, sub, age = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_df_errno.csv',
        'test_df.csv',
        'sample_submission.csv',
        'age_gender_info.csv',
    )
)

# Quick sanity check of (rows, columns) for every table.
train.shape, test.shape, sub.shape, age.shape

((2896, 15), (1008, 14), (150, 2), (16, 23))

In [4]:
# Column dtypes and non-null counts of the training table.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896 entries, 0 to 2895
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        2896 non-null   object 
 1   총세대수        2896 non-null   int64  
 2   임대건물구분      2896 non-null   object 
 3   지역          2896 non-null   object 
 4   공급유형        2896 non-null   object 
 5   전용면적        2896 non-null   float64
 6   전용면적별세대수    2896 non-null   int64  
 7   공가수         2896 non-null   float64
 8   자격유형        2896 non-null   object 
 9   임대보증금       2327 non-null   object 
 10  임대료         2327 non-null   object 
 11  10분내지하철수    2685 non-null   float64
 12  10분내버스정류장수  2892 non-null   float64
 13  단지내주차면수     2896 non-null   float64
 14  등록차량수       2896 non-null   float64
dtypes: float64(6), int64(2), object(7)
memory usage: 339.5+ KB


In [5]:
# Column dtypes and non-null counts of the test table.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        1008 non-null   object 
 1   총세대수        1008 non-null   int64  
 2   임대건물구분      1008 non-null   object 
 3   지역          1008 non-null   object 
 4   공급유형        1008 non-null   object 
 5   전용면적        1008 non-null   float64
 6   전용면적별세대수    1008 non-null   int64  
 7   공가수         1008 non-null   float64
 8   자격유형        1006 non-null   object 
 9   임대보증금       828 non-null    object 
 10  임대료         828 non-null    object 
 11  10분내지하철수    970 non-null    float64
 12  10분내버스정류장수  1008 non-null   float64
 13  단지내주차면수     1008 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 110.4+ KB


In [6]:
# Number of missing values per column in the training table.
train.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         569
임대료           569
10분내지하철수      211
10분내버스정류장수      4
단지내주차면수         0
등록차량수           0
dtype: int64

In [7]:
# Number of missing values per column in the test table.
test.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         180
임대료           180
10분내지하철수       38
10분내버스정류장수      0
단지내주차면수         0
dtype: int64

In [10]:
# NOTE(review): `pred` is not assigned in any visible earlier cell (execution
# counts jump from 7 to 10) — confirm this cell survives Restart & Run All.
len(pred)

1008

In [11]:
# Number of rows in the sample submission.
len(sub)

150

In [13]:
# Distinct complex codes in test (NaN, if any, counted once).
# Three codes are missing and still have to be filled in.
test['단지코드'].nunique(dropna=False)

147

### 결측치 처리

#### 데이터 결합

In [15]:
# Stack train and test into one frame for the missing-value handling.
# join='inner' keeps only the columns present in both frames.
# NOTE(review): ignore_index is not set, so train and test row labels overlap
# in all_df; a label-based lookup such as all_df.loc[196] can match two rows.
all_df = pd.concat([train, test], join='inner')
all_df

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000,82940,0.0,3.0,624.0
1,C2515,545,아파트,경상남도,국민임대,39.60,60,17.0,A,12672000,107130,0.0,3.0,624.0
2,C2515,545,아파트,경상남도,국민임대,39.60,20,17.0,A,12672000,107130,0.0,3.0,624.0
3,C2515,545,아파트,경상남도,국민임대,46.90,38,17.0,A,18433000,149760,0.0,3.0,624.0
4,C2515,545,아파트,경상남도,국민임대,46.90,19,17.0,A,18433000,149760,0.0,3.0,624.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,C1267,675,아파트,경상남도,행복주택,36.77,126,38.0,L,-,-,0.0,1.0,467.0
1004,C2189,382,아파트,전라북도,국민임대,29.19,96,45.0,H,6872000,106400,0.0,2.0,300.0
1005,C2189,382,아파트,전라북도,국민임대,29.19,20,45.0,H,6872000,106400,0.0,2.0,300.0
1006,C2189,382,아파트,전라북도,국민임대,39.45,202,45.0,H,13410000,144600,0.0,2.0,300.0


In [16]:
# Number of missing values per column in the combined frame.
all_df.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         749
임대료           749
10분내지하철수      249
10분내버스정류장수      4
단지내주차면수         0
dtype: int64

#### 자격유형(test) 결측치 처리

In [17]:
# Distinct region names in the combined frame.
all_df['지역'].unique()

array(['경상남도', '대전광역시', '경기도', '전라북도', '강원도', '광주광역시', '충청남도', '부산광역시',
       '제주특별자치도', '울산광역시', '충청북도', '전라남도', '경상북도', '대구광역시', '서울특별시',
       '세종특별자치시'], dtype=object)

In [18]:
# Show the rows whose 자격유형 value is missing.
all_df[all_df['자격유형'].isna()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000,44770,0.0,2.0,173.0


In [19]:
# Group rows by complex code, building type, region and supply type, then
# pull out the group for complex C2411 (one of the two complexes that has a
# row with a missing 자격유형) to see what its other rows use.
complex_keys = ['단지코드', '임대건물구분', '지역', '공급유형']
grouped = all_df.groupby(complex_keys)
group1 = grouped.get_group(('C2411', '아파트', '경상남도', '국민임대'))
group1

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
193,C2411,962,아파트,경상남도,국민임대,39.43,56,25.0,A,11992000,100720,0.0,2.0,840.0
194,C2411,962,아파트,경상남도,국민임대,39.72,336,25.0,A,11992000,100720,0.0,2.0,840.0
195,C2411,962,아파트,경상남도,국민임대,39.82,179,25.0,A,11992000,100720,0.0,2.0,840.0
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0
197,C2411,962,아파트,경상남도,국민임대,51.93,150,25.0,A,21586000,171480,0.0,2.0,840.0


In [20]:
# Same inspection for complex C2253, the other complex with a missing
# 자격유형 row. Relies on `grouped` built in the previous cell.
group2 = grouped.get_group(('C2253', '아파트', '강원도', '영구임대'))
group2

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000,44770,0.0,2.0,173.0
259,C2253,1161,아파트,강원도,영구임대,31.32,239,0.0,C,3731000,83020,0.0,2.0,173.0
260,C2253,1161,아파트,강원도,영구임대,31.32,149,0.0,C,3731000,83020,0.0,2.0,173.0


In [21]:
# Fill the two missing 자격유형 values with the value used by the other rows
# of the same complex (C2411 -> 'A', C2253 -> 'C').
#
# all_df is built by pd.concat without ignore_index, so the labels 196 and 258
# each occur twice (once from train, once from test). Assigning through
# all_df.loc[196, ...] would therefore also overwrite the unrelated train
# rows. Target the rows by "value is missing AND complex code matches"
# instead; this touches only the intended rows and is safe to re-run.
qualification_missing = all_df['자격유형'].isnull()
all_df.loc[qualification_missing & (all_df['단지코드'] == 'C2411'), '자격유형'] = 'A'
all_df.loc[qualification_missing & (all_df['단지코드'] == 'C2253'), '자격유형'] = 'C'

In [23]:
all_df.loc[all_df['자격유형'].isnull()]  # verify that no missing 자격유형 values remain

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수


In [24]:
# List the distinct qualification-type codes before encoding them.
print(all_df['자격유형'].unique())

['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O']


In [25]:
# Ordinal-encode 자격유형: 'A' -> 1, 'B' -> 2, ..., 'O' -> 15.
# Re-running this cell on already-encoded values would map every entry to NaN
# and fail in astype(int), so run it once per fresh all_df.
mapping = {letter: code for code, letter in enumerate('ABCDEFGHIJKLMNO', start=1)}
encoded_qualification = all_df['자격유형'].map(mapping)
all_df['자격유형'] = encoded_qualification.astype(int)
print(all_df['자격유형'].unique())

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


#### 10분내버스정류장수(train) 결측치 처리

In [26]:
# Re-check the per-column missing counts after filling 자격유형.
all_df.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         749
임대료           749
10분내지하철수      249
10분내버스정류장수      4
단지내주차면수         0
dtype: int64

In [28]:
# Show the rows whose 10분내버스정류장수 value is missing.
bus_stops_missing = all_df['10분내버스정류장수'].isna()
all_df[bus_stops_missing]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
2293,N2431,1047,아파트,경상남도,공공임대(10년),74.97,80,15.0,1,46000000,456000,,,1066.0
2294,N2431,1047,아파트,경상남도,공공임대(10년),84.95,124,15.0,1,57000000,462000,,,1066.0
2295,N2431,1047,아파트,경상남도,공공임대(10년),84.96,289,15.0,1,57000000,462000,,,1066.0
2296,N2431,1047,아파트,경상남도,공공임대(10년),84.98,82,15.0,1,57000000,462000,,,1066.0


In [30]:
# Regroup by building type, region, supply type and (encoded) qualification
# type, then select the segment with the same key values as the rows that
# lack a bus-stop count.
# Note: this rebinds `grouped` and `group1` from the 자격유형 section above.
segment_keys = ['임대건물구분', '지역', '공급유형', '자격유형']
grouped = all_df.groupby(segment_keys)
group1 = grouped.get_group(('아파트', '경상남도', '공공임대(10년)', 1))
group1

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
2158,C1788,376,아파트,경상남도,공공임대(10년),51.59,116,28.0,1,29000000,340000,0.0,3.0,380.0
2159,C1788,376,아파트,경상남도,공공임대(10년),59.97,260,28.0,1,36000000,390000,0.0,3.0,380.0
2208,C2405,600,아파트,경상남도,공공임대(10년),75.84,48,22.0,1,36019000,603480,0.0,8.0,728.0
2209,C2405,600,아파트,경상남도,공공임대(10년),75.99,54,22.0,1,36113000,603480,0.0,8.0,728.0
2210,C2405,600,아파트,경상남도,공공임대(10년),84.95,182,22.0,1,44868000,636400,0.0,8.0,728.0
2293,N2431,1047,아파트,경상남도,공공임대(10년),74.97,80,15.0,1,46000000,456000,,,1066.0
2294,N2431,1047,아파트,경상남도,공공임대(10년),84.95,124,15.0,1,57000000,462000,,,1066.0
2295,N2431,1047,아파트,경상남도,공공임대(10년),84.96,289,15.0,1,57000000,462000,,,1066.0
2296,N2431,1047,아파트,경상남도,공공임대(10년),84.98,82,15.0,1,57000000,462000,,,1066.0
2498,C1941,404,아파트,경상남도,공공임대(10년),84.94,64,19.0,1,47288000,627080,0.0,3.0,490.0


In [31]:
# Mean bus-stop count within the selected segment (Series.mean skips NaN by
# default, so the rows with missing counts do not contribute).
group1['10분내버스정류장수'].mean()

4.318181818181818