In [153]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from itertools import combinations
from sklearn.tree import DecisionTreeRegressor

longitude : 경도  
latitude : 위도  
housing_median_age : 주택 나이(중앙값)  
total_rooms : 전체 방 수  
total_bedrooms : 전체 침실 수  
population : 인구 수  
households : 세대 수  
median_income : 소득(중앙값)  
median_house_value : 주택 가치(중앙값)  
ocean_proximity : 바다 근접도  

[1] 데이터 불러오기

In [3]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


[2] 결측치 제거

In [9]:
df.dropna(inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


[3] 중복값 확인

In [49]:
df.duplicated().sum()

0

In [19]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [10]:
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: count, dtype: int64

[3] EDA(탐색적 자료 분석)

In [18]:
df.corr(numeric_only=True)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924616,-0.109357,0.04548,0.069608,0.10027,0.056513,-0.01555,-0.045398
latitude,-0.924616,1.0,0.011899,-0.036667,-0.066983,-0.108997,-0.071774,-0.079626,-0.144638
housing_median_age,-0.109357,0.011899,1.0,-0.360628,-0.320451,-0.295787,-0.302768,-0.118278,0.106432
total_rooms,0.04548,-0.036667,-0.360628,1.0,0.93038,0.857281,0.918992,0.197882,0.133294
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.10027,-0.108997,-0.295787,0.857281,0.877747,1.0,0.907186,0.005087,-0.0253
households,0.056513,-0.071774,-0.302768,0.918992,0.979728,0.907186,1.0,0.013434,0.064894
median_income,-0.01555,-0.079626,-0.118278,0.197882,-0.007723,0.005087,0.013434,1.0,0.688355
median_house_value,-0.045398,-0.144638,0.106432,0.133294,0.049686,-0.0253,0.064894,0.688355,1.0


> 일단 모든 변수 다 때려넣고 모델 훈련

[4] 선형 회귀 적용

In [63]:
# 피쳐에 따른 선형 회귀 성능평가 함수 생성
def process_regression(x, y, use_model):
    feature = x
    target = y

    X_train, X_test, y_train, y_test = train_test_split(feature, target,
                                                        train_size = 0.8, random_state = 42)
    
    mm = MinMaxScaler()
    mm.fit(X_train)

    X_train_scaled = mm.transform(X_train)
    X_test_scaled = mm.transform(X_test)

    model = use_model
    model.fit(X_train_scaled, y_train)

    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)

    r2_train = r2_score(y_train, y_pred_train)
    mse_train = mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)

    r2_test = r2_score(y_test, y_pred_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)

    print(f"features : {x.columns}")
    print(f"train set r2 : {r2_train:.2f}, test set r2 : {r2_test:.2f}")
    print(f"train set mse : {mse_train:.2f}, test set mse : {mse_test:.2f}")
    print(f"train set rmse : {rmse_train:.2f}, test set r2 : {rmse_test:.2f}")
    print(f"train set mae : {mae_train:.2f}, test set r2 : {mae_test:.2f}\n")

    return [r2_train, r2_test]

In [99]:
features = df[['longitude', 'latitude', 'median_income', 'total_rooms', 'total_bedrooms', 'population']]
target = df['median_house_value']

In [131]:
process_regression(features, target, LinearRegression())

features : Index(['longitude', 'latitude', 'median_income', 'total_rooms',
       'total_bedrooms', 'population'],
      dtype='object')
train set r2 : 0.62, test set r2 : 0.63
train set mse : 4995217308.06, test set mse : 5117795982.57
train set rmse : 70676.85, test set r2 : 71538.77
train set mae : 51321.45, test set r2 : 52219.24



[0.6226287541184156, 0.6257602687203938]

> 성능이 좋지 않다

> 중요한 피쳐로 median_income, latitude, total_rooms 사용

[5] 다항 회귀 적용

In [144]:
features = df[['median_income', 'total_rooms', 'housing_median_age']]
target = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    train_size = 0.8, random_state = 42)

poly = PolynomialFeatures(include_bias=False)
poly.fit(features)

mm = MinMaxScaler()
mm.fit(X_train)

X_train_scaled = mm.transform(X_train)
X_test_scaled = mm.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

print(f"train : {model.score(X_train_scaled, y_train)}")
print(f"test : {model.score(X_test_scaled, y_test)}")

train : 0.5141394299793411
test : 0.5137415878339668


> 성능이 더 안좋아졌다

[6] 결정트리 적용

In [186]:
features = df[['longitude', 'latitude', 'median_income', 'total_rooms', 'total_bedrooms', 'population']]
target = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    train_size = 0.8, random_state = 42)

mm = MinMaxScaler()
mm.fit(X_train)

X_train_scaled = mm.transform(X_train)
X_test_scaled = mm.transform(X_test)


dt = DecisionTreeRegressor(max_depth=11)

dt.fit(X_train_scaled, y_train)

dt.score(X_train_scaled, y_train), dt.score(X_test_scaled, y_test)

(0.8758807887065271, 0.7388137651240192)

In [195]:
# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')

NameError: name 'reload' is not defined

In [192]:
dt = DecisionTreeRegressor()

# 탐색할 하이퍼 파라미터 그리드 설정
param_dt = {
    'max_depth': list(range(1, 16)),
    'min_samples_split': list(range(1, 11)),
    'min_samples_leaf': list(range(1, 5))
}

# GridSearchCV 객체 생성
dt_grid_search = GridSearchCV(estimator = dt, param_grid = param_dt,
                              cv = 5, n_jobs = -1)

# 그리드 탐색 수행
dt_grid_search.fit(X_train_scaled, y_train)

UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-20: ordinal not in range(128)