In [1]:
import pandas as pd

In [4]:
# Small demo frame with one numeric column and one categorical (string) column,
# used to illustrate one-hot encoding.
demo_df = pd.DataFrame(
    {
        "숫자특성": [0, 1, 2, 1],
        "범주형": ["a", "b", "c", "a"],
    }
)

범주형 데이터는 문자열 그대로는 선형 모델에 사용할 수 없음
-> 원핫인코딩 : 범주의 개수만큼 더미 변수(dummy variable)를 만들고, 각 범주마다 0 또는 1을 할당해 새로운 특성으로 바꿈
-> 특성의 차원(열 개수)이 늘어나더라도, 범주마다 0/1로 나누어 표현하는 것이 더 중요함

판다스(`pd.get_dummies`)로 하는 원핫인코딩은 데이터 분석(탐색) 단계에서만 사용함
scikit-learn(`OneHotEncoder`)으로 하는 원핫인코딩은 모델을 만들 때 사용함 (학습 데이터로 fit한 범주를 테스트 데이터에도 동일하게 적용할 수 있기 때문)

**참고:** 결정트리 계열 모델(예: XGBoost)은 범주형 변수를 원핫인코딩하지 않아도 사용할 수 있음 (단, scikit-learn의 트리 모델은 입력이 수치형이어야 하므로 인코딩이 필요함)

In [None]:
# One-hot encode the demo frame; the numeric column is kept as-is and the string
# column becomes one indicator column per category. The result is the cell output.
pd.get_dummies(data=demo_df)

Unnamed: 0,숫자특성,범주형_a,범주형_b,범주형_c
0,0,True,False,False
1,1,False,True,False
2,2,False,False,True
3,1,True,False,False


In [1]:
from preamble import *

# Load the housing dataset and count the missing values in each column.
# The path uses the same "data/housing.csv" location as the later cells
# (the previous "data.housing.csv" was a typo).
df = pd.read_csv("data/housing.csv")

# isnull() takes no arguments; chain .sum() to get the per-column count of NaNs.
df.isnull().sum()

In [17]:
# Reload the housing data, then inspect the categorical column and the schema.
df = pd.read_csv("data/housing.csv")

# value_counts(): lists each category together with the number of rows in it.
# It is printed explicitly because only the last expression of a cell is
# displayed automatically; a bare expression in the middle is discarded.
# (df["ocean_proximity"].unique() can be used instead when only the distinct
# values are of interest.)
print(df["ocean_proximity"].value_counts())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [19]:
# First-pass treatment of missing data: discard every row that contains at
# least one missing value, then re-check the non-null counts.
df = df.dropna(axis=0, how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


데이터 전처리

In [None]:
#### Data preprocessing
## 1. Choose the target (y) and the predictors (X = every other column).
y = df["median_house_value"]
X = df.drop(columns="median_house_value")

## 2. Split the predictor names into numeric and categorical groups.
## Keep them as plain lists of column names: selecting by dtype gathers all
## numeric columns at once instead of dropping/picking columns one by one.
num_feature = list(
    X.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
)
cate_feature = ["ocean_proximity"]

In [24]:
## 3. Train/test split
from sklearn.model_selection import train_test_split

# Predictors are every column except the target column.
y = df["median_house_value"]
X = df.drop(columns="median_house_value")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
## Separate the numeric and categorical columns of each split.
X_train_num, X_train_cate = X_train.loc[:, num_feature], X_train.loc[:, cate_feature]
X_test_num, X_test_cate = X_test.loc[:, num_feature], X_test.loc[:, cate_feature]

In [28]:
## Standardize the numeric features
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted scaler
# to both the training and the test rows.
scaler = StandardScaler().fit(X_train_num)
X_train_num_scaler = scaler.transform(X_train_num)
X_test_num_scaler = scaler.transform(X_test_num)

In [None]:
## One-hot encode the categorical features
from sklearn.preprocessing import OneHotEncoder

# handle_unknown="ignore": a category that was not seen while fitting on the
# training rows is encoded as an all-zero row instead of raising an error in
# transform(). With the default ("error") the test transform would fail
# whenever a category happens to occur only in the test split.
encoder = OneHotEncoder(handle_unknown="ignore")

# The encoder returns a sparse matrix by default; .toarray() converts it to a
# dense NumPy array so it can be stacked with the scaled numeric features.
X_train_cate_encoder = encoder.fit_transform(X_train_cate).toarray()
X_test_cate_encoder = encoder.transform(X_test_cate).toarray()

In [37]:
# Preview the processed arrays: encoded categorical (train, test) followed by
# scaled numeric (train, test).
(
    X_train_cate_encoder,
    X_test_cate_encoder,
    X_train_num_scaler,
    X_test_num_scaler,
)

(array([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.]], shape=(16346, 5)),
 array([[0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        ...,
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.]], shape=(4087, 5)),
 array([[-1.107,  0.786, -1.162, ...,  1.131,  1.041,  0.438],
        [-0.025,  0.468,  0.349, ..., -0.709, -0.856, -0.242],
        [ 0.758, -0.712, -0.287, ...,  0.378,  0.692, -0.109],
        ...,
        [ 0.579, -0.763,  1.064, ..., -0.415, -0.359, -0.407],
        [-1.226,  0.903, -1.321, ...,  1.785,  1.48 ,  0.747],
        [-1.421,  0.978,  1.859, ...,  0.752,  0.395,  0.012]],
       shape=(16346, 8)),
 array([[ 1.167, -1.334, -0.685, ..., -0.793, -0.645, -0.362],
        [-0.852,  1.109, -2.116, ...,  1.688,  1.215,  0.325],
        [ 0.718, -0.805,  0.19 

In [40]:
## Join the scaled numeric columns and the one-hot columns side by side
## (numeric columns first, then the encoded categorical columns).
X_train_processed = np.hstack(
    (X_train_num_scaler, X_train_cate_encoder)
)
X_test_processed = np.hstack(
    (X_test_num_scaler, X_test_cate_encoder)
)

모델링

In [None]:
#### 모델링
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_model(name, y_true, y_pred):
    """Print and return the R^2 and RMSE of a set of regression predictions.

    Args:
        name: Label printed as the report header and stored under "model".
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        dict with the keys "model", "r2" and "rmse".
    """
    scores = {
        "model": name,
        "r2": r2_score(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
    }
    print(f"{name}")
    print(f" -- r2 : {scores['r2']:.2f}")  # closer to 1 is better
    print(f" -- RMSE : {scores['rmse']:.2f}")  # error in target units; lower is better
    return scores

In [None]:
## Linear model
from sklearn.linear_model import LinearRegression

# fit() returns the estimator, so construction and training are chained.
lr = LinearRegression().fit(X_train_processed, y_train)
lr_ypred = lr.predict(X_test_processed)

# Recorded run: r2 0.65, RMSE 69297.72
result_lr = evaluate_model("Linear Regression", y_test, lr_ypred)
result_lr

Linear Regression
 -- r2 : 0.65
 -- RMSE : 69297.72


{'model': 'Linear Regression',
 'r2': 0.6488402154431994,
 'rmse': np.float64(69297.7166911303)}

In [46]:
## Polynomial feature expansion
### Applied to the numeric features only; the one-hot columns are excluded.
from sklearn.preprocessing import PolynomialFeatures

# degree=2              : generate terms up to second order
# include_bias=False    : leave out the constant column (the b in y = b + a*x + ...)
# interaction_only=True : keep only products of distinct features (x_i * x_j);
#                         pure powers such as x_i^2 are not generated
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)

# Fit on the training rows, then expand both splits with the fitted transformer.
poly.fit(X_train_num_scaler)
X_train_num_poly = poly.transform(X_train_num_scaler)
X_test_num_poly = poly.transform(X_test_num_scaler)

In [48]:
## Join the polynomial numeric features with the one-hot columns
## (polynomial columns first, then the encoded categorical columns).
X_train_poly_processed = np.hstack(
    (X_train_num_poly, X_train_cate_encoder)
)
X_test_poly_processed = np.hstack(
    (X_test_num_poly, X_test_cate_encoder)
)

In [None]:
# Refit the same LinearRegression instance on the polynomial feature set.
# NOTE(review): this reuses `lr`, `lr_ypred` and `result_lr`, so the values
# from the plain linear fit are overwritten once this cell runs.
lr_ypred = lr.fit(X_train_poly_processed, y_train).predict(X_test_poly_processed)

# Recorded run: r2 0.69, RMSE 65124.70
result_lr = evaluate_model("Linear Regression (poly)", y_test, lr_ypred)
result_lr

Linear Regression (poly)
 -- r2 : 0.69
 -- RMSE : 65124.70


{'model': 'Linear Regression (poly)',
 'r2': 0.6898595246215415,
 'rmse': np.float64(65124.7039906632)}

In [None]:
## Decision tree regressor
from sklearn.tree import DecisionTreeRegressor

# random_state pins the tree's internal randomness so the fitted tree and its
# scores are the same on every run; 42 matches the seed of the train/test split.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_processed, y_train)
dt_ypred = dt.predict(X_test_processed)

# An earlier unseeded run recorded r2 0.66 and RMSE 67958.99; the seeded run
# may differ slightly from those figures.
result_dt = evaluate_model("Decision Tree Regressor", y_test, dt_ypred)
result_dt

Decision Tree Regressor
 -- r2 : 0.66
 -- RMSE : 67958.99


{'model': 'Decision Tree Regressor',
 'r2': 0.662276907502265,
 'rmse': np.float64(67958.9901744425)}

In [56]:
# Build the column names of the processed matrix in its stacking order:
# numeric feature names first, then the encoder's generated one-hot names.
num_names = num_feature
cate_names = encoder.get_feature_names_out(cate_feature).tolist()
all_names = [*num_names, *cate_names]
all_names

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity_<1H OCEAN',
 'ocean_proximity_INLAND',
 'ocean_proximity_ISLAND',
 'ocean_proximity_NEAR BAY',
 'ocean_proximity_NEAR OCEAN']

In [59]:
# Pair each processed column name with the decision tree's importance score
# and list the features from most to least important.
importances = dt.feature_importances_
fi = (
    pd.DataFrame({"features": all_names, "importances": importances})
    .sort_values(by="importances", ascending=False)
)
fi

Unnamed: 0,features,importances
7,median_income,0.48
9,ocean_proximity_INLAND,0.142
1,latitude,0.119
0,longitude,0.115
2,housing_median_age,0.0462
5,population,0.0261
3,total_rooms,0.0217
6,households,0.0199
4,total_bedrooms,0.0188
12,ocean_proximity_NEAR OCEAN,0.00899


In [None]:
## Random forest regressor
from sklearn.ensemble import RandomForestRegressor

# random_state pins the forest's bootstrap sampling and feature selection so
# the scores are reproducible; 42 matches the seed of the train/test split.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_processed, y_train)
rf_ypred = rf.predict(X_test_processed)

# An earlier unseeded run recorded r2 0.83 and RMSE 48815.92; the seeded run
# may differ slightly from those figures.
result_rf = evaluate_model("Random Forest Regressor", y_test, rf_ypred)
result_rf

Random Forest Regressor
 -- r2 : 0.83
 -- RMSE : 48815.92


{'model': 'Random Forest Regressor',
 'r2': 0.8257431152513077,
 'rmse': np.float64(48815.92365357865)}

In [63]:
# Rank the processed columns by the random forest's importance scores,
# highest first.
importances = rf.feature_importances_
fi = pd.DataFrame(
    {
        "features": all_names,
        "importances": importances,
    }
)
fi = fi.sort_values(by="importances", ascending=False)
fi

Unnamed: 0,features,importances
7,median_income,0.486
9,ocean_proximity_INLAND,0.142
0,longitude,0.11
1,latitude,0.104
2,housing_median_age,0.051
5,population,0.032
3,total_rooms,0.0237
4,total_bedrooms,0.0212
6,households,0.0183
12,ocean_proximity_NEAR OCEAN,0.00618
