In [21]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.model_selection import cross_val_score

from optuna import Trial
import optuna
from optuna.samplers import TPESampler

In [28]:
sample_df = pd.read_csv('train_data_final.csv', sep=',')

# 시간 순서대로 데이터 반영하기 위해서 거래년월일 정렬 필요
sample_df = sample_df.sort_values('계약날짜인코딩')
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237987 entries, 64104 to 208092
Data columns (total 19 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   전용면적         237987 non-null  float64
 1   거래금액         237987 non-null  int64  
 2   층            237987 non-null  int64  
 3   건축년도         237987 non-null  int64  
 4   평당단가         237987 non-null  float64
 5   구            237987 non-null  object 
 6   동            237987 non-null  object 
 7   단지명브랜드       237987 non-null  object 
 8   계약날짜인코딩      237987 non-null  int64  
 9   구별 공원 갯수     237987 non-null  float64
 10  8학군          237987 non-null  int64  
 11  동별지하철역수      237987 non-null  int64  
 12  구별 교과학원 갯수   237987 non-null  int64  
 13  구별 대형마트 수    235898 non-null  float64
 14  구별 백화점 수     235898 non-null  float64
 15  구별 전체 마트 수   235898 non-null  float64
 16  구별 대학병원 병원수  237987 non-null  int64  
 17  구별 종합병원 병원수  237987 non-null  int64  
 18  단지명별 신축거래비율  237987 

In [31]:
sample_df.drop(['거래금액','전용면적'], axis=1, inplace=True)
sample_df

Unnamed: 0,층,건축년도,평당단가,구,동,단지명브랜드,계약날짜인코딩,구별 공원 갯수,8학군,동별지하철역수,구별 교과학원 갯수,구별 대형마트 수,구별 백화점 수,구별 전체 마트 수,구별 대학병원 병원수,구별 종합병원 병원수,단지명별 신축거래비율
64104,5,2000,2252.073255,송파구,풍납동,기타,0,144.0,0,0,655,4.0,2.0,6.0,2,16,0.110657
78845,3,1993,1464.680951,중랑구,면목동,두산|두산위브,0,11.0,0,3,174,6.0,0.0,6.0,3,12,0.127080
79128,11,2015,2318.034033,중랑구,면목동,하늘채,0,11.0,0,3,174,6.0,0.0,6.0,3,12,0.673469
15567,8,1996,2022.630835,관악구,봉천동,삼성,0,21.0,0,3,253,1.0,1.0,2.0,2,8,0.021814
1983,19,2008,7051.616886,강남구,삼성동,힐스테이트,0,132.0,0,3,1304,1.0,6.0,7.0,4,33,0.550668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187413,11,2004,2691.103789,노원구,월계동,기타,1095,2.0,0,1,506,2.0,2.0,4.0,3,7,0.110657
186358,3,1998,3027.522936,노원구,상계동,기타,1095,2.0,0,6,506,2.0,2.0,4.0,3,7,0.110657
167568,13,1994,3764.880952,강서구,등촌동,기타,1095,107.0,0,0,455,3.0,1.0,4.0,3,16,0.110657
185716,3,1988,4493.392070,노원구,상계동,기타,1095,2.0,0,6,506,2.0,2.0,4.0,3,7,0.110657


In [38]:
'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 전체 컬럼 데이터 타입 확인 후 통째로 범주형 인코딩 처리
    - param: df
    - return: encoded columns dataframe
'''
def one_hot(df):
    # 컬럼의 이름 리스트로 뽑아오기
    columns_name_list = list(df.columns)

    # 컬럼마다 for문 반복
    for col_name in columns_name_list:
        # 만약 컬럼의 값 타입이 범주형이면
        if df[col_name].dtype == object or df[col_name].dtype == str:

            # # 컬럼의 유니크한 값을 리스트로 만들어둠
            col_items = df[col_name].unique().tolist()
            # print(col_items)

            onehot = OneHotEncoder(sparse=False)
            onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
#             onehot_encoded_arr_flask = onehot.fit_transform(df[col_name].values.reshape(-1, 1)).toarray()
            onehot_encoded_label = onehot.categories_[0]
#             print(onehot_encoded_label)
            onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)
#             print(onehot_encoded_df)
            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, onehot_encoded_df], axis=1)
        else:
            # 종속(타겟) 컬럼만 인코딩 제외
            if col_name == '평당단가':
                continue
            scaled_label = [col_name]
            x = df[col_name].values.reshape(-1, 1) #returns a numpy array
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            scaled_df = pd.DataFrame(x_scaled, columns=scaled_label)

            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, scaled_df], axis=1)

    return df
'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 새로운 데이터 컬럼 추가 시 별도 인코딩 결과만 출력할 때 사용 ㄱㄱ
    - param: df, col_name
    - return: encoded columns dataframe or series
'''
def encode_column(df, col_name):

    # column data type이 object 또는 str 즉 범주형일 경우
    # onehot 인코딩 수행된 데이터프레임 return
    if df[col_name].dtype == object or df[col_name].dtype == str:
        onehot = OneHotEncoder(sparse=False)

        onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1)).toarray()
        onehot_encoded_label = onehot.categories_[0]
        onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)

        return onehot_encoded_df

    # column data type이 나머지 타입일 경우
    # 해당 컬럼의 series return
    else:
        return df[col_name]

In [37]:
# preprocessed_train_df_gu=encode_column(sample_df, '구')
# preprocessed_train_df_gu

Unnamed: 0,강남구,강동구,강북구,강서구,관악구,광진구,구로구,금천구,노원구,도봉구,...,성동구,성북구,송파구,양천구,영등포구,용산구,은평구,종로구,중구,중랑구
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237984,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# # 1-1 test one_hot function
# preprocessed_train_df = one_hot(sample_df)
# # print(len(preprocessed_train_df.columns))
# preprocessed_train_df.head()

Unnamed: 0,평당단가,층,건축년도,강남구,강동구,강북구,강서구,관악구,광진구,구로구,...,구별 공원 갯수,8학군,동별지하철역수,구별 교과학원 갯수,구별 대형마트 수,구별 백화점 수,구별 전체 마트 수,구별 대학병원 병원수,구별 종합병원 병원수,단지명별 신축거래비율
0,5364.511692,0.058824,0.433333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.916667,1.0,0.142857,1.0,0.166667,1.0,0.857143,0.571429,1.0,0.110657
1,4828.060523,0.029412,0.433333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.916667,1.0,0.142857,1.0,0.166667,1.0,0.857143,0.571429,1.0,0.110657
2,5364.511692,0.147059,0.433333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.916667,1.0,0.142857,1.0,0.166667,1.0,0.857143,0.571429,1.0,0.110657
3,5756.5337,0.102941,0.433333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.916667,1.0,0.142857,1.0,0.166667,1.0,0.857143,0.571429,1.0,0.110657
4,6452.346308,0.264706,0.433333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.916667,1.0,0.142857,1.0,0.166667,1.0,0.857143,0.571429,1.0,0.110657


In [None]:
# preprocessed_train_df.to_csv('')

## socore

In [24]:
'''
    - 카테고리: 모델링
    - 개요: 머신러닝 모델링 수행 및 점수 도출
        - 교차 검증 방법으로 TimeSeriesSplit 수행
    - param: 

        1. model_tuple => ex. ('LR', LinearRegression())
        2. X_train, y_train, X_test, y_test

    - return: rmse
'''
def execute_modeling(model_tuple, data, target):

    name = model_tuple[0]
    model = model_tuple[1]

    # 각 모델에 대하여 실질적 학습 수행
    # clf = model.fit(X_train, y_train)
    # pred = clf.predict(X_test)

    # TimeSeries Cross validation 
    tscv = TimeSeriesSplit(n_splits=15)

    # 각 모델에 대하여 교차 검증한 결과 점수 확인
    # scoring parameter option 어캐 줘야 함?
    rmse_scores = cross_val_score(model, data, target, cv=tscv, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-rmse_scores)
    print(f'{name} average rmse_scores: {rmse_scores.mean()}, rmse_scores: {rmse_scores}')

    return rmse_scores.mean()
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

# 2. test execute_modeling function
preprocessed_train_df = one_hot(sample_df)
preprocessed_train_df = preprocessed_train_df.drop_duplicates()

# 시간 순서대로 데이터 반영하기 위해서 거래년월일 정렬 필요
# preprocessed_train_df = preprocessed_train_df.sort_values('거래년월일')

# 인코딩된 데이터에서 data, target 분류
data = preprocessed_train_df[preprocessed_train_df.columns.difference(['평당단가'])]
target = preprocessed_train_df['평당단가'] 

model_list = [
                # ('LR', LinearRegression()), 
                ('RF', RandomForestRegressor(n_estimators = 10))
                # ('model_xgb', xgb.XGBRegressor(n_estimators=500, max_depth=9, min_child_weight=5, gamma=0.1, n_jobs=-1)),
                # ('model_lgb', lgb.LGBMRegressor(n_estimators=500, max_depth=9, min_child_weight=5, n_jobs=-1))
            ]

for model_tuple in model_list:
    execute_modeling(model_tuple, data, target)
    pass

RF average rmse_scores: nan, rmse_scores: [0.02940796 0.0338806  0.07787299 0.06270633        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]


In [None]:
# 3. test get_best_param function

# linear regression model object
def linear_object(trial:Trial, data, target):
    params = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False])
    }
    
    test_model = LinearRegression(**params)
    test_model_score = execute_modeling(('LR', test_model), data, target)

    return test_model_score

# xgbr regression model object
def xgbr_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100),
        'gamma':trial.suggest_int('gamma', 1, 3),
        'learning_rate': 0.01,
        'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5, 1, 0.1),
        'nthread' : -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0] ),
        'random_state': 42
    }
    
    test_model = xgb.XGBRegressor(**params)
    test_model_score = execute_modeling(('XGBR', test_model), data, target)

    return test_model_score

# light gbm regression model object
def lgb_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100)
    }
    
    test_model = lgb.LGBMRegressor(**params)
    test_model_score = execute_modeling(('LGBM', test_model), data, target)

    return test_model_score

# 하이퍼 파라미터 결과 도출
# 위에서 분할한  X_train, y_train, X_test, y_test 파라미터 삽입
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: lgb_object(trial, data, target), n_trials=3)

best_score = study.best_value
best_param_dict = study.best_trial.params

print(best_score, best_param_dict)