In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import optuna

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [3]:
x_data = pd.read_csv('X.csv')
y_data = pd.read_csv('Y.csv')

In [4]:
x_codes = x_data['유치원코드'].unique()
y_codes = y_data['유치원코드'].unique()

In [5]:
x_data.fillna(0, inplace=True)
y_data.fillna(0, inplace=True)

In [6]:
df = pd.concat([x_data, y_data])

In [7]:
def extract_sum(string):
    numbers = re.findall(r'\d+', string)
    return sum(map(int, numbers))

In [8]:
df['건물층수'] = df['건물층수'].apply(extract_sum)

In [9]:
def remove_len(string):
    return ''.join(re.findall(r'\d+', string))

In [10]:
df['건물전용면적'] = df['건물전용면적'].apply(remove_len)
df['대지총면적'] = df['대지총면적'].apply(remove_len)

In [11]:
lab_encoder = LabelEncoder()
df['설립유형'] = lab_encoder.fit_transform(df['설립유형'])

In [12]:
not_encoding = ['설립유형',  '특수교사수', '유치원코드', '교육지원청명']
minmax_encoding = ['건물전용면적', '대지총면적']
all_columns = df.columns.tolist()
std_encoding = [col for col in all_columns if col not in not_encoding + minmax_encoding]

In [13]:
df[minmax_encoding] = df[minmax_encoding].astype(float)

In [14]:
std_scaler = StandardScaler()
df[std_encoding] = std_scaler.fit_transform(df[std_encoding])

In [15]:
df[minmax_encoding] = np.log1p(df[minmax_encoding])
minmax_scaler = MinMaxScaler()
df[minmax_encoding] = minmax_scaler.fit_transform(df[minmax_encoding])

In [16]:
X = df[df['유치원코드'].isin(x_codes)]
Pred = df[df['유치원코드'].isin(y_codes)]

key = Pred['유치원코드']
y = X['특수교사수']
Pred = Pred.drop(['특수교사수', '유치원코드', '교육지원청명'], axis=1)
X = X.drop(['특수교사수', '유치원코드', '교육지원청명'], axis=1)

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=42)

In [18]:
xgb = XGBRegressor()
cb = CatBoostRegressor()
lg = LGBMRegressor()

In [19]:
xgb.fit(X_train, y_train)
cb.fit(X_train, y_train)
lg.fit(X_train, y_train)

Learning rate set to 0.028227
0:	learn: 0.4569831	total: 134ms	remaining: 2m 13s
1:	learn: 0.4527562	total: 138ms	remaining: 1m 8s
2:	learn: 0.4502006	total: 143ms	remaining: 47.4s
3:	learn: 0.4481098	total: 147ms	remaining: 36.7s
4:	learn: 0.4444610	total: 151ms	remaining: 30.1s
5:	learn: 0.4421121	total: 154ms	remaining: 25.5s
6:	learn: 0.4391824	total: 159ms	remaining: 22.5s
7:	learn: 0.4365691	total: 163ms	remaining: 20.2s
8:	learn: 0.4332870	total: 167ms	remaining: 18.4s
9:	learn: 0.4318725	total: 171ms	remaining: 16.9s
10:	learn: 0.4294754	total: 175ms	remaining: 15.7s
11:	learn: 0.4276208	total: 178ms	remaining: 14.7s
12:	learn: 0.4255754	total: 180ms	remaining: 13.7s
13:	learn: 0.4232618	total: 188ms	remaining: 13.2s
14:	learn: 0.4211088	total: 193ms	remaining: 12.7s
15:	learn: 0.4176361	total: 197ms	remaining: 12.1s
16:	learn: 0.4152394	total: 202ms	remaining: 11.7s
17:	learn: 0.4127858	total: 205ms	remaining: 11.2s
18:	learn: 0.4109062	total: 209ms	remaining: 10.8s
19:	learn:

In [20]:
y_pred_xgb = xgb.predict(X_valid)
y_pred_cb = cb.predict(X_valid)
y_pred_lg = lg.predict(X_valid)

In [21]:
rmse_xgb = np.sqrt(mean_squared_error(y_valid, y_pred_xgb))
rmse_cb = np.sqrt(mean_squared_error(y_valid, y_pred_cb))
rmse_lg = np.sqrt(mean_squared_error(y_valid, y_pred_lg))

In [22]:
r2_xgb = r2_score(y_valid, y_pred_xgb)
r2_cb = r2_score(y_valid, y_pred_cb)
r2_lg = r2_score(y_valid, y_pred_lg)

In [23]:
print("XGBoost RMSE:", rmse_xgb, "R^2:", r2_xgb)
print("CatBoost RMSE:", rmse_cb, "R^2:", r2_cb)
print("LightGBM RMSE:", rmse_lg, "R^2:", r2_lg)

XGBoost RMSE: 0.557149345839675 R^2: -0.5025148461855191
CatBoost RMSE: 0.3788184023582461 R^2: 0.30539539485725786
LightGBM RMSE: 0.4379803207335068 R^2: 0.0714939954831979


In [24]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 1, 255),
    }
    
    model = CatBoostRegressor(**params, loss_function='RMSE', verbose=False)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100)

    preds = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, preds, squared=False)
    
    return rmse

In [25]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, gc_after_trial=True)

[I 2024-05-04 14:40:28,506] A new study created in memory with name: no-name-0dfd6606-1fb8-4f0d-9a39-e23a630361c5
[I 2024-05-04 14:40:29,953] Trial 0 finished with value: 0.36127777012077966 and parameters: {'iterations': 821, 'depth': 4, 'learning_rate': 0.01127243020335601, 'random_strength': 65, 'bagging_temperature': 0.039317953319819576, 'l2_leaf_reg': 2.2534388963582532e-05, 'border_count': 205}. Best is trial 0 with value: 0.36127777012077966.
[I 2024-05-04 14:40:31,032] Trial 1 finished with value: 0.3680559389781199 and parameters: {'iterations': 721, 'depth': 6, 'learning_rate': 0.025828116169112063, 'random_strength': 82, 'bagging_temperature': 0.7496400138366645, 'l2_leaf_reg': 5.755471425394787e-08, 'border_count': 222}. Best is trial 0 with value: 0.36127777012077966.
[I 2024-05-04 14:40:33,253] Trial 2 finished with value: 0.3779460015916832 and parameters: {'iterations': 519, 'depth': 10, 'learning_rate': 0.04192037968652142, 'random_strength': 93, 'bagging_temperature'

In [26]:
best_params = study.best_params
best_params['loss_function'] = 'RMSE'
best_params['verbose'] = False

In [27]:
optimized_model = CatBoostRegressor(**best_params)
optimized_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)

<catboost.core.CatBoostRegressor at 0x25297071b20>

In [28]:
preds = optimized_model.predict(X_valid)

In [29]:
rmse = mean_squared_error(y_valid, preds, squared=False)
print(f"Optimized CatBoost RMSE: {rmse}")

Optimized CatBoost RMSE: 0.3384899750759986


In [30]:
rs = optimized_model.predict(Pred)

In [31]:
rs = np.round(rs).astype(int)

In [32]:
result = pd.DataFrame()
result['유치원코드'] = key
result['특수교사수_추천'] = rs

In [33]:
def join(df1, df2, using, type):
    merged = pd.merge(df1, df2, on=using, how=type, suffixes=('', ''))
    return merged

In [34]:
full_rs = join(y_data, result, '유치원코드', 'inner')

In [35]:
full_rs['특수교사수증감폭'] = full_rs['특수교사수_추천'] - full_rs['특수교사수']

In [36]:
full_rs[['유치원코드', '특수교사수', '특수교사수_추천', '특수교사수증감폭']]

Unnamed: 0,유치원코드,특수교사수,특수교사수_추천,특수교사수증감폭
0,0764f8c8-364c-495b-9fbc-b389148b7b4f,1,1,0
1,1ecec08c-f1fc-b044-e053-0a32095ab044,1,1,0
2,1ecec08c-f491-b044-e053-0a32095ab044,0,1,1
3,1ecec08c-fa6b-b044-e053-0a32095ab044,1,1,0
4,1ecec08c-fe02-b044-e053-0a32095ab044,0,1,1
5,1ecec08d-01d5-b044-e053-0a32095ab044,1,1,0
6,1ecec08d-078d-b044-e053-0a32095ab044,1,1,0
7,1ecec08d-0a6b-b044-e053-0a32095ab044,2,2,0
8,1ecec08d-0add-b044-e053-0a32095ab044,1,1,0
9,1ecec08d-0c8d-b044-e053-0a32095ab044,0,1,1


In [37]:
full_rs.to_csv('예측결과.csv', index=False)