# 모델 불러오기

In [2]:
# 모듈 불러오기
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import datetime
import os

## single_year_model

In [59]:
# Directory holding one saved model per year (2004, 2005, ..., 2023).
SINGLE_MODEL_DIR = "single_year_model"

# Loaded models; predict_single_year_model addresses them by list position.
loaded_single_year_models = []

# os.listdir() returns entries in arbitrary order, so the file names are
# sorted to make the list order deterministic.
# NOTE(review): assumes the file names sort chronologically (for example a
# four-digit year in the name) - confirm against the actual files.
for filename in sorted(os.listdir(SINGLE_MODEL_DIR)):
    # Only JSON files are treated as saved models.
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(SINGLE_MODEL_DIR, filename)

    # Create an empty scikit-learn wrapper and restore the saved model into it.
    model = xgb.XGBRegressor()
    model.load_model(file_path)
    loaded_single_year_models.append(model)

# Every .json model found in the directory is now in loaded_single_year_models.
print(f"{len(loaded_single_year_models)}개 모델을 불러왔습니다.")

20개 모델을 불러왔습니다.


In [60]:
# Prediction function
def predict_single_year_model(input_data, year):
    """Blend the per-year models into one prediction for ``year``.

    The model of ``year`` receives most of the weight and the models of the
    adjacent years receive small weights. Only models with a non-zero weight
    are asked to predict.

    Assumes ``loaded_single_year_models[i]`` is the model for year
    ``2004 + i`` (20 models, 2004-2023).
    NOTE(review): this depends on the order in which the models were
    loaded - confirm.

    Args:
        input_data: Feature data passed unchanged to each model's ``predict``.
        year: Target year, an integer from 2004 to 2023.

    Returns:
        The weighted average of the selected models' predictions.

    Raises:
        IndexError: If ``year`` is outside 2004-2023, or a required model
            is missing from ``loaded_single_year_models``.
    """
    first_year = 2004
    n_models = 20

    idx = year - first_year
    if not 0 <= idx < n_models:
        # Fail before any prediction work instead of part-way through
        # (or wrapping around to an unrelated model).
        raise IndexError(
            f"year must be between {first_year} and "
            f"{first_year + n_models - 1}, got {year}"
        )

    # Raw weights keyed by offset from the target year's model.
    if 2 <= idx <= n_models - 3:        # 2006-2021: two neighbours each side
        raw = {-2: 0.05, -1: 0.1, 0: 0.8, 1: 0.1, 2: 0.05}
    elif idx in (1, n_models - 2):      # 2005 and 2022: one neighbour each side
        raw = {-1: 0.1, 0: 0.9, 1: 0.1}
    elif idx == 0:                      # 2004: only later neighbours exist
        raw = {0: 0.85, 1: 0.1, 2: 0.05}
    else:                               # 2023: only earlier neighbours exist
        raw = {-2: 0.05, -1: 0.1, 0: 0.85}

    # The raw weights of some branches add up to 1.1, so divide by the total
    # to keep the result a true weighted average.
    total = sum(raw.values())

    # Weighted average
    return sum(
        loaded_single_year_models[idx + offset].predict(input_data)
        * (weight / total)
        for offset, weight in raw.items()
    )


## group_year_model

In [61]:
# Directory holding the grouped models
# (2004-2008, 2009-2013, 2014-2018, 2019-2023).
GROUP_MODEL_DIR = "group_year_model"

# Loaded models; predict_group_year_model addresses them by list position.
loaded_group_year_models = []

# os.listdir() returns entries in arbitrary order, so the file names are
# sorted to make the list order deterministic.
# NOTE(review): assumes the file names sort in chronological group order -
# confirm against the actual files.
for filename in sorted(os.listdir(GROUP_MODEL_DIR)):
    # Only JSON files are treated as saved models.
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(GROUP_MODEL_DIR, filename)

    # Create an empty scikit-learn wrapper and restore the saved model into it.
    model = xgb.XGBRegressor()
    model.load_model(file_path)
    loaded_group_year_models.append(model)

# Every .json model found in the directory is now in loaded_group_year_models.
print(f"{len(loaded_group_year_models)}개 모델을 불러왔습니다.")

4개 모델을 불러왔습니다.


In [62]:
# Prediction function
def predict_group_year_model(input_data, year):
    """Blend the four grouped models into one prediction for ``year``.

    The model whose five-year group contains ``year`` receives most of the
    weight. Any year outside 2004-2018 uses the weights of the last group.

    Assumes ``loaded_group_year_models`` is ordered 2004-2008, 2009-2013,
    2014-2018, 2019-2023.
    NOTE(review): this depends on the order in which the models were
    loaded - confirm.

    Args:
        input_data: Feature data passed unchanged to each model's ``predict``.
        year: Target year.

    Returns:
        The weighted average of the four models' predictions.
    """
    if 2004 <= year <= 2008:
        raw = (0.8, 0.1, 0.05, 0.05)
    elif 2009 <= year <= 2013:
        raw = (0.08, 0.9, 0.08, 0.04)
    elif 2014 <= year <= 2018:
        raw = (0.04, 0.08, 0.9, 0.08)
    else:
        raw = (0.05, 0.05, 0.1, 0.8)

    # The two middle branches add up to 1.1, so divide by the total to keep
    # the result a true weighted average.
    total = sum(raw)

    # Weighted average
    return sum(
        loaded_group_year_models[i].predict(input_data) * (weight / total)
        for i, weight in enumerate(raw)
    )


## term_year_model

In [63]:
# Directory holding the fixed-interval models: one model for the years
# 2004, 2009, 2014, 2019, and one more for each +1 year shift of that set.
TERM_MODEL_DIR = "term_year_model"

# Loaded models; predict_term_year_model addresses them by list position.
loaded_term_year_models = []

# os.listdir() returns entries in arbitrary order, so the file names are
# sorted to make the list order deterministic.
# NOTE(review): assumes the file names sort in the order the prediction
# function expects (2004-series first) - confirm against the actual files.
for filename in sorted(os.listdir(TERM_MODEL_DIR)):
    # Only JSON files are treated as saved models.
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(TERM_MODEL_DIR, filename)

    # Create an empty scikit-learn wrapper and restore the saved model into it.
    model = xgb.XGBRegressor()
    model.load_model(file_path)
    loaded_term_year_models.append(model)

# Every .json model found in the directory is now in loaded_term_year_models.
print(f"{len(loaded_term_year_models)}개 모델을 불러왔습니다.")

5개 모델을 불러왔습니다.


In [245]:
# Prediction function
def predict_term_year_model(input_data, year):
    """Blend the five fixed-interval models into one prediction for ``year``.

    The weight set is chosen by ``year % 5``; each set gives 0.7 to one model
    and splits the remaining 0.3 over the other four.

    Args:
        input_data: Feature data passed unchanged to each model's ``predict``.
        year: Target year.

    Returns:
        The weighted sum of the first five models' predictions.
    """
    # Every loaded model predicts, in list order.
    predictions = [
        term_model.predict(input_data) for term_model in loaded_term_year_models
    ]

    # Weights for models 0..4, keyed by year % 5.
    weights_by_remainder = {
        4: (0.7, 0.1, 0.05, 0.05, 0.1),   # e.g. 2004
        0: (0.1, 0.7, 0.1, 0.05, 0.05),   # e.g. 2005
        1: (0.05, 0.1, 0.7, 0.1, 0.05),   # e.g. 2006
        2: (0.05, 0.05, 0.1, 0.7, 0.1),   # e.g. 2007
    }
    # Any other remainder falls back to the set that favours the last model.
    fallback = (0.1, 0.05, 0.05, 0.1, 0.7)
    weights = weights_by_remainder.get(year % 5, fallback)

    # Weighted average
    return sum(predictions[k] * weights[k] for k in range(5))


In [None]:
# Scratch copy of the weight-selection branch of predict_term_year_model,
# re-indented so that it parses as top-level code.
# NOTE(review): running it needs `year` and a 5-element list `w` to already
# exist in the session - confirm whether this cell is still wanted.
if year % 5 == 4:  # 2004
    w[0], w[1], w[2], w[3], w[4] = 0.7, 0.1, 0.05, 0.05, 0.1
elif year % 5 == 0:  # 2005
    w[0], w[1], w[2], w[3], w[4] = 0.1, 0.7, 0.1, 0.05, 0.05
elif year % 5 == 1:  # 2006
    w[0], w[1], w[2], w[3], w[4] = 0.05, 0.1, 0.7, 0.1, 0.05
elif year % 5 == 2:  # 2007
    w[0], w[1], w[2], w[3], w[4] = 0.05, 0.05, 0.1, 0.7, 0.1
else:
    w[0], w[1], w[2], w[3], w[4] = 0.1, 0.05, 0.05, 0.1, 0.7

# 데이터 불러오기

In [229]:
# First and last year whose per-year count columns are summed by the queries.
start_year, end_year = 2023, 2024
# Last day of the evaluation window, written as YYYY-MM-DD.
end_date_str = "2024-10-31"

In [230]:
import os
from mysql import connector
from dotenv import load_dotenv
import pandas as pd

# Load environment variables (python-dotenv).
load_dotenv()

# Database connection settings; each one is None when its variable is unset.
NAME = os.environ.get("DB_NAME")
USER = os.environ.get("DB_USER")
PASSWORD = os.environ.get("DB_PASSWORD")
IP = os.environ.get("DB_IP")
PORT = os.environ.get("DB_PORT")

class Database:
    """Keeps one shared MySQL connection for the session."""

    # Cached connection; None until get_connection() is first called.
    _connection = None

    @staticmethod
    def get_connection():
        """Return the cached connection, opening a new one when needed.

        A new connection is opened when nothing is cached yet or when the
        cached connection reports that it is no longer connected.
        """
        cached = Database._connection
        if cached is not None and cached.is_connected():
            return cached

        Database._connection = connector.connect(
            database=NAME,
            user=USER,
            password=PASSWORD,
            host=IP,
            port=PORT,
        )
        return Database._connection

cursor = None
try:
    # 1) Connect to the database.
    conn = Database.get_connection()
    cursor = conn.cursor()

    # One SUM(...) term per year column in the evaluation window.
    years_range = range(start_year, end_year + 1)
    rc_sum_str = " + ".join([f"SUM(rc.`{year}`)" for year in years_range])
    irc_sum_str = " + ".join([f"SUM(irc.`{year}`)" for year in years_range])

    cursor.execute(f"""
        SELECT 
            book.ID,
            registration_year,
            registration_month,
            get_course,
            DDC,
            title,
            publication_year,
            location,
            duration,
            COALESCE({rc_sum_str}, 0) AS total_rent,
            COALESCE({irc_sum_str}, 0) AS rent_count,
            ID_count
        FROM book
        LEFT JOIN recent_rent       ON book.ID = recent_rent.ID
        LEFT JOIN rent_count AS rc        ON book.ID = rc.ID
        LEFT JOIN ISBN_rent_count AS irc   ON book.ISBN = irc.ISBN
        GROUP BY book.ID, registration_year, registration_month,
                 get_course, DDC, publication_year, location, duration
        ORDER BY book.ID
    """)
    rows_base = cursor.fetchall()

except connector.Error as e:
    # Report and re-raise. Carrying on would either hit a NameError on
    # rows_base or, in a re-run session, silently reuse rows from an
    # earlier run.
    print(f"Error: {e}")
    raise
finally:
    # Release the cursor on both the success and the failure path.
    if cursor is not None:
        cursor.close()

# Column labels follow the SELECT order: `duration` becomes '최근대출' and
# `total_rent` becomes '총 대출 횟수'.
df = pd.DataFrame(rows_base, columns=[
        'ID', '등록연도', '등록월', '수서방법',
        '분류코드', '제목', '출판연도', '소장위치', '최근대출',
        '총 대출 횟수', 'rent_count', 'book_count'
    ])

In [231]:
import pandas as pd

class Database:
    """Keeps one shared MySQL connection for the session.

    NOTE(review): a class with the same name and body is defined earlier in
    this file; running this definition again resets the cached connection
    to None - confirm whether the duplicate is intended.
    """

    # Cached connection; None until get_connection() is first called.
    _connection = None

    @staticmethod
    def get_connection():
        """Return the cached connection, opening a new one when needed."""
        cached = Database._connection
        if cached is not None and cached.is_connected():
            return cached

        Database._connection = connector.connect(
            database=NAME,
            user=USER,
            password=PASSWORD,
            host=IP,
            port=PORT,
        )
        return Database._connection

cursor = None
try:
    conn = Database.get_connection()
    cursor = conn.cursor()

    # One SUM(...) term per year column in the evaluation window.
    years_range = range(start_year, end_year + 1)
    sum_str = " + ".join([f"SUM(`{year}`)" for year in years_range])

    cursor.execute(f"SELECT title, ID_count, COALESCE({sum_str}, 0) AS rent_count FROM None_ISBN_rent_count GROUP BY title;")

    rows = cursor.fetchall()
except connector.Error as e:
    # Report and re-raise. Carrying on would either hit a NameError on
    # `rows` or, in a re-run session, silently reuse rows from an earlier run.
    print(f"Error: {e}")
    raise
finally:
    # Release the cursor on both the success and the failure path.
    if cursor is not None:
        cursor.close()

# Column labels follow the SELECT order: title, ID_count, rent_count.
None_ISBN_df = pd.DataFrame(rows, columns=['제목', 'book_count', 'rent_count'])

# Go through object dtype first, then make both count columns float.
None_ISBN_df = None_ISBN_df.astype(object)
None_ISBN_df['rent_count'] = None_ISBN_df['rent_count'].astype(float)
None_ISBN_df['book_count'] = None_ISBN_df['book_count'].astype(float)

None_ISBN_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6966 entries, 0 to 6965
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   제목          6966 non-null   object 
 1   book_count  6966 non-null   float64
 2   rent_count  6966 non-null   float64
dtypes: float64(2), object(1)
memory usage: 163.4+ KB


In [232]:
# Attach the title-keyed counts. Both frames have rent_count/book_count, so
# the merge suffixes them with _x (from df) and _y (from None_ISBN_df).
df_merged = df.merge(None_ISBN_df, on='제목', how='left')

# Keep df's own counts and fall back to the title-keyed ones where missing.
# NOTE(review): df's rent_count is built with COALESCE(..., 0) in the query,
# so it may never be missing and the rent_count fallback may never apply -
# confirm that this is intended.
df_merged['rent_count_x'] = df_merged['rent_count_x'].fillna(df_merged['rent_count_y'])
df_merged['book_count_x'] = df_merged['book_count_x'].fillna(df_merged['book_count_y'])
df_merged = (
    df_merged
    .drop(columns=['rent_count_y', 'book_count_y'])
    .rename(columns={'rent_count_x': 'rent_count', 'book_count_x': 'book_count'})
    .drop(columns=['제목'])
)

# The numeric part of the ID is whatever follows the last underscore.
df_merged['ID'] = df_merged['ID'].str.split('_').str[-1]

# Fill gaps before the integer casts below.
# NOTE(review): 7305 looks like "20 years in days" used as a stand-in for
# a missing value - confirm.
df_merged['최근대출'] = df_merged['최근대출'].fillna(7305)
df_merged['rent_count'] = df_merged['rent_count'].fillna(0)
df_merged['book_count'] = df_merged['book_count'].fillna(0)

df_merged = df_merged.astype({
    'ID': int,
    '수서방법': 'category',
    '분류코드': float,
    '출판연도': int,
    '소장위치': 'category',
    '최근대출': int,
    'rent_count': int,
    'book_count': int,
    '총 대출 횟수': int,
})

# One-hot encode the two categorical columns.
df_onehot = pd.get_dummies(df_merged)

# Target: rentals per copy; 0/0 gives NaN, which is mapped to 0.
# NOTE(review): a positive rent_count with book_count == 0 would give inf,
# which fillna(0) does not touch - confirm this cannot occur in the data.
y_data = (df_onehot['rent_count'] / df_onehot['book_count']).fillna(0)
df_onehot = df_onehot.drop(columns=['rent_count', 'book_count'])

# Length of the evaluation window in days, from 1 January of start_year to
# the date in end_date_str.
start_date = datetime.date(start_year, 1, 1)
year_, month_, day_ = (int(part) for part in end_date_str.split('-'))
end_date = datetime.date(year_, month_, day_)
diff_days = (end_date - start_date).days

# Cap '최근대출' at the window length.
df_onehot['최근대출'] = df_onehot['최근대출'].clip(upper=diff_days)
df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ID           299315 non-null  int64  
 1   등록연도         299315 non-null  int64  
 2   등록월          299315 non-null  int64  
 3   분류코드         299315 non-null  float64
 4   출판연도         299315 non-null  int64  
 5   최근대출         299315 non-null  int64  
 6   총 대출 횟수      299315 non-null  int64  
 7   수서방법_기타      299315 non-null  bool   
 8   수서방법_사서선정    299315 non-null  bool   
 9   수서방법_수서정보없음  299315 non-null  bool   
 10  수서방법_수업지정    299315 non-null  bool   
 11  수서방법_이용자희망   299315 non-null  bool   
 12  수서방법_학과신청    299315 non-null  bool   
 13  소장위치_4층인문    299315 non-null  bool   
 14  소장위치_보존서고    299315 non-null  bool   
dtypes: bool(8), float64(1), int64(6)
memory usage: 18.3 MB


# 예측

In [246]:
# Run the three ensembles on the same features with start_year's weights.
single_y_pred, group_y_pred, term_y_pred = (
    predictor(df_onehot, start_year)
    for predictor in (
        predict_single_year_model,
        predict_group_year_model,
        predict_term_year_model,
    )
)

In [247]:
def score(y_data, y_pred):
    """Return ``(rmse, r2)`` for predictions ``y_pred`` against ``y_data``.

    RMSE is the square root of ``mean_squared_error`` and R^2 comes from
    ``r2_score``.
    """
    mse = mean_squared_error(y_data, y_pred)
    return np.sqrt(mse), r2_score(y_data, y_pred)

In [248]:
# Print RMSE and R^2 for each ensemble, with a blank line between reports.
for position, (label, y_pred) in enumerate((
    ("SINGLE_YEAR_MODEL", single_y_pred),
    ("GROUP_YEAR_MODEL", group_y_pred),
    ("TERM_YEAR_MODEL", term_y_pred),
)):
    if position:
        print()
    print(label)
    rmse, r2 = score(y_data, y_pred)
    print(f"RMSE: {rmse:.4f}")
    print(f"R^2 : {r2:.4f}")

SINGLE_YEAR_MODEL
RMSE: 0.2751
R^2 : 0.7967

GROUP_YEAR_MODEL
RMSE: 0.3381
R^2 : 0.6928

TERM_YEAR_MODEL
RMSE: 0.3944
R^2 : 0.5821


In [237]:
# Actual target next to the three ensemble predictions, on y_data's index.
result_columns = {
    "Actual": y_data,
    "Single Prediction": single_y_pred,
    "Group Prediction": group_y_pred,
    "Term Prediction": term_y_pred,
}
df_y_result = pd.DataFrame(result_columns, index=y_data.index)

In [240]:
# Show 20 randomly drawn rows of the comparison table.
sample = df_y_result.sample(20)
display(sample)

Unnamed: 0,Actual,Single Prediction,Group Prediction,Term Prediction
14720,0.0,0.055637,0.137376,0.116206
203740,0.0,0.020955,0.039165,0.012528
133009,0.0,0.07807,0.290113,0.394273
195182,0.0,0.006327,-0.040613,-0.077419
166032,0.0,0.243996,0.338135,0.292771
65422,0.0,0.123974,0.201971,0.207683
257056,0.0,0.076995,0.239043,0.219818
109333,0.0,0.080457,0.109984,0.120193
249427,0.0,0.039545,0.094312,0.001381
287353,0.0,0.063509,0.19034,0.163059


In [57]:
# Inspect seven consecutive rows by position.
df_onehot.iloc[132485:132492]

Unnamed: 0,ID,등록연도,등록월,분류코드,출판연도,최근대출,총 대출 횟수,수서방법_기타,수서방법_사서선정,수서방법_수서정보없음,수서방법_수업지정,수서방법_이용자희망,수서방법_학과신청,소장위치_4층인문,소장위치_보존서고
132485,132752,2018,7,230.0,2018,669,0,False,True,False,False,False,False,True,False
132486,132753,2018,7,242.5,2018,669,0,False,True,False,False,False,False,True,False
132487,132754,2018,7,248.4,2018,669,0,False,True,False,False,False,False,True,False
132488,132755,2018,7,230.0,2018,669,0,False,True,False,False,False,False,True,False
132489,132756,2018,7,251.0,2018,669,0,False,True,False,False,False,False,True,False
132490,132757,2018,7,234.0,2017,669,0,False,True,False,False,False,False,True,False
132491,132758,2018,7,230.0,2018,669,0,False,True,False,False,False,False,True,False


ID                 1
등록연도            1982
등록월                7
분류코드           232.0
출판연도            1982
최근대출             669
총 대출 횟수            0
수서방법_기타        False
수서방법_사서선정      False
수서방법_수서정보없음     True
수서방법_수업지정      False
수서방법_이용자희망     False
수서방법_학과신청      False
소장위치_4층인문      False
소장위치_보존서고       True
Name: 0, dtype: object

In [None]:
# Assumption: four models have already been trained and are stored in the
# variables below.
# NOTE(review): `XGBRegressor(...)` is a placeholder, not a real constructor
# call. Only `import xgboost as xgb` is visible at the top of this file, so
# the bare name XGBRegressor presumably needs its own import or the `xgb.`
# prefix - confirm before running this cell.
model_2004_2024 = XGBRegressor(...)
model_2009_2024 = XGBRegressor(...)
model_2014_2024 = XGBRegressor(...)
model_2019_2024 = XGBRegressor(...)

# Prediction function
def predict_with_ensemble(input_data, year):
    """Combine the four range models into one weighted prediction.

    Args:
        input_data: Feature data to predict on.
        year: Year the data belongs to; it selects the weight set.

    Returns:
        The weighted sum of the four models' predictions.
    """
    # One prediction per model, oldest training range first.
    p_2004 = model_2004_2024.predict(input_data)
    p_2009 = model_2009_2024.predict(input_data)
    p_2014 = model_2014_2024.predict(input_data)
    p_2019 = model_2019_2024.predict(input_data)

    # Pick the weights by year (example values).
    if not 2004 <= year < 2019:
        # Everything outside 2004-2018, which includes 2019 onwards.
        weights = (0.05, 0.05, 0.1, 0.8)
    elif year < 2009:
        weights = (1.0, 0.0, 0.0, 0.0)
    elif year < 2014:
        weights = (0.2, 0.8, 0.0, 0.0)
    else:
        weights = (0.1, 0.1, 0.8, 0.0)
    w_2004, w_2009, w_2014, w_2019 = weights

    # Weighted average
    return (p_2004 * w_2004
            + p_2009 * w_2009
            + p_2014 * w_2014
            + p_2019 * w_2019)
