# 모델 불러오기

In [39]:
# 모듈 불러오기
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import datetime

In [40]:
# Restore the trained XGBoost booster from its saved JSON file.
model_path = "xgb_model_v3.json"
loaded_booster = xgb.Booster()
loaded_booster.load_model(model_path)

# 데이터 불러오기

In [79]:
# Evaluation window: the yearly count columns start_year..end_year are summed
# in the queries below, and end_date_str is the last day of the window.
start_year, end_year = 2023, 2024
end_date_str = "2024-10-31"

In [80]:
import os
from mysql import connector
from dotenv import load_dotenv
import pandas as pd

# Load environment variables (python-dotenv) before reading the DB settings.
load_dotenv()

# Connection settings; each one is None when its variable is not set.
NAME, USER, PASSWORD, IP, PORT = (
    os.getenv(key)
    for key in ("DB_NAME", "DB_USER", "DB_PASSWORD", "DB_IP", "DB_PORT")
)

class Database:
    """Holder for one lazily created MySQL connection shared via the class."""

    # Cached connection; stays None until get_connection() is first called.
    _connection = None

    @staticmethod
    def get_connection():
        """Return the cached connection, opening a new one when needed.

        A new connection is created from the module-level NAME / USER /
        PASSWORD / IP / PORT settings when nothing is cached yet or when the
        cached connection reports that it is no longer connected.
        """
        cached = Database._connection
        if cached is not None and cached.is_connected():
            return cached

        Database._connection = connector.connect(
            database=NAME,
            user=USER,
            password=PASSWORD,
            host=IP,
            port=PORT,
        )
        return Database._connection

# Build the per-book base table: one row per book.ID with its metadata and the
# loan counts summed over the yearly columns start_year..end_year.
#
# NOTE(review): `title` and `ID_count` are selected but are not listed in the
# GROUP BY clause; this presumably relies on them being functionally dependent
# on the grouped columns (or on ONLY_FULL_GROUP_BY being off) — confirm.
cursor = None
try:
    # 1) Connect to the DB (the connection is cached on Database).
    conn = Database.get_connection()
    cursor = conn.cursor()

    # The yearly count columns are named after the year, so the column list is
    # generated from the configured window (integers only, not user input).
    years_range = range(start_year, end_year + 1)
    rc_sum_str = " + ".join([f"SUM(rc.`{year}`)" for year in years_range])
    irc_sum_str = " + ".join([f"SUM(irc.`{year}`)" for year in years_range])

    cursor.execute(f"""
        SELECT 
            book.ID,
            registration_year,
            registration_month,
            get_course,
            DDC,
            title,
            publication_year,
            location,
            duration,
            COALESCE({rc_sum_str}, 0) AS total_rent,
            COALESCE({irc_sum_str}, 0) AS rent_count,
            ID_count
        FROM book
        LEFT JOIN recent_rent       ON book.ID = recent_rent.ID
        LEFT JOIN rent_count AS rc        ON book.ID = rc.ID
        LEFT JOIN ISBN_rent_count AS irc   ON book.ISBN = irc.ISBN
        GROUP BY book.ID, registration_year, registration_month,
                 get_course, DDC, publication_year, location, duration
        ORDER BY book.ID
    """)
    rows_base = cursor.fetchall()

except connector.Error as e:
    # Report the DB error and re-raise it: continuing would only fail a few
    # lines later with a misleading NameError on `rows_base`.
    print(f"Error: {e}")
    raise
finally:
    # Release the cursor on both the success and the failure path.
    if cursor is not None:
        cursor.close()

# Column labels follow the SELECT order above: duration -> '최근대출',
# total_rent -> '총 대출 횟수', ID_count -> 'book_count'.
df = pd.DataFrame(rows_base, columns=[
        'ID', '등록연도', '등록월', '수서방법',
        '분류코드', '제목', '출판연도', '소장위치', '최근대출',
        '총 대출 횟수', 'rent_count', 'book_count'
    ])

In [81]:
import pandas as pd

class Database:
    """Holder for one lazily created MySQL connection shared via the class.

    NOTE(review): this cell declares `Database` again; a class of the same
    name is already defined in the previous cell. Re-running the declaration
    starts from `_connection = None`, so a connection cached on the earlier
    class object is not reused here.
    """

    # Cached connection; stays None until get_connection() is first called.
    _connection = None

    @staticmethod
    def get_connection():
        """Return the cached connection, (re)connecting when required."""
        current = Database._connection
        needs_connect = current is None or not current.is_connected()
        if needs_connect:
            # Settings come from the module-level NAME/USER/PASSWORD/IP/PORT.
            Database._connection = connector.connect(
                database=NAME,
                user=USER,
                password=PASSWORD,
                host=IP,
                port=PORT,
            )
        return Database._connection

# Fetch the title-keyed counts from None_ISBN_rent_count: one row per title
# with ID_count and the loan counts summed over start_year..end_year.
cursor = None
try:
    conn = Database.get_connection()
    cursor = conn.cursor()

    # Yearly count columns are named after the year (integers from the config).
    years_range = range(start_year, end_year + 1)
    sum_str = " + ".join([f"SUM(`{year}`)" for year in years_range])

    cursor.execute(f"SELECT title, ID_count, COALESCE({sum_str}, 0) AS rent_count FROM None_ISBN_rent_count GROUP BY title;")

    rows = cursor.fetchall()
except connector.Error as e:
    # Report the DB error and re-raise it: continuing would only fail a few
    # lines later with a misleading NameError on `rows`.
    print(f"Error: {e}")
    raise
finally:
    # Release the cursor on both the success and the failure path.
    if cursor is not None:
        cursor.close()

# Labels follow the SELECT order: title, ID_count, rent_count.
None_ISBN_df = pd.DataFrame(rows, columns=['제목', 'book_count', 'rent_count'])

# Go through object dtype first, then cast the two count columns to float
# (presumably the driver returns the SUM columns as Decimal — confirm).
None_ISBN_df = None_ISBN_df.astype(object)
None_ISBN_df['rent_count'] = None_ISBN_df['rent_count'].astype(float)
None_ISBN_df['book_count'] = None_ISBN_df['book_count'].astype(float)

None_ISBN_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6966 entries, 0 to 6965
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   제목          6966 non-null   object 
 1   book_count  6966 non-null   float64
 2   rent_count  6966 non-null   float64
dtypes: float64(2), object(1)
memory usage: 163.4+ KB


In [82]:
# Attach the title-keyed counts from None_ISBN_df and use them to fill the
# counts that are missing in the base table.
# NOTE(review): the base query wraps rent_count in COALESCE(..., 0), so
# rent_count_x is presumably never NaN and its fillna never takes a value from
# None_ISBN_df — confirm whether that is intended.
df_merged = pd.merge(df, None_ISBN_df, on='제목', how='left')
df_merged['rent_count_x'] = df_merged['rent_count_x'].fillna(df_merged['rent_count_y'])
df_merged['book_count_x'] = df_merged['book_count_x'].fillna(df_merged['book_count_y'])
df_merged.drop(columns=['rent_count_y', 'book_count_y'], inplace=True)
df_merged.rename(columns={'rent_count_x': 'rent_count', 'book_count_x': 'book_count'}, inplace=True)

# The title was only needed as the merge key; it is not a model feature.
df_merged.drop(columns=['제목'], inplace=True)

# Cast every column to the dtype used for modelling.
# ID keeps only the numeric part after the last '_' of the original string.
df_merged['ID'] = df_merged['ID'].str.split('_').str[-1].astype(int)
df_merged['수서방법'] = df_merged['수서방법'].astype('category')
df_merged['분류코드'] = df_merged['분류코드'].astype(float)
df_merged['출판연도'] = df_merged['출판연도'].astype(int)
df_merged['소장위치'] = df_merged['소장위치'].astype('category')
# Missing '최근대출' becomes 7305 (presumably a "never borrowed" sentinel of
# about 20 years in days — confirm); it is capped to the window length below.
df_merged['최근대출'] = df_merged['최근대출'].fillna(7305).astype(int)
df_merged['rent_count'] = df_merged['rent_count'].fillna(0).astype(int)
df_merged['book_count'] = df_merged['book_count'].fillna(0).astype(int)
df_merged['총 대출 횟수'] = df_merged['총 대출 횟수'].astype(int)

# One-hot encode the categorical columns.
df_onehot = pd.get_dummies(df_merged)

# Target: loans per copy. book_count may be 0 after the fill above, which
# yields NaN (0/0) or +/-inf (n/0); both are mapped to 0 so that no non-finite
# target reaches the evaluation step.
y_data = df_onehot['rent_count'] / df_onehot['book_count']
y_data = y_data.replace([np.inf, -np.inf], np.nan).fillna(0)
df_onehot.drop(columns=['rent_count', 'book_count'], inplace=True)

# Length of the evaluation window in days, used as the cap for '최근대출'.
start_date = datetime.date(start_year, 1, 1)             # January 1st of start_year
year_, month_, day_ = map(int, end_date_str.split('-'))  # parts of end_date_str
end_date = datetime.date(year_, month_, day_)
diff_days = (end_date - start_date).days  # days from start_date to end_date
df_onehot.loc[df_onehot['최근대출'] > diff_days, '최근대출'] = diff_days
df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ID           299315 non-null  int64  
 1   등록연도         299315 non-null  int64  
 2   등록월          299315 non-null  int64  
 3   분류코드         299315 non-null  float64
 4   출판연도         299315 non-null  int64  
 5   최근대출         299315 non-null  int64  
 6   총 대출 횟수      299315 non-null  int64  
 7   수서방법_기타      299315 non-null  bool   
 8   수서방법_사서선정    299315 non-null  bool   
 9   수서방법_수서정보없음  299315 non-null  bool   
 10  수서방법_수업지정    299315 non-null  bool   
 11  수서방법_이용자희망   299315 non-null  bool   
 12  수서방법_학과신청    299315 non-null  bool   
 13  소장위치_4층인문    299315 non-null  bool   
 14  소장위치_보존서고    299315 non-null  bool   
dtypes: bool(8), float64(1), int64(6)
memory usage: 18.3 MB


# 예측

In [83]:
# Predict with the restored booster on the prepared feature frame.
dtest = xgb.DMatrix(data=df_onehot)
y_pred = loaded_booster.predict(dtest)

In [84]:
# Score the predictions against the loans-per-copy target.
mse = mean_squared_error(y_data, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_data, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R^2 : {r2:.4f}")

RMSE: 0.7484
R^2 : -0.5050


In [73]:
# Side-by-side table of predictions and targets, indexed like y_data.
result_columns = {"Prediction": y_pred, "Actual": y_data}
df_y_result = pd.DataFrame(result_columns, index=y_data.index)

In [78]:
# Inspect a random handful of rows (no seed is set, so every run differs).
sample_size = 20
sample = df_y_result.sample(n=sample_size)
display(sample)

Unnamed: 0,Prediction,Actual
51977,0.185604,1.0
205732,0.182161,0.0
150085,7.790118,5.0
247108,-0.036277,0.0
42107,0.405001,0.0
178115,0.269526,0.5
96724,0.267069,0.0
186581,0.02998,0.0
130427,1.948824,2.0
191348,-0.01492,0.0


In [57]:
# Show feature rows at positions 132485..132491 for manual inspection.
df_onehot.iloc[132485:132492]

Unnamed: 0,ID,등록연도,등록월,분류코드,출판연도,최근대출,총 대출 횟수,수서방법_기타,수서방법_사서선정,수서방법_수서정보없음,수서방법_수업지정,수서방법_이용자희망,수서방법_학과신청,소장위치_4층인문,소장위치_보존서고
132485,132752,2018,7,230.0,2018,669,0,False,True,False,False,False,False,True,False
132486,132753,2018,7,242.5,2018,669,0,False,True,False,False,False,False,True,False
132487,132754,2018,7,248.4,2018,669,0,False,True,False,False,False,False,True,False
132488,132755,2018,7,230.0,2018,669,0,False,True,False,False,False,False,True,False
132489,132756,2018,7,251.0,2018,669,0,False,True,False,False,False,False,True,False
132490,132757,2018,7,234.0,2017,669,0,False,True,False,False,False,False,True,False
132491,132758,2018,7,230.0,2018,669,0,False,True,False,False,False,False,True,False


ID                 1
등록연도            1982
등록월                7
분류코드           232.0
출판연도            1982
최근대출             669
총 대출 횟수            0
수서방법_기타        False
수서방법_사서선정      False
수서방법_수서정보없음     True
수서방법_수업지정      False
수서방법_이용자희망     False
수서방법_학과신청      False
소장위치_4층인문      False
소장위치_보존서고       True
Name: 0, dtype: object