# 함수 선언

연도 범위를 지정하면 해당 기간의 데이터를 반환하는 하나의 함수로 만들기

In [1]:
import os
from mysql import connector
from dotenv import load_dotenv
import pandas as pd
import datetime

In [2]:
def load_data(start_year, end_year, end_date_str="2024-10-31"):
    """Load per-book features and the rentals-per-copy target from MySQL.

    Connection settings are read from the environment after ``load_dotenv()``:
    ``DB_NAME``, ``DB_USER``, ``DB_PASSWORD``, ``DB_IP`` and ``DB_PORT``.

    Two queries are executed on a single connection, which is always closed
    before the function returns or raises:

    1. ``book`` left-joined with ``recent_rent``, ``rent_count`` and
       ``ISBN_rent_count``, summing the yearly count columns named
       ``start_year`` .. ``end_year``.
    2. ``None_ISBN_rent_count`` grouped by title; its counts are used only
       where the first query produced nulls for ``rent_count``/``book_count``.

    Args:
        start_year: First year (inclusive) whose yearly columns are summed.
            Also fixes the start (January 1st) of the day window used to cap
            the ``최근대출`` column.
        end_year: Last year (inclusive) whose yearly columns are summed.
        end_date_str: Reference date as ``YYYY-MM-DD`` used for both
            day-difference caps. Defaults to the previously hard-coded
            ``"2024-10-31"``.

    Returns:
        tuple: ``(df_onehot, y_data)`` where ``df_onehot`` is the
        ``pd.get_dummies`` encoded feature frame (without ``제목``,
        ``rent_count`` and ``book_count``) and ``y_data`` is
        ``rent_count / book_count`` with NaN replaced by 0.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year`` (the
            generated SQL would otherwise contain an empty ``COALESCE``).
        mysql.connector.Error: Database errors are printed and re-raised
            instead of being swallowed.
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})"
        )

    # Load environment variables
    load_dotenv()

    db_name = os.getenv("DB_NAME")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")
    db_ip = os.getenv("DB_IP")
    db_port = os.getenv("DB_PORT")

    # The yearly count columns are literally named after the year, so the
    # column list is generated from the (integer) range rather than bound
    # as query parameters.
    years_range = range(start_year, end_year + 1)
    rc_sum_str = " + ".join([f"SUM(rc.`{year}`)" for year in years_range])
    irc_sum_str = " + ".join([f"SUM(irc.`{year}`)" for year in years_range])
    sum_str = " + ".join([f"SUM(`{year}`)" for year in years_range])

    base_query = f"""
            SELECT 
                book.ID,
                registration_year,
                registration_month,
                get_course,
                DDC,
                title,
                publication_year,
                location,
                duration,
                COALESCE({rc_sum_str}, 0) AS total_rent,
                COALESCE({irc_sum_str}, 0) AS rent_count,
                ID_count
            FROM book
            LEFT JOIN recent_rent       ON book.ID = recent_rent.ID
            LEFT JOIN rent_count AS rc        ON book.ID = rc.ID
            LEFT JOIN ISBN_rent_count AS irc   ON book.ISBN = irc.ISBN
            GROUP BY book.ID, registration_year, registration_month,
                     get_course, DDC, publication_year, location, duration
            ORDER BY book.ID
        """
    none_isbn_query = f"SELECT title, ID_count, COALESCE({sum_str}, 0) AS rent_count FROM None_ISBN_rent_count GROUP BY title;"

    conn = None
    try:
        conn = connector.connect(
            database=db_name,
            user=db_user,
            password=db_password,
            host=db_ip,
            port=db_port,
        )

        cursor = conn.cursor()
        try:
            cursor.execute(base_query)
            rows_base = cursor.fetchall()
        finally:
            cursor.close()

        cursor = conn.cursor()
        try:
            cursor.execute(none_isbn_query)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    except connector.Error as e:
        # Continuing without the query results would only fail later with a
        # misleading NameError, so report and propagate the real error.
        print(f"Error: {e}")
        raise
    finally:
        # Close the connection on every path so repeated calls do not
        # accumulate open connections.
        if conn is not None:
            conn.close()

    # Column labels follow the SELECT order of the first query
    # (``duration`` is exposed as '최근대출').
    df = pd.DataFrame(rows_base, columns=[
            'ID', '등록연도', '등록월', '수서방법',
            '분류코드', '제목', '출판연도', '소장위치', '최근대출',
            '총 대출 횟수', 'rent_count', 'book_count'
        ])

    # Second query returns (title, ID_count, rent_count) in that order.
    None_ISBN_df = pd.DataFrame(rows, columns=['제목', 'book_count', 'rent_count'])
    None_ISBN_df = None_ISBN_df.astype(object)
    None_ISBN_df['rent_count'] = None_ISBN_df['rent_count'].astype(float)
    None_ISBN_df['book_count'] = None_ISBN_df['book_count'].astype(float)

    # Both frames carry rent_count/book_count, so the merge yields *_x (main
    # query) and *_y (title-based) columns; *_y only fills nulls in *_x.
    df_merged = pd.merge(df, None_ISBN_df, on='제목', how='left')
    df_merged['rent_count_x'] = df_merged['rent_count_x'].fillna(df_merged['rent_count_y'])
    df_merged['book_count_x'] = df_merged['book_count_x'].fillna(df_merged['book_count_y'])
    df_merged.drop(columns=['rent_count_y', 'book_count_y'], inplace=True)
    df_merged.rename(columns={'rent_count_x': 'rent_count', 'book_count_x': 'book_count'}, inplace=True)
    df_merged.drop(columns=['제목'], inplace=True)

    # Keep only the part of the ID after the last '_' and make it numeric.
    df_merged['ID'] = df_merged['ID'].str.split('_').str[-1].astype(int)
    df_merged['수서방법'] = df_merged['수서방법'].astype('category')
    df_merged['분류코드'] = df_merged['분류코드'].astype(float)
    df_merged['출판연도'] = df_merged['출판연도'].astype(int)
    df_merged['소장위치'] = df_merged['소장위치'].astype('category')
    # NOTE(review): 7305 is a sentinel for rows with no '최근대출' value
    # (presumably ~20 years in days) - confirm. It is capped below.
    df_merged['최근대출'] = df_merged['최근대출'].fillna(7305).astype(int)
    df_merged['rent_count'] = df_merged['rent_count'].fillna(0).astype(int)
    df_merged['book_count'] = df_merged['book_count'].fillna(0).astype(int)
    df_merged['총 대출 횟수'] = df_merged['총 대출 횟수'].astype(int)
    df_onehot = pd.get_dummies(df_merged)

    # Cap '최근대출' at the number of days between January 1st of start_year
    # and the reference date.
    start_date = datetime.date(start_year, 1, 1)
    year_, month_, day_ = map(int, end_date_str.split('-'))
    end_date = datetime.date(year_, month_, day_)
    diff_days = (end_date - start_date).days
    df_onehot.loc[df_onehot['최근대출'] > diff_days, '최근대출'] = diff_days

    # Target: rentals per copy; 0/0 gives NaN, which is replaced by 0.
    # NOTE(review): a positive rent_count with book_count == 0 would yield
    # inf here and is not handled - confirm such rows cannot occur.
    y_data = df_onehot['rent_count'] / df_onehot['book_count']
    y_data = y_data.fillna(0)
    df_onehot.drop(columns=['rent_count', 'book_count'], inplace=True)

    # Cap '최근대출' at the days elapsed since the first day of the
    # registration month. Nulls were already replaced by the 7305 sentinel
    # above, so no further null filling is needed at this point.
    date_series = pd.to_datetime({
        "year":  df_onehot["등록연도"],
        "month": df_onehot["등록월"],
        "day":   1
    })
    reference_ts = pd.to_datetime(end_date_str)
    df_onehot["reg_diff_days"] = (reference_ts - date_series).dt.days
    df_onehot.loc[df_onehot["최근대출"] > df_onehot["reg_diff_days"], "최근대출"] = df_onehot["reg_diff_days"]
    df_onehot.drop("reg_diff_days", axis=1, inplace=True)
    df_onehot["최근대출"] = df_onehot["최근대출"].astype(int)

    return df_onehot, y_data

# XGBoost

In [24]:
# 모듈 불러오기
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [29]:
def xgb(x, y, year):
    """Train an XGBoost regressor, report hold-out metrics and save the model.

    The data is split 70/30 with a fixed seed, the model is fitted on the
    training part, RMSE and R^2 on the test part are printed, and the model
    is written to ``xgb_model_{year}.json`` in the current directory.

    Args:
        x: Feature matrix accepted by ``train_test_split`` / ``XGBRegressor``.
        y: Target values aligned with ``x``.
        year: Label used only to build the output file name.

    Returns:
        tuple: ``(model, rmse, r2)`` - the fitted regressor and the two
        hold-out metrics, so callers can compare runs without re-parsing the
        printed output. Callers that ignore the return value are unaffected.
    """
    # This function's own name rebinds the module-level ``xgb`` alias, so the
    # library is imported locally under a distinct name.
    import xgboost as xgb_lib

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=42
    )

    # Build the model
    xgb_reg = xgb_lib.XGBRegressor(
        n_estimators=500,      # number of trees
        learning_rate=0.05,    # learning rate
        max_depth=5,           # maximum tree depth
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"RMSE: {rmse:.4f}")
    print(f"R^2 : {r2:.4f}")
    xgb_reg.save_model(f"xgb_model_{year}.json")

    return xgb_reg, rmse, r2

# 데이터 불러오기

In [12]:
# Accumulators: one feature frame and one target series per start year,
# appended in ascending start-year order.
x_list = []
y_list = []
# Start years 2004-2008; every window ends in 2024.
for year in range(2004, 2009):
    x, y = load_data(year, 2024)
    x_list.append(x)
    y_list.append(y)
    print(f'{year} complete')

2004 complete
2005 complete
2006 complete
2007 complete
2008 complete


In [15]:
# Continue filling x_list / y_list with start years 2009-2013 (windows end in 2024).
for year in range(2009, 2014):
    x, y = load_data(year, 2024)
    x_list.append(x)
    y_list.append(y)
    print(f'{year} complete')

2009 complete
2010 complete
2011 complete
2012 complete
2013 complete


In [17]:
# Continue filling x_list / y_list with start years 2014-2018 (windows end in 2024).
for year in range(2014, 2019):
    x, y = load_data(year, 2024)
    x_list.append(x)
    y_list.append(y)
    print(f'{year} complete')

2014 complete
2015 complete
2016 complete
2017 complete
2018 complete


In [19]:
# Continue filling x_list / y_list with start years 2019-2023 (windows end in 2024).
for year in range(2019, 2024):
    x, y = load_data(year, 2024)
    x_list.append(x)
    y_list.append(y)
    print(f'{year} complete')

2019 complete
2020 complete
2021 complete
2022 complete
2023 complete


In [20]:
# Sanity check: number of datasets collected so far (one per start year).
len(x_list)

20

In [30]:
# Train and save one model per start year; list position (i - 2004) holds
# the dataset whose window starts in year i.
for i in range(2004, 2024):
    xgb(x=x_list[i - 2004], y=y_list[i - 2004], year=i)

RMSE: 2.3589
R^2 : 0.8588
RMSE: 2.3475
R^2 : 0.8583
RMSE: 2.2557
R^2 : 0.8606
RMSE: 2.1495
R^2 : 0.8621
RMSE: 2.0351
R^2 : 0.8629
RMSE: 1.9151
R^2 : 0.8626
RMSE: 1.7988
R^2 : 0.8609
RMSE: 1.6867
R^2 : 0.8582
RMSE: 1.5731
R^2 : 0.8538
RMSE: 1.4453
R^2 : 0.8502
RMSE: 1.2982
R^2 : 0.8473
RMSE: 1.1454
R^2 : 0.8450
RMSE: 1.0067
R^2 : 0.8392
RMSE: 0.8869
R^2 : 0.8332
RMSE: 0.7521
R^2 : 0.8250
RMSE: 0.6379
R^2 : 0.8199
RMSE: 0.5395
R^2 : 0.8035
RMSE: 0.4659
R^2 : 0.7893
RMSE: 0.3838
R^2 : 0.7888
RMSE: 0.2909
R^2 : 0.7784


In [33]:
# Train one model per block of five consecutive start years
# (i, i+1, ..., i+4), stacking their rows into a single dataset.
# NOTE: the model is saved as xgb_model_{i}.json, the same file name the
# per-year training loop uses for start year i.
for i in range(2004, 2024, 5):
    merge_x = pd.concat(
        [x_list[i - 2004 + k] for k in range(5)],
        axis=0,
        ignore_index=True,
    )
    merge_y = pd.concat(
        [y_list[i - 2004 + k] for k in range(5)],
        axis=0,
        ignore_index=True,
    )
    xgb(merge_x, merge_y, i)

RMSE: 2.1591
R^2 : 0.8677



KeyboardInterrupt



In [34]:
# Train one model per group of start years spaced five apart
# (i, i+5, i+10, i+15), stacking their rows into a single dataset.
# NOTE: the model is saved as xgb_model_{i}.json, the same file name the
# per-year training loop uses for start year i.
for i in range(2004, 2009):
    merge_x = pd.concat(
        [x_list[i - 2004 + 5 * k] for k in range(4)],
        axis=0,
        ignore_index=True,
    )
    merge_y = pd.concat(
        [y_list[i - 2004 + 5 * k] for k in range(4)],
        axis=0,
        ignore_index=True,
    )
    xgb(merge_x, merge_y, i)

RMSE: 1.6781
R^2 : 0.8624
RMSE: 1.6151
R^2 : 0.8606
RMSE: 1.5315
R^2 : 0.8592
RMSE: 1.4433
R^2 : 0.8576
RMSE: 1.3383
R^2 : 0.8577
