## 메인 데이터 불러오기

In [39]:
import os
from mysql import connector
from dotenv import load_dotenv
import pandas as pd

# 환경변수 로드
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

try:
    # 1) DB 연결
    conn = Database.get_connection()
    cursor = conn.cursor()

    # 2) 필요한 연도 범위를 지정 (예: 2014~2024)
    start_year = 2004
    end_year   = 2024

    years_range = range(start_year, end_year + 1)
    rc_sum_str = " + ".join([f"SUM(rc.`{year}`)" for year in years_range])
    irc_sum_str = " + ".join([f"SUM(irc.`{year}`)" for year in years_range])

    cursor.execute(f"""
        SELECT 
            book.ID,
            registration_year,
            registration_month,
            get_course,
            DDC,
            title,
            publication_year,
            location,
            duration,
            COALESCE({rc_sum_str}, 0) AS total_rent,
            COALESCE({irc_sum_str}, 0) AS rent_count,
            ID_count
        FROM book
        LEFT JOIN recent_rent       ON book.ID = recent_rent.ID
        LEFT JOIN rent_count AS rc        ON book.ID = rc.ID
        LEFT JOIN ISBN_rent_count AS irc   ON book.ISBN = irc.ISBN
        GROUP BY book.ID, registration_year, registration_month,
                 get_course, DDC, publication_year, location, duration
        ORDER BY book.ID
    """)
    rows_base = cursor.fetchall()
    cursor.close()

except connector.Error as e:
    print(f"Error: {e}")

df = pd.DataFrame(rows_base, columns=[
        'ID', '등록연도', '등록월', '수서방법',
        '분류코드', '제목', '출판연도', '소장위치', '최근대출',
        '총 대출 횟수', 'rent_count', 'book_count'
    ])

In [40]:
df[52478:52500]

Unnamed: 0,ID,등록연도,등록월,수서방법,분류코드,제목,출판연도,소장위치,최근대출,총 대출 횟수,rent_count,book_count
52478,SS_052649,2013,6,이용자희망,155.7,본성과 양육이라는 신기루,2013,4층인문,3298.0,1,2,2.0
52479,SS_052650,2013,6,이용자희망,153.35,크리에이티브 블록,2013,4층인문,3524.0,3,3,1.0
52480,SS_052651,2013,6,이용자희망,741.6,마카로니 구멍의 비밀,2013,4층인문,1422.0,4,6,2.0
52481,SS_052652,2013,6,이용자희망,741.6,마카로니 구멍의 비밀,2013,4층인문,2186.0,2,6,2.0
52482,SS_052653,2013,6,이용자희망,746.43,"(1, 2, 3 스텝으로 쉽게 배우는) 멋진 태팅레이스",2013,4층인문,,0,1,1.0
52483,SS_052654,2013,6,이용자희망,746.43,"(1, 2, 3 스텝으로 쉽게 배우는) 멋진 태팅레이스",2013,4층인문,2807.0,1,1,1.0
52484,SS_052655,2013,6,이용자희망,780.15,스마트 클래식 100,2013,4층인문,3116.0,2,5,2.0
52485,SS_052656,2013,6,이용자희망,808.5,마음을 움직이는 따뜻한 대화법,2013,4층인문,3419.0,2,2,1.0
52486,SS_052657,2013,6,이용자희망,808.5,마음을 움직이는 따뜻한 대화법,2013,4층인문,,0,2,1.0
52487,SS_052658,2013,6,이용자희망,780.15,스마트 클래식 100,2013,보존서고,3124.0,3,5,2.0


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          299315 non-null  object 
 1   등록연도        299315 non-null  int64  
 2   등록월         299315 non-null  int64  
 3   수서방법        299315 non-null  object 
 4   분류코드        299315 non-null  object 
 5   제목          299315 non-null  object 
 6   출판연도        299315 non-null  object 
 7   소장위치        299315 non-null  object 
 8   최근대출        160686 non-null  float64
 9   총 대출 횟수     299315 non-null  object 
 10  rent_count  299315 non-null  object 
 11  book_count  177749 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 27.4+ MB


## ISBN 0 데이터 전처리

In [50]:
import pandas as pd

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

try:
    conn = Database.get_connection()
    cursor = conn.cursor()

    # 2) 필요한 연도 범위를 지정 (예: 2014~2024)
    start_year = 2004
    end_year   = 2024

    years_range = range(start_year, end_year + 1)
    sum_str = " + ".join([f"SUM(`{year}`)" for year in years_range])

    cursor.execute(f"SELECT title, ID_count, COALESCE({sum_str}, 0) AS rent_count FROM None_ISBN_rent_count GROUP BY title;")
    
    rows = cursor.fetchall()
    
    cursor.close()
except connector.Error as e:
    print(f"Error: {e}")

None_ISBN_df = pd.DataFrame(rows, columns=['제목', 'book_count', 'rent_count'])

None_ISBN_df = None_ISBN_df.astype(object)
None_ISBN_df['rent_count'] = None_ISBN_df['rent_count'].astype(float)
None_ISBN_df['book_count'] = None_ISBN_df['book_count'].astype(float)

None_ISBN_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6966 entries, 0 to 6965
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   제목          6966 non-null   object 
 1   book_count  6966 non-null   float64
 2   rent_count  6966 non-null   float64
dtypes: float64(2), object(1)
memory usage: 163.4+ KB


In [51]:
None_ISBN_df[:5]

Unnamed: 0,제목,book_count,rent_count
0,"""국내외 한국어 교육기관간 협력망 구축을 위한 기반 조성"" 및 ""국제학술회의 개최""...",1.0,1.0
1,"""한국어 중급 교재 교수 요목 개발"" 사업 보고서",1.0,5.0
2,"""한국어 초급(쓰기.읽기)교사용 지침서 및 학습자용 워크북 개발"" 사업 보고서",2.0,17.0
3,"""한국어"" 초급(말하기.듣기)교재 개발 사업보고서",2.0,18.0
4,'90년대 연극평론 자료집,1.0,1.0


## 병합 및 y 데이터 분리

In [52]:
df_merged = pd.merge(df, None_ISBN_df, on='제목', how='left')

In [53]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ID            299315 non-null  object 
 1   등록연도          299315 non-null  int64  
 2   등록월           299315 non-null  int64  
 3   수서방법          299315 non-null  object 
 4   분류코드          299315 non-null  object 
 5   제목            299315 non-null  object 
 6   출판연도          299315 non-null  object 
 7   소장위치          299315 non-null  object 
 8   최근대출          160686 non-null  float64
 9   총 대출 횟수       299315 non-null  object 
 10  rent_count_x  299315 non-null  object 
 11  book_count_x  177749 non-null  float64
 12  book_count_y  20614 non-null   float64
 13  rent_count_y  20614 non-null   float64
dtypes: float64(4), int64(2), object(8)
memory usage: 32.0+ MB


In [54]:
# → 왼쪽이 NaN인 경우만 오른쪽으로 채우려면:
df_merged['rent_count_x'] = df_merged['rent_count_x'].fillna(df_merged['rent_count_y'])
df_merged['book_count_x'] = df_merged['book_count_x'].fillna(df_merged['book_count_y'])

# 오른쪽 컬럼은 더 이상 필요 없으면 제거
df_merged.drop('rent_count_y', axis=1, inplace=True)
df_merged.drop('book_count_y', axis=1, inplace=True)

# 필요하면 이름을 다시 'val'로 원상복구
df_merged.rename(columns={'rent_count_x': 'rent_count', 'book_count_x': 'book_count'}, inplace=True)

df_merged.drop(columns=['제목'], inplace=True)

In [55]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          299315 non-null  object 
 1   등록연도        299315 non-null  int64  
 2   등록월         299315 non-null  int64  
 3   수서방법        299315 non-null  object 
 4   분류코드        299315 non-null  object 
 5   출판연도        299315 non-null  object 
 6   소장위치        299315 non-null  object 
 7   최근대출        160686 non-null  float64
 8   총 대출 횟수     299315 non-null  object 
 9   rent_count  299315 non-null  object 
 10  book_count  194945 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 25.1+ MB


In [56]:
# 1. ID 처리: '_' 기준으로 분리, 뒤쪽 값만 사용 후 int로 변환
df_merged['ID'] = df_merged['ID'].str.split('_').str[-1].astype(int)

# 2. 수서 방법을 category로 변환
df_merged['수서방법'] = df_merged['수서방법'].astype('category')

# 3. 분류코드를 float로 변환
df_merged['분류코드'] = df_merged['분류코드'].astype(float)

# 4. 출판연도를 int로 변환
df_merged['출판연도'] = df_merged['출판연도'].astype(int)

# 5. 소장위치를 category로 변환
df_merged['소장위치'] = df_merged['소장위치'].astype('category')

# 6. 최근대출을 int로 변환하고 NaN 값을 7305로 설정
df_merged['최근대출'] = df_merged['최근대출'].fillna(7305).astype(int)
df_merged['rent_count'] = df_merged['rent_count'].fillna(0).astype(int)
df_merged['book_count'] = df_merged['book_count'].fillna(0).astype(int)


# 7. 총 대출 횟수, rent_count, book_count를 int로 변환
df_merged['총 대출 횟수'] = df_merged['총 대출 횟수'].astype(int)

# 결과 확인
df_onehot = pd.get_dummies(df_merged)
df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ID           299315 non-null  int64  
 1   등록연도         299315 non-null  int64  
 2   등록월          299315 non-null  int64  
 3   분류코드         299315 non-null  float64
 4   출판연도         299315 non-null  int64  
 5   최근대출         299315 non-null  int64  
 6   총 대출 횟수      299315 non-null  int64  
 7   rent_count   299315 non-null  int64  
 8   book_count   299315 non-null  int64  
 9   수서방법_기타      299315 non-null  bool   
 10  수서방법_사서선정    299315 non-null  bool   
 11  수서방법_수서정보없음  299315 non-null  bool   
 12  수서방법_수업지정    299315 non-null  bool   
 13  수서방법_이용자희망   299315 non-null  bool   
 14  수서방법_학과신청    299315 non-null  bool   
 15  소장위치_4층인문    299315 non-null  bool   
 16  소장위치_보존서고    299315 non-null  bool   
dtypes: bool(8), float64(1), int64(8)
memory usage: 22.8 MB


In [57]:
y_data = df_onehot['rent_count'] / df_onehot['book_count']

y_data = y_data.fillna(0)
y_data.info()

<class 'pandas.core.series.Series'>
RangeIndex: 299315 entries, 0 to 299314
Series name: None
Non-Null Count   Dtype  
--------------   -----  
299315 non-null  float64
dtypes: float64(1)
memory usage: 2.3 MB


In [58]:
df_onehot.drop(columns=['rent_count', 'book_count'], inplace=True)

df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ID           299315 non-null  int64  
 1   등록연도         299315 non-null  int64  
 2   등록월          299315 non-null  int64  
 3   분류코드         299315 non-null  float64
 4   출판연도         299315 non-null  int64  
 5   최근대출         299315 non-null  int64  
 6   총 대출 횟수      299315 non-null  int64  
 7   수서방법_기타      299315 non-null  bool   
 8   수서방법_사서선정    299315 non-null  bool   
 9   수서방법_수서정보없음  299315 non-null  bool   
 10  수서방법_수업지정    299315 non-null  bool   
 11  수서방법_이용자희망   299315 non-null  bool   
 12  수서방법_학과신청    299315 non-null  bool   
 13  소장위치_4층인문    299315 non-null  bool   
 14  소장위치_보존서고    299315 non-null  bool   
dtypes: bool(8), float64(1), int64(6)
memory usage: 18.3 MB


## XGBoost

In [59]:
# 모듈 불러오기
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [60]:
# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df_onehot, y_data, test_size=0.3, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(209520, 15)
(89795, 15)
(209520,)
(89795,)


In [61]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 5, 7, 10, 15],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.8, 1.0]
}

xgb_reg = xgb.XGBRegressor(random_state=42)
grid_cv = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # 회귀의 경우, MSE 계열 사용
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid_cv.fit(X_train, y_train)
print("최적 파라미터:", grid_cv.best_params_)
print("최고 성능 (MSE):", -grid_cv.best_score_)

# 최적 모델을 이용해 테스트 세트에서 R^2 점수 확인
best_estimator = grid_cv.best_estimator_
y_pred_test = best_estimator.predict(X_test)

r2_test = r2_score(y_test, y_pred_test)
print("테스트 세트 R^2 :", r2_test)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7f3bdfa92c10>>
Traceback (most recent call last):
  File "/home/haechan/miniconda3/envs/datathon/lib/python3.11/site-packages/xgboost/core.py", line 582, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7fb687eeeb10>>
Traceback (most recent call last):
  File "/home/haechan/miniconda3/envs/datathon/lib/python3.11/site-packages/xgboost/core.py", line 582, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7f4f49f7b3d0>>


KeyboardInterrupt: 

In [62]:
def xgb(n, d):
    # 모듈 불러오기
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error, r2_score
    import numpy as np

    # 데이터 분리
    X_train, X_test, y_train, y_test = train_test_split(df_onehot, y_data, test_size=0.3, random_state=42)

    # 모델 생성
    xgb_reg = xgb.XGBRegressor(
        n_estimators = n,   # 트리 개수
        learning_rate = 0.1,  # 학습률
        max_depth = d,        # 트리 최대 깊이
        subsample = 0.8,
        random_state = 42
    )
    xgb_reg.fit(X_train, y_train)
    y_pred = xgb_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    print(f"RMSE: {rmse:.4f}")
    print(f"R^2 : {r2:.4f}")

In [63]:
xgb(500, 7)

RMSE: 2.2828
R^2 : 0.8678


In [64]:
xgb(1000, 7)

RMSE: 2.2618
R^2 : 0.8702


In [65]:
xgb(1000, 10)

RMSE: 2.2488
R^2 : 0.8717


In [66]:
xgb(1200, 7)

RMSE: 2.2587
R^2 : 0.8705


In [67]:
xgb(1200, 10)

RMSE: 2.2522
R^2 : 0.8713


In [68]:
# 모듈 불러오기
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df_onehot, y_data, test_size=0.3, random_state=42)

# 모델 생성
xgb_reg = xgb.XGBRegressor(
    n_estimators = 1000,   # 트리 개수
    learning_rate = 0.1,  # 학습률
    max_depth = 10,        # 트리 최대 깊이
    subsample = 0.8,
    random_state = 42
)
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R^2 : {r2:.4f}")

y_pred = xgb_reg.predict(df_onehot)

RMSE: 2.2488
R^2 : 0.8717


In [69]:
y_pred[52478:52500]

array([ 1.9504932 ,  3.3848772 ,  3.59284   ,  2.3238902 ,  0.47096318,
        1.1672841 ,  2.0880592 ,  2.6727366 ,  0.7204064 ,  2.8965564 ,
        0.6273065 ,  2.4741728 ,  1.4512951 ,  3.5379891 ,  3.710767  ,
       24.113304  , 23.593155  ,  4.858259  ,  6.3410277 ,  2.0655117 ,
        1.1422119 ,  1.2161691 ], dtype=float32)

In [70]:
y_data[52478:52500]

52478     1.0
52479     3.0
52480     3.0
52481     3.0
52482     1.0
52483     1.0
52484     2.5
52485     2.0
52486     2.0
52487     2.5
52488     0.0
52489     1.5
52490     1.5
52491     3.5
52492     3.5
52493    24.0
52494    24.0
52495     5.5
52496     5.5
52497     2.0
52498     2.0
52499     1.0
dtype: float64