In [16]:
import os
from mysql import connector
from dotenv import load_dotenv
import pandas as pd

# Load the variables from a local .env file, then read the database settings.
load_dotenv()

# Each setting is None when its environment variable is not defined.
NAME, USER, PASSWORD, IP, PORT = (
    os.environ.get(key)
    for key in ("DB_NAME", "DB_USER", "DB_PASSWORD", "DB_IP", "DB_PORT")
)

class Database:
    """Keeps a single MySQL connection that every caller shares."""

    # The connection is stored on the class itself; None until first requested.
    _connection = None

    @staticmethod
    def get_connection():
        """Return the stored connection, opening a new one when required.

        A new connection is opened when none is stored yet or when the stored
        one reports that it is no longer connected.
        """
        current = Database._connection
        if current is not None and current.is_connected():
            return current

        Database._connection = connector.connect(
            database=NAME,
            user=USER,
            password=PASSWORD,
            host=IP,
            port=PORT,
        )
        return Database._connection

# Run the per-book aggregation query and load its result into `df`.
try:
    conn = Database.get_connection()
    cursor = conn.cursor()
    try:
        # total_rent adds up the SUMs of the yearly columns `2004`..`2024`;
        # COALESCE replaces a NULL result with 0 (same for rent_count, book_count).
        cursor.execute("""SELECT 
                        book.ID,
                        registration_year,
                        registration_month,
                        get_course,
                        DDC,
                        publication_year,
                        location,
                        duration,
                        COALESCE(SUM(`2004`) + SUM(`2005`) + SUM(`2006`) + SUM(`2007`) + SUM(`2008`) +
                                 SUM(`2009`) + SUM(`2010`) + SUM(`2011`) + SUM(`2012`) + SUM(`2013`) +
                                 SUM(`2014`) + SUM(`2015`) + SUM(`2016`) + SUM(`2017`) + SUM(`2018`) +
                                 SUM(`2019`) + SUM(`2020`) + SUM(`2021`) + SUM(`2022`) + SUM(`2023`) +
                                 SUM(`2024`), 0) AS total_rent,
                        COALESCE(rent_count, 0) AS rent_count,
                        COALESCE(book_count, 0) AS book_count
                    FROM 
                        book
                    LEFT JOIN 
                        recent_rent ON book.ID = recent_rent.ID
                    LEFT JOIN 
                        rent_count ON book.ID = rent_count.ID
                    LEFT JOIN 
                        ISBN_rent_count ON book.ISBN = ISBN_rent_count.ISBN
                    GROUP BY 
                        book.ID, registration_year, registration_month, get_course, DDC, publication_year, location, duration
                    ORDER BY 
                        book.ID
                    """)

        rows = cursor.fetchall()
    finally:
        # Release the cursor whether or not the query succeeded.
        cursor.close()
except connector.Error as e:
    print(f"Error: {e}")
    # Re-raise: otherwise the DataFrame below would be built from an undefined
    # `rows` (fresh kernel) or from the rows of an earlier run (stale state).
    raise

# Labels are positional and follow the SELECT list order: `duration` is
# labelled '최근대출' and `total_rent` is labelled '총 대출 횟수'.
df = pd.DataFrame(rows, columns=['ID', '등록연도', '등록월', '수서방법', '분류코드', '출판연도', '소장위치', '최근대출', '총 대출 횟수', 'rent_count', 'book_count'])

In [17]:
# Preview a positional window of rows from the middle of the frame.
df.iloc[52478:52500]

Unnamed: 0,ID,등록연도,등록월,수서방법,분류코드,출판연도,소장위치,최근대출,총 대출 횟수,rent_count,book_count
52478,SS_052649,2013,6,이용자희망,155.7,2013,4층인문,3298.0,1,2,2
52479,SS_052650,2013,6,이용자희망,153.35,2013,4층인문,3524.0,3,3,1
52480,SS_052651,2013,6,이용자희망,741.6,2013,4층인문,1422.0,4,6,2
52481,SS_052652,2013,6,이용자희망,741.6,2013,4층인문,2186.0,2,6,2
52482,SS_052653,2013,6,이용자희망,746.43,2013,4층인문,,0,1,1
52483,SS_052654,2013,6,이용자희망,746.43,2013,4층인문,2807.0,1,1,1
52484,SS_052655,2013,6,이용자희망,780.15,2013,4층인문,3116.0,2,5,2
52485,SS_052656,2013,6,이용자희망,808.5,2013,4층인문,3419.0,2,2,1
52486,SS_052657,2013,6,이용자희망,808.5,2013,4층인문,,0,2,1
52487,SS_052658,2013,6,이용자희망,780.15,2013,보존서고,3124.0,3,5,2


In [18]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          299315 non-null  object 
 1   등록연도        299315 non-null  int64  
 2   등록월         299315 non-null  int64  
 3   수서방법        299315 non-null  object 
 4   분류코드        299315 non-null  object 
 5   출판연도        299315 non-null  object 
 6   소장위치        299315 non-null  object 
 7   최근대출        160686 non-null  float64
 8   총 대출 횟수     299315 non-null  object 
 9   rent_count  299315 non-null  object 
 10  book_count  299315 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 25.1+ MB


In [19]:
# Value written to '최근대출' where it is missing.
# NOTE(review): 7305 looks like 20 years expressed in days (20 * 365.25) — confirm.
NO_LOAN_PLACEHOLDER = 7305

# Build the typed frame in one expression so the cell can be re-run safely.
df = df.assign(**{
    # Keep only the text after the last '_' (e.g. 'SS_052649' -> '052649').
    # Casting to str first means this also works once ID is already an integer,
    # which is what made the original cell fail when executed a second time.
    # NOTE(review): assumes the numeric suffix alone still identifies a book — confirm.
    'ID': df['ID'].astype(str).str.split('_').str[-1],
    # Missing values must be filled before the integer cast below.
    '최근대출': df['최근대출'].fillna(NO_LOAN_PLACEHOLDER),
}).astype({
    'ID': int,
    '수서방법': 'category',
    '분류코드': float,
    '출판연도': int,
    '소장위치': 'category',
    '최근대출': int,
    '총 대출 횟수': int,
    'rent_count': int,
    'book_count': int,
})

# Check the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   ID          299315 non-null  int64   
 1   등록연도        299315 non-null  int64   
 2   등록월         299315 non-null  int64   
 3   수서방법        299315 non-null  category
 4   분류코드        299315 non-null  float64 
 5   출판연도        299315 non-null  int64   
 6   소장위치        299315 non-null  category
 7   최근대출        299315 non-null  int64   
 8   총 대출 횟수     299315 non-null  int64   
 9   rent_count  299315 non-null  int64   
 10  book_count  299315 non-null  int64   
dtypes: category(2), float64(1), int64(8)
memory usage: 21.1 MB


In [20]:
# One-hot encode only the two categorical columns. Naming them explicitly keeps
# any column that is still object-typed (e.g. an unconverted ID) from being
# expanded into one indicator column per distinct value.
df_onehot = pd.get_dummies(df, columns=['수서방법', '소장위치'])

In [21]:
# Print the columns and dtypes of the one-hot encoded frame.
df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299315 entries, 0 to 299314
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ID           299315 non-null  int64  
 1   등록연도         299315 non-null  int64  
 2   등록월          299315 non-null  int64  
 3   분류코드         299315 non-null  float64
 4   출판연도         299315 non-null  int64  
 5   최근대출         299315 non-null  int64  
 6   총 대출 횟수      299315 non-null  int64  
 7   rent_count   299315 non-null  int64  
 8   book_count   299315 non-null  int64  
 9   수서방법_기타      299315 non-null  bool   
 10  수서방법_사서선정    299315 non-null  bool   
 11  수서방법_수서정보없음  299315 non-null  bool   
 12  수서방법_수업지정    299315 non-null  bool   
 13  수서방법_이용자희망   299315 non-null  bool   
 14  수서방법_학과신청    299315 non-null  bool   
 15  소장위치_4층인문    299315 non-null  bool   
 16  소장위치_보존서고    299315 non-null  bool   
dtypes: bool(8), float64(1), int64(8)
memory usage: 22.8 MB
