# book 테이블 초기 데이터 입력

### 데이터 전처리

In [6]:
import pandas as pd
import numpy as np
import re

# 단행본(도서)정보.txt
pre_df = pd.read_csv('../Data/단행본(도서)정보.txt', sep=',', encoding='EUC-KR')

# 분류코드 전처리
pre_df = pre_df[pre_df['분류코드'] != '0198129009 (v.2)']
pre_df['분류코드'] = pre_df['분류코드'].str.replace(',', '.')
pre_df['분류코드'] = pre_df['분류코드'].str.replace('..', '.')
pre_df['분류코드'] = pre_df['분류코드'].str.extract(r'([\d.]+)')
pre_df['분류코드'] = pre_df['분류코드'].str.rstrip('.')

# 출판년도 전처리
pre_df['출판년도'] = pre_df['출판년도'].apply(lambda x: re.findall(r'\d{4}', str(x)))
def extract_year(years):
    for year in reversed(years):
        if int(year) < 2025:
            return year
    return None
pre_df['출판년도'] = pre_df['출판년도'].apply(extract_year)
pre_df = pre_df.dropna(subset=['출판년도'])

# ISBN 전처리
def clean_isbn(isbn):
    isbn = re.sub(r'\D', '', isbn)
    return int(isbn) if isbn.isdigit() else None
pre_df['ISBN'] = pre_df['ISBN'].apply(clean_isbn)
pre_df = pre_df.dropna(subset=['ISBN'])
pre_df['ISBN'] = pre_df['ISBN'].astype(object)

# NULL 데이터 제거
pre_df['출판사'] = pre_df['출판사'].fillna('출판사 없음')
pre_df['저자'] = pre_df['저자'].fillna('저자 없음')

# 등록일자 연/월 분리
pre_df['등록연도'] = pd.to_datetime(pre_df['등록일자']).dt.year
pre_df['등록월'] = pd.to_datetime(pre_df['등록일자']).dt.month
pre_df['등록연도'] = pre_df['등록연도'].astype(object)
pre_df['등록월'] = pre_df['등록월'].astype(object)

pre_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299315 entries, 0 to 301150
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   도서ID    299315 non-null  object
 1   등록일자    299315 non-null  object
 2   수서방법    299315 non-null  object
 3   분류코드    299315 non-null  object
 4   ISBN    299315 non-null  object
 5   서명      299315 non-null  object
 6   저자      299315 non-null  object
 7   출판사     299315 non-null  object
 8   출판년도    299315 non-null  object
 9   소장위치    299315 non-null  object
 10  등록연도    299315 non-null  object
 11  등록월     299315 non-null  object
dtypes: object(12)
memory usage: 29.7+ MB


### DB에 데이터 삽입

In [7]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        # 트랜잭션 시작
        total_rows = len(values_list)
        for i in range(0, total_rows, batch_size):
            batch = values_list[i : i + batch_size]
            cursor.executemany(query, batch)  # Batch Insert
            conn.commit()  # Batch마다 커밋 (너무 자주 커밋하지 않음)
            print(f'Current Batch : {i + batch_size}')

        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO book 
(ID, registration_year, registration_month, get_course, DDC, ISBN, title, author, publisher, publication_year, location) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# 데이터 준비
values_list = [
    (
        pre_df.iloc[i, 0], pre_df.iloc[i, 10], pre_df.iloc[i, 11], pre_df.iloc[i, 2], 
        pre_df.iloc[i, 3], pre_df.iloc[i, 4], pre_df.iloc[i, 5], pre_df.iloc[i, 6], 
        pre_df.iloc[i, 7], pre_df.iloc[i, 8], pre_df.iloc[i, 9]
    )
    for i in range(len(pre_df))
]

# Batch Insert 실행
batch_insert(query, values_list, batch_size=20000)


Current Batch : 20000
Current Batch : 40000
Current Batch : 60000
Current Batch : 80000
Current Batch : 100000
Current Batch : 120000
Current Batch : 140000
Current Batch : 160000
Current Batch : 180000
Current Batch : 200000
Current Batch : 220000
Current Batch : 240000
Current Batch : 260000
Current Batch : 280000
Current Batch : 300000


# rent 테이블 초기 데이터 입력

### 데이터 전처리

In [8]:
import pandas as pd
import numpy as np

# 대출정보.txt
rent_df = pd.read_csv('../Data/대출정보.txt', sep=',', encoding='EUC-KR')

# '대출일시'에서 월 추출
rent_df['대출월'] = pd.to_datetime(rent_df['대출일시']).dt.month

# '대출월' 기준으로 학기와 비학기 구분
rent_df['TAG'] = rent_df['대출월'].apply(lambda x: '방학' if x in [1, 2, 7, 8] else '학기')

rent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916709 entries, 0 to 916708
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   도서ID    916709 non-null  object
 1   대출일시    916709 non-null  object
 2   대출월     916709 non-null  int32 
 3   TAG     916709 non-null  object
dtypes: int32(1), object(3)
memory usage: 24.5+ MB


### DB에 데이터 삽입

In [9]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        # 트랜잭션 시작
        total_rows = len(values_list)
        for i in range(0, total_rows, batch_size):
            batch = values_list[i : i + batch_size]
            cursor.executemany(query, batch)  # Batch Insert
            conn.commit()  # Batch마다 커밋 (너무 자주 커밋하지 않음)
            print(f'Current Batch : {i + batch_size}')

        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO rent 
(ID, rent_date, TAG) 
VALUES (%s, %s, %s)
"""

# 데이터 준비
values_list = [
    (
        rent_df.iloc[i, 0], rent_df.iloc[i, 1], rent_df.iloc[i, 3]
    )
    for i in range(len(rent_df))
]

# Batch Insert 실행
batch_insert(query, values_list, batch_size=20000)


Current Batch : 20000
Current Batch : 40000
Current Batch : 60000
Current Batch : 80000
Current Batch : 100000
Current Batch : 120000
Current Batch : 140000
Current Batch : 160000
Current Batch : 180000
Current Batch : 200000
Current Batch : 220000
Current Batch : 240000
Current Batch : 260000
Current Batch : 280000
Current Batch : 300000
Current Batch : 320000
Current Batch : 340000
Current Batch : 360000
Current Batch : 380000
Current Batch : 400000
Current Batch : 420000
Current Batch : 440000
Current Batch : 460000
Current Batch : 480000
Current Batch : 500000
Current Batch : 520000
Current Batch : 540000
Current Batch : 560000
Current Batch : 580000
Current Batch : 600000
Current Batch : 620000
Current Batch : 640000
Current Batch : 660000
Current Batch : 680000
Current Batch : 700000
Current Batch : 720000
Current Batch : 740000
Current Batch : 760000
Current Batch : 780000
Current Batch : 800000
Current Batch : 820000
Current Batch : 840000
Current Batch : 860000
Current Batch :

# rent_count 테이블 초기 데이터 입력

### 데이터 전처리

In [10]:
import pandas as pd
import numpy as np
from collections import defaultdict

# 대출정보.txt
rent_df = pd.read_csv('../Data/대출정보.txt', sep=',', encoding='EUC-KR')

# '대출일시'에서 월 추출
rent_df['대출연도'] = pd.to_datetime(rent_df['대출일시']).dt.year

count = defaultdict(lambda: {year: 0 for year in range(2004, 2025)})

# 데이터 처리
for i in range(1, len(rent_df)):
    book_id = rent_df.iloc[i, 0]  # 도서 ID
    year = rent_df.iloc[i, 2]

    if 2004 <= year <= 2024:  # 유효한 연도만 처리
        count[book_id][year] += 1

# 결과를 데이터프레임으로 변환 (선택)
count_df = pd.DataFrame.from_dict(count, orient='index')
count_df.reset_index(inplace=True)
count_df.rename(columns={'index': 'ID'}, inplace=True)

count_df = count_df.astype(object)

In [11]:
count_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160872 entries, 0 to 160871
Data columns (total 22 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      160872 non-null  object
 1   2004    160872 non-null  object
 2   2005    160872 non-null  object
 3   2006    160872 non-null  object
 4   2007    160872 non-null  object
 5   2008    160872 non-null  object
 6   2009    160872 non-null  object
 7   2010    160872 non-null  object
 8   2011    160872 non-null  object
 9   2012    160872 non-null  object
 10  2013    160872 non-null  object
 11  2014    160872 non-null  object
 12  2015    160872 non-null  object
 13  2016    160872 non-null  object
 14  2017    160872 non-null  object
 15  2018    160872 non-null  object
 16  2019    160872 non-null  object
 17  2020    160872 non-null  object
 18  2021    160872 non-null  object
 19  2022    160872 non-null  object
 20  2023    160872 non-null  object
 21  2024    160872 non-null  object
d

### DB에 데이터 삽입

In [12]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        # 트랜잭션 시작
        total_rows = len(values_list)
        for i in range(0, total_rows, batch_size):
            batch = values_list[i : i + batch_size]
            cursor.executemany(query, batch)  # Batch Insert
            conn.commit()  # Batch마다 커밋 (너무 자주 커밋하지 않음)
            print(f'Current Batch : {i + batch_size}')

        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO rent_count 
(ID, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`, `2010`, `2011`, `2012`, `2013`, 
`2014`, `2015`, `2016`, `2017`, `2018`, `2019`, `2020`, `2021`, `2022`, `2023`, `2024`) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# 데이터 준비
values_list = [
    (
        count_df.iloc[i, 0], count_df.iloc[i, 1], count_df.iloc[i, 2], count_df.iloc[i, 3], count_df.iloc[i, 4], count_df.iloc[i, 5], 
        count_df.iloc[i, 6], count_df.iloc[i, 7], count_df.iloc[i, 8], count_df.iloc[i, 9], count_df.iloc[i, 10], count_df.iloc[i, 11], 
        count_df.iloc[i, 12], count_df.iloc[i, 13], count_df.iloc[i, 14], count_df.iloc[i, 15], count_df.iloc[i, 16], count_df.iloc[i, 17], 
        count_df.iloc[i, 18], count_df.iloc[i, 19], count_df.iloc[i, 20], count_df.iloc[i, 21]
    )
    for i in range(len(count_df))
]

# Batch Insert 실행
batch_insert(query, values_list, batch_size=20000)


Current Batch : 20000
Current Batch : 40000
Current Batch : 60000
Current Batch : 80000
Current Batch : 100000
Current Batch : 120000
Current Batch : 140000
Current Batch : 160000
Current Batch : 180000


# DDC_count 테이블 초기 데이터 입력

### 데이터 전처리

In [13]:
class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

try:
    conn = Database.get_connection()
    cursor = conn.cursor()
    
    cursor.execute("SELECT ID, DDC FROM book WHERE location = '4층인문'")

    rows = cursor.fetchall()
    
    cursor.close()
except connector.Error as e:
    print(f"Error: {e}")

DDC_df = pd.DataFrame(rows, columns=['ID', 'DDC'])
DDC_df['DDC'] = DDC_df['DDC'].astype(float)

bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
group_name = ['0', '100', '200', '300', '400', '500', '600', '700', '800', '900']

DDC_df['분류'] = pd.cut(DDC_df['DDC'], bins, labels = group_name)

# 분류별 데이터 카운트
category_counts = DDC_df['분류'].value_counts().sort_index()

### DB에 데이터 삽입

In [14]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        cursor.execute(query, values_list)
        conn.commit()
        
        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO DDC_count 
(`0`, `100`, `200`, `300`, `400`, `500`, `600`, `700`, `800`, `900`) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# 데이터 준비
values_list = category_counts.astype(str).tolist()

# Batch Insert 실행
batch_insert(query, values_list, batch_size=1)


# recent_rent 테이블 초기 데이터 입력

### 데이터 전처리

In [15]:
class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

try:
    conn = Database.get_connection()
    cursor = conn.cursor()
    
    cursor.execute("SELECT ID, rent_date FROM rent")

    rows = cursor.fetchall()
    
    cursor.close()
except connector.Error as e:
    print(f"Error: {e}")

rent_df = pd.DataFrame(rows, columns=['ID', '대여날짜'])

rent_df['대여날짜'] = pd.to_datetime(rent_df['대여날짜'])

# 도서 ID별 가장 최근 대출 날짜 추출
latest_rent = rent_df.groupby('ID')['대여날짜'].max()

# 딕셔너리로 변환
latest_rent_dict = latest_rent.to_dict()

# 결과를 데이터프레임으로 변환 (선택)
recent_df = pd.DataFrame.from_dict(latest_rent_dict, orient='index')
recent_df.reset_index(inplace=True)
recent_df.rename(columns={'index': 'ID'}, inplace=True)

default_date = pd.to_datetime('2024-10-31')

recent_df['대여날짜'] = pd.to_datetime(recent_df[0])

recent_df['Delta'] = (default_date - recent_df['대여날짜']).dt.days

recent_df.drop(columns=[0], inplace=True)

recent_df['Delta'] = recent_df['Delta'].astype(object)

### DB에 데이터 삽입

In [16]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        # 트랜잭션 시작
        total_rows = len(values_list)
        for i in range(0, total_rows, batch_size):
            batch = values_list[i : i + batch_size]
            cursor.executemany(query, batch)  # Batch Insert
            conn.commit()  # Batch마다 커밋 (너무 자주 커밋하지 않음)
            print(f'Current Batch : {i + batch_size}')

        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO recent_rent 
(ID, duration) 
VALUES (%s, %s)
"""

# 데이터 준비
values_list = [
    (
        recent_df.iloc[i, 0], recent_df.iloc[i, 2], 
    )
    for i in range(len(recent_df))
]

# Batch Insert 실행
batch_insert(query, values_list, batch_size=20000)


Current Batch : 20000
Current Batch : 40000
Current Batch : 60000
Current Batch : 80000
Current Batch : 100000
Current Batch : 120000
Current Batch : 140000
Current Batch : 160000
Current Batch : 180000


# ISBN_rent_count 테이블 초기 데이터 입력

### 데이터 전처리

In [19]:
import pandas as pd
import numpy as np
import re
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

try:
    conn = Database.get_connection()
    cursor = conn.cursor()
    
    cursor.execute("SELECT ID, rent_date, ISBN FROM rent NATURAL JOIN book WHERE rent.ID = book.ID")

    rows = cursor.fetchall()
    
    cursor.close()
except connector.Error as e:
    print(f"Error: {e}")

ISBN_df = pd.DataFrame(rows, columns=['ID', 'rent_date', 'ISBN'])
ISBN_df['rent_year'] = pd.to_datetime(ISBN_df['rent_date']).dt.year

# 문자열 기준으로 0인 데이터 제거
ISBN_df = ISBN_df[ISBN_df['ISBN'] != '0']

# ISBN을 기준으로 그룹화
result_df = ISBN_df.groupby('ISBN').agg(
    도서ID개수=('ID', lambda x: x.nunique()),  # 고유 도서 ID 개수
).reset_index()

# 연도별 카운트 초기화
years = list(range(2004, 2025))
for year in years:
    result_df[year] = 0  # 기본값 0 설정

# 연도별 카운트 업데이트
year_counts = ISBN_df.groupby(['ISBN', 'rent_year']).size().unstack(fill_value=0)

for year in years:
    if year in year_counts.columns:
        result_df[year] = result_df['ISBN'].map(year_counts[year])

result_df = result_df.astype(object)

result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92443 entries, 0 to 92442
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ISBN    92443 non-null  object
 1   도서ID개수  92443 non-null  object
 2   2004    92443 non-null  object
 3   2005    92443 non-null  object
 4   2006    92443 non-null  object
 5   2007    92443 non-null  object
 6   2008    92443 non-null  object
 7   2009    92443 non-null  object
 8   2010    92443 non-null  object
 9   2011    92443 non-null  object
 10  2012    92443 non-null  object
 11  2013    92443 non-null  object
 12  2014    92443 non-null  object
 13  2015    92443 non-null  object
 14  2016    92443 non-null  object
 15  2017    92443 non-null  object
 16  2018    92443 non-null  object
 17  2019    92443 non-null  object
 18  2020    92443 non-null  object
 19  2021    92443 non-null  object
 20  2022    92443 non-null  object
 21  2023    92443 non-null  object
 22  2024    92443 non-null

### DB에 데이터 삽입

In [23]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        # 트랜잭션 시작
        total_rows = len(values_list)
        for i in range(0, total_rows, batch_size):
            batch = values_list[i : i + batch_size]
            cursor.executemany(query, batch)  # Batch Insert
            conn.commit()  # Batch마다 커밋 (너무 자주 커밋하지 않음)
            print(f'Current Batch : {i + batch_size}')

        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO ISBN_rent_count 
(ISBN, ID_count, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`, `2010`, `2011`, `2012`, `2013`, 
`2014`, `2015`, `2016`, `2017`, `2018`, `2019`, `2020`, `2021`, `2022`, `2023`, `2024`) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# 데이터 준비
values_list = [
    (
        result_df.iloc[i, 0], result_df.iloc[i, 1], result_df.iloc[i, 2], result_df.iloc[i, 3], result_df.iloc[i, 4], result_df.iloc[i, 5], 
        result_df.iloc[i, 6], result_df.iloc[i, 7], result_df.iloc[i, 8], result_df.iloc[i, 9], result_df.iloc[i, 10], result_df.iloc[i, 11], 
        result_df.iloc[i, 12], result_df.iloc[i, 13], result_df.iloc[i, 14], result_df.iloc[i, 15], result_df.iloc[i, 16], result_df.iloc[i, 17], 
        result_df.iloc[i, 18], result_df.iloc[i, 19], result_df.iloc[i, 20], result_df.iloc[i, 21], result_df.iloc[i, 22]
    )
    for i in range(len(result_df))
]

# Batch Insert 실행
batch_insert(query, values_list, batch_size=20000)


Current Batch : 20000
Current Batch : 40000
Current Batch : 60000
Current Batch : 80000
Current Batch : 100000


# None_ISBN_rent_count 테이블 초기 데이터 입력

### 데이터 전처리

In [27]:
import pandas as pd
import numpy as np
import re
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

try:
    conn = Database.get_connection()
    cursor = conn.cursor()
    
    cursor.execute("SELECT ID, rent_date, ISBN, title FROM rent NATURAL JOIN book WHERE rent.ID = book.ID")

    rows = cursor.fetchall()
    
    cursor.close()
except connector.Error as e:
    print(f"Error: {e}")

ISBN_df = pd.DataFrame(rows, columns=['ID', 'rent_date', 'ISBN', '제목'])
ISBN_df['rent_year'] = pd.to_datetime(ISBN_df['rent_date']).dt.year

# 문자열 기준으로 0이 아닌 데이터 제거
ISBN_df = ISBN_df[ISBN_df['ISBN'] == '0']

# ISBN을 기준으로 그룹화
result_df = ISBN_df.groupby('제목').agg(
    도서ID개수=('ID', lambda x: x.nunique()),  # 고유 도서 ID 개수
).reset_index()

# 연도별 카운트 초기화
years = list(range(2004, 2025))
for year in years:
    result_df[year] = 0  # 기본값 0 설정

# 연도별 카운트 업데이트
year_counts = ISBN_df.groupby(['제목', 'rent_year']).size().unstack(fill_value=0)

for year in years:
    if year in year_counts.columns:
        result_df[year] = result_df['제목'].map(year_counts[year])

result_df = result_df.astype(object)

result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6967 entries, 0 to 6966
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   제목      6967 non-null   object
 1   도서ID개수  6967 non-null   object
 2   2004    6967 non-null   object
 3   2005    6967 non-null   object
 4   2006    6967 non-null   object
 5   2007    6967 non-null   object
 6   2008    6967 non-null   object
 7   2009    6967 non-null   object
 8   2010    6967 non-null   object
 9   2011    6967 non-null   object
 10  2012    6967 non-null   object
 11  2013    6967 non-null   object
 12  2014    6967 non-null   object
 13  2015    6967 non-null   object
 14  2016    6967 non-null   object
 15  2017    6967 non-null   object
 16  2018    6967 non-null   object
 17  2019    6967 non-null   object
 18  2020    6967 non-null   object
 19  2021    6967 non-null   object
 20  2022    6967 non-null   object
 21  2023    6967 non-null   object
 22  2024    6967 non-null   

### DB에 데이터 삽입

In [29]:
import os
from mysql import connector
from dotenv import load_dotenv

# Access environment variables
load_dotenv()

NAME = os.getenv("DB_NAME")
USER = os.getenv("DB_USER")
PASSWORD = os.getenv("DB_PASSWORD")
IP = os.getenv("DB_IP")
PORT = os.getenv("DB_PORT")

class Database:
    _connection = None  # 클래스 변수로 연결 상태 관리

    @staticmethod
    def get_connection():
        if Database._connection is None or not Database._connection.is_connected():
            Database._connection = connector.connect(
                database = NAME,
                user = USER,
                password = PASSWORD,
                host = IP,
                port = PORT
            )
        return Database._connection

def batch_insert(query, values_list, batch_size=1000):
    try:
        conn = Database.get_connection()
        cursor = conn.cursor()

        # 트랜잭션 시작
        total_rows = len(values_list)
        for i in range(0, total_rows, batch_size):
            batch = values_list[i : i + batch_size]
            cursor.executemany(query, batch)  # Batch Insert
            conn.commit()  # Batch마다 커밋 (너무 자주 커밋하지 않음)
            print(f'Current Batch : {i + batch_size}')

        cursor.close()
    except connector.Error as e:
        print(f"Error: {e}")
        conn.rollback()  # 오류 발생 시 롤백


# SQL 쿼리
query = """
INSERT INTO None_ISBN_rent_count 
(title, ID_count, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`, `2010`, `2011`, `2012`, `2013`, 
`2014`, `2015`, `2016`, `2017`, `2018`, `2019`, `2020`, `2021`, `2022`, `2023`, `2024`) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# 데이터 준비
values_list = [
    (
        result_df.iloc[i, 0], result_df.iloc[i, 1], result_df.iloc[i, 2], result_df.iloc[i, 3], result_df.iloc[i, 4], result_df.iloc[i, 5], 
        result_df.iloc[i, 6], result_df.iloc[i, 7], result_df.iloc[i, 8], result_df.iloc[i, 9], result_df.iloc[i, 10], result_df.iloc[i, 11], 
        result_df.iloc[i, 12], result_df.iloc[i, 13], result_df.iloc[i, 14], result_df.iloc[i, 15], result_df.iloc[i, 16], result_df.iloc[i, 17], 
        result_df.iloc[i, 18], result_df.iloc[i, 19], result_df.iloc[i, 20], result_df.iloc[i, 21], result_df.iloc[i, 22]
    )
    for i in range(len(result_df))
]

# Batch Insert 실행
batch_insert(query, values_list, batch_size=20000)


Current Batch : 20000
