# MySQL

In [None]:
# MySQL <=> Python(Pandas)
# pandas와 DB 연결 라이브러리: sqlalchemy

!python -m pip install pandas 
!python -m pip install sqlalchemy
!python -m pip install pymysql
!python -m pip install cryptography

In [None]:
# streamlit(streamlit_app.py 파일)

!python -m pip install streamlit
!python -m pip install mysql-connector-python

In [None]:
# 실전 분석 보고서 & 자동화
!python -m pip install xlsxwriter
!python -m pip install python-docx
!python -m pip install reportlab

In [None]:
# 1. Build a small data set with pandas

import pandas as pd

# Column-oriented input: one list of values per column name.
data = dict(
  name=['hong1', 'hong2', 'hong3'],
  age=[30, 31, 32],
)
df_users = pd.DataFrame.from_dict(data)
print(df_users)

In [None]:
# 2. Connect to MySQL (SQLAlchemy)
import os

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Connection settings are taken from the environment when provided. The
# fallbacks are the original local tutorial values, so the cell behaves exactly
# as before when no environment variables are set.
user = os.environ.get('MYSQL_USER', 'root')
password = os.environ.get('MYSQL_PASSWORD', '1234')
host = os.environ.get('MYSQL_HOST', '127.0.0.1')
port = int(os.environ.get('MYSQL_PORT', 3306))
database = os.environ.get('MYSQL_DATABASE', 'analysis')

# URL.create escapes special characters (for example '@' or '/' in a password)
# that would corrupt a hand-formatted connection string.
engine = create_engine(
  URL.create(
    "mysql+pymysql",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database,
  )
)

In [None]:
# 3. pandas data -> MySQL

from sqlalchemy import create_engine
import pandas as pd

# Local MySQL connection settings (schema: analysis).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'analysis'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Sample rows to persist.
df_users = pd.DataFrame(
  {
    'user_name': ['hong1', 'hong2', 'hong3'],
    'age': [30, 31, 32],
  }
)

# Write to the `users` table without the DataFrame index; an existing table
# of that name is replaced.
df_users.to_sql(name='users', con=engine, if_exists='replace', index=False)
print("MySQL 데이터 저장 완료")

In [None]:
# 4. MySQL -> pandas

# pandas is imported here so this cell does not depend on an earlier cell
# having already bound `pd`.
import pandas as pd
from sqlalchemy import create_engine

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'analysis'

engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# Read the whole `users` table back into a DataFrame.
df_from_db = pd.read_sql("SELECT * FROM users", con=engine)
print("MySQL > Pandas:\n", df_from_db)

In [None]:
# MySQL2_Q 8-8 Group aggregation with pandas & visualization

import matplotlib.pyplot as plt
import matplotlib as mpl
from sqlalchemy import create_engine
import pandas as pd

# Font used for the Korean labels; unicode_minus=False makes matplotlib draw
# minus signs with the ASCII hyphen.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# Average salary per department, rounded to two decimals.
query = '''
  SELECT d.dept_name AS 부서명,
  ROUND(AVG(s.salary), 2) AS 평균급여
  FROM departments d
  JOIN employees_salary s ON d.dept_id = s.dept_id
  GROUP BY d.dept_name;
'''

df = pd.read_sql(query, engine)

df.plot(kind='bar', x='부서명', y='평균급여', figsize=(10, 6), legend=False)
plt.title('부서별 평균 급여')
plt.xlabel('부서명')
plt.ylabel('평균 급여')
plt.xticks(rotation=0)
# Bars sit at x = 0 .. len(df)-1; keep 0.7 of padding on both sides for however
# many departments the query returns (the old fixed limit assumed exactly 3).
plt.xlim(-0.7, len(df) - 0.3)
plt.show()


In [None]:
# MySQL2_Q 9-8 Load the result of a JOIN query into pandas

import pandas as pd
from sqlalchemy import create_engine

# This cell connects as `data_id`; the URL carries no explicit port.
db_url = "mysql+pymysql://data_id:1234@127.0.0.1/data_collection"
engine = create_engine(db_url)

# Name, e-mail and department name of every employee that has a matching
# department row (INNER JOIN on dept_id).
query = """
  SELECT e.emp_name AS 이름,
  e.email AS 이메일,
  d.dept_name AS 부서명
  FROM employees e
  INNER JOIN departments d
  ON e.dept_id = d.dept_id;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL2_Q 9-9 Load a JOIN query into pandas and visualize it

import matplotlib.pyplot as plt
import matplotlib as mpl
from sqlalchemy import create_engine
import pandas as pd

# Font used for the Korean labels; unicode_minus=False makes matplotlib draw
# minus signs with the ASCII hyphen.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# Head count per department; LEFT JOIN keeps departments with no employees
# (COUNT(e.emp_id) is 0 for them).
query = '''
  SELECT d.dept_name AS 부서명,
  COUNT(e.emp_id) AS 직원수
  FROM departments d
  LEFT JOIN employees e
  ON d.dept_id = e.dept_id
  GROUP BY d.dept_name
'''

df = pd.read_sql(query, engine)

# x/y are given explicitly so the ticks show department names; without x= the
# bars are labelled with the row index 0..n-1 even though the axis is titled
# with the department-name label.
df.plot(kind='bar', x='부서명', y='직원수', figsize=(10, 6), legend=False)
plt.title('부서별 직원수')
plt.xlabel('부서명')
plt.ylabel('직원수')
plt.xticks(rotation=0)
# Bars sit at x = 0 .. len(df)-1; keep 0.7 of padding on both sides for however
# many departments the query returns (the old fixed limit assumed exactly 4).
plt.xlim(-0.7, len(df) - 0.3)
plt.show()


In [None]:
# MySQL2_Q 10-8 SELF JOIN result in pandas

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Pairs of employees in the same department; e1.emp_id < e2.emp_id lists each
# pair once and never pairs an employee with themselves.
query = """
  SELECT
  	e1.emp_name AS 직원1,
  	e2.emp_name AS 직원2,
  	d.dept_name AS 부서명
  FROM employees e1
  JOIN employees e2
  	ON e1.dept_id = e2.dept_id
  	AND e1.emp_id < e2.emp_id
  JOIN departments d
  	ON e1.dept_id = d.dept_id;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 9-9-1 Subquery in the WHERE clause (salary at or above the average)

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Employees whose salary is >= the average over all of employees_salary.
query = """
  SELECT
  	e.emp_name AS 직원명,
    s.salary AS 급여
  FROM employees e
  JOIN employees_salary s
  	ON e.emp_id = s.emp_id
  WHERE s.salary >= (
  	SELECT AVG(salary) FROM employees_salary
  );
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 9-9-2 Scalar subquery in the SELECT clause (print the overall average alongside)

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Every employee's salary next to the average over all of employees_salary.
query = """
  SELECT
  	e.emp_name AS 직원명,
      s.salary AS 급여,
      (SELECT AVG(salary) FROM employees_salary) AS 전체평균
  FROM employees e
  JOIN employees_salary s
  	ON e.emp_id = s.emp_id;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 9-9-3 Inline subquery in the FROM clause (department average salary >= 4500)

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# The derived table computes the per-department average; the outer query keeps
# only departments whose average is at least 4500.
query = """
  SELECT
  	부서명, 평균급여
  FROM (
  	SELECT dept_name AS 부서명, AVG(s.salary) AS 평균급여
      FROM departments d
      JOIN employees_salary s
  		ON d.dept_id = s.dept_id
  	GROUP BY dept_name
  ) AS dept_avg
  WHERE 평균급여 >= 4500;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 9-9-4 Using a VIEW
# VIEW = v_dept_salary_summary

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Select everything the view exposes.
query = """
  SELECT * FROM v_dept_salary_summary;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 9-9-5 Using VIEW + JOIN
# VIEW = v_dept_salary_summary

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Pick three named columns from the view through the alias `v`.
query = """
  SELECT v.부서명, v.인원수, v.평균급여
  FROM v_dept_salary_summary v;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 9-9-6 Using a VIEW (employees with salary at or above the average)
# VIEW = v_salary

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# All rows of the view, highest salary first.
query = """
  SELECT * FROM v_salary
  ORDER BY 급여 DESC;
"""

df = pd.read_sql(sql=query, con=engine)
display(df)

In [None]:
# MySQL3_Q 10-8-1 Inspect the data

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Full contents of the table used by the index experiments.
query = """
  SELECT * FROM performance_test
"""

df = pd.read_sql(sql=query, con=engine)
print(df)

In [None]:
# MySQL3_Q 10-8-2 Query performance without an index

from sqlalchemy import create_engine, text
import pandas as pd
import time

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

query = '''
  SELECT * FROM performance_test WHERE age >= 40;
'''

with engine.begin() as conn:
  # Look the index up before dropping it: an unconditional DROP INDEX raises
  # when idx_age was never created or has already been removed, which made the
  # cell fail depending on what had been run before.
  index_exists = conn.execute(
    text("SHOW INDEX FROM performance_test WHERE Key_name = 'idx_age';")
  ).first() is not None
  if index_exists:
    conn.execute(text("DROP INDEX idx_age ON performance_test;"))
    print("인덱스 삭제 완료\n")
  else:
    print("삭제할 인덱스 없음\n")

# perf_counter is the clock intended for measuring short intervals.
start = time.perf_counter()
df = pd.read_sql(query, engine)
end = time.perf_counter()

print(df)
# The interval measured is the SQL query round trip, so it is labelled as query
# time (the previous label called it a Python list operation).
print("쿼리 실행 시간:", end - start)

In [None]:
# MySQL3_Q 10-8-3 Query plan/performance after creating an index

from sqlalchemy import create_engine, text
import pandas as pd
import time

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# EXPLAIN returns the execution plan for the SELECT rather than its rows.
query = '''
  EXPLAIN SELECT * FROM performance_test WHERE age >= 40;
'''

with engine.begin() as conn:
  # Create the index only when it is missing: re-running the cell used to fail
  # with a duplicate-key-name error because idx_age already existed.
  index_exists = conn.execute(
    text("SHOW INDEX FROM performance_test WHERE Key_name = 'idx_age';")
  ).first() is not None
  if index_exists:
    print("인덱스가 이미 존재함\n")
  else:
    conn.execute(text("CREATE INDEX idx_age ON performance_test(age);"))
    print("인덱스 생성 완료\n")

# perf_counter is the clock intended for measuring short intervals.
start = time.perf_counter()
df = pd.read_sql(query, engine)
end = time.perf_counter()

print(df)
# The interval measured is the SQL round trip, so it is labelled as query time
# (the previous label called it a Python list operation).
print("쿼리 실행 시간:", end - start)


In [None]:
# MySQL3_Q 10-8-4 Query plan/performance after creating a composite index

from sqlalchemy import create_engine, text
import pandas as pd
import time

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# EXPLAIN returns the execution plan for the SELECT rather than its rows.
query = '''
  EXPLAIN SELECT * FROM performance_test WHERE age >= 40;
'''

with engine.begin() as conn:
  # Create the index only when it is missing, so a leftover index from an
  # interrupted earlier run does not make the cell fail.
  index_exists = conn.execute(
    text("SHOW INDEX FROM performance_test WHERE Key_name = 'idx_age_date';")
  ).first() is not None
  if index_exists:
    print("인덱스가 이미 존재함\n")
  else:
    conn.execute(text("CREATE INDEX idx_age_date ON performance_test(age, join_date);"))
    print("인덱스 생성 완료\n")

try:
  # perf_counter is the clock intended for measuring short intervals.
  start = time.perf_counter()
  df = pd.read_sql(query, engine)
  end = time.perf_counter()

  print(df)
  # Labelled as query time (the previous label called it a Python list operation).
  print("쿼리 실행 시간:", end - start)
finally:
  # Always remove the temporary index, even if the query above raised, so the
  # table is left as it was found and the cell can be run again.
  with engine.begin() as conn:
    conn.execute(text("DROP INDEX idx_age_date ON performance_test;"))
    print("\n인덱스 삭제 완료")
    

In [None]:
# MySQL3_Q 11-3-1 Query the accounts table

import pandas as pd
from sqlalchemy import create_engine, text

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Current state of every account.
query = """
  SELECT * FROM accounts
"""
df = pd.read_sql(sql=query, con=engine)
print(df)

In [None]:
# MySQL3_Q 11-3-2 Account transfer without a transaction (risky example)

from sqlalchemy import create_engine, text
import pandas as pd

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# AUTOCOMMIT makes each statement commit on its own, which is what "without a
# transaction" means here: if the second UPDATE failed, the first would already
# be persisted. (engine.begin() would wrap both UPDATEs in one transaction and
# therefore would not show the risk this example is about.)
with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
  conn.execute(text("UPDATE accounts SET balance = balance - 30000 WHERE user_id = 1;"))
  conn.execute(text("UPDATE accounts SET balance = balance + 30000 WHERE user_id = 2;"))

# Show the balances after the transfer.
query = '''
  SELECT * FROM accounts
'''
df = pd.read_sql(query, engine)
print(df)

In [None]:
# MySQL3_Q 11-3-3 Using a transaction - COMMIT

from sqlalchemy import create_engine, text
import pandas as pd

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

try:
  # engine.begin() commits when the block exits normally and rolls back when an
  # exception leaves it, so neither commit() nor rollback() is called by hand.
  # The try/except sits outside the block so the exception actually reaches
  # the context manager.
  with engine.begin() as conn:
    # The transfer itself.
    conn.execute(text("UPDATE accounts SET balance = balance - 30000 WHERE user_id = 1;"))
    conn.execute(text("UPDATE accounts SET balance = balance + 30000 WHERE user_id = 2;"))

    # Check the sender's balance inside the same transaction.
    result = conn.execute(text("SELECT balance FROM accounts WHERE user_id = 1;"))
    balance = result.fetchone()[0]

    if balance < 0:
      # Raising aborts the block, which rolls both UPDATEs back.
      raise ValueError("Insufficient funds for transfer!")
except Exception as e:
  # Reached after the rollback has already happened.
  print(f"Error: {e}")

# Print the account table.
query = '''
  SELECT * FROM accounts
'''
df = pd.read_sql(query, engine)
print(df)

In [None]:
# MySQL3_Q 11-3-4 ROLLBACK after an error
# Same structure as 11-3-3

from sqlalchemy import create_engine, text
import pandas as pd

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

try:
  # engine.begin() commits when the block exits normally and rolls back when an
  # exception leaves it, so neither commit() nor rollback() is called by hand.
  # The try/except sits outside the block so the exception actually reaches
  # the context manager.
  with engine.begin() as conn:
    # The transfer itself.
    conn.execute(text("UPDATE accounts SET balance = balance - 30000 WHERE user_id = 1;"))
    conn.execute(text("UPDATE accounts SET balance = balance + 30000 WHERE user_id = 2;"))

    # Check the sender's balance inside the same transaction.
    result = conn.execute(text("SELECT balance FROM accounts WHERE user_id = 1;"))
    balance = result.fetchone()[0]

    if balance < 0:
      # Raising aborts the block, which rolls both UPDATEs back.
      raise ValueError("오류 발생!")
except Exception as e:
  # Reached after the rollback has already happened.
  print(f"Error: {e}")

query = '''
  SELECT * FROM accounts
'''

df = pd.read_sql(query, engine)
print(df)

In [None]:
# MySQL3_Q 11-3-5 Check and change the autocommit mode

from sqlalchemy import create_engine, text
import pandas as pd

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# 1 turns autocommit on, 0 turns it off.
autocommit_mode = 0

query = '''
  SELECT @@autocommit;
'''

# The setting is changed and read back on one and the same connection; reading
# through `engine` instead may check out a different connection than the one
# the SET statement ran on.
with engine.connect() as conn:
  # int() guarantees only a number is ever formatted into the statement.
  conn.execute(text(f"SET autocommit = {int(autocommit_mode)};"))
  df = pd.read_sql(query, conn)
  # Discard this connection instead of returning it to the pool, so the changed
  # session setting cannot affect later cells.
  conn.invalidate()

print(df)

In [None]:
# MySQL3_Q 11-3-6 Start a transaction, change data, check the values with SELECT, then ROLLBACK

from sqlalchemy import create_engine, text
import pandas as pd

user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

query = '''
  SELECT * FROM accounts
'''

with engine.connect() as conn:
  trans = conn.begin()
  try:
    # The transfer itself.
    conn.execute(text("UPDATE accounts SET balance = balance - 30000 WHERE user_id = 1;"))
    conn.execute(text("UPDATE accounts SET balance = balance + 30000 WHERE user_id = 2;"))

    # Inspect the uncommitted values. This must go through `conn`: a read via
    # `engine` does not see changes that have not been committed yet.
    df = pd.read_sql(query, conn)
    print(df)
  finally:
    # Always undo the transfer, as the exercise requires. Previously the
    # rollback only ran when the balance went negative; otherwise the block
    # committed the change.
    trans.rollback()

# Confirm the balances are back to their values from before the cell ran.
print(pd.read_sql(query, engine))

In [None]:
# MySQL3_Q 12. Data collection pipeline (crawling/API >> save to MySQL)

from sqlalchemy import create_engine
import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1. Web site to crawl
url = "https://news.ycombinator.com/"

# 2. Send a GET request; the response body holds the page HTML as text.
#    The timeout keeps the cell from hanging on an unresponsive server, and
#    raise_for_status() stops here on an HTTP error instead of parsing an
#    error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# 3. Parse the HTML text with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# 4. Collect the <a> tags that are direct children of span.titleline
links = soup.select('span.titleline > a')

# 5. Keep only the link text
titles = [link.text for link in links]

# 6. Convert to a pandas DataFrame
df = pd.DataFrame(titles, columns=['title'])

# 7. Send to MySQL
user = 'root'
password = '1234'
host = '127.0.0.1'
port = 3306
database = 'data_collection'

# Create the SQLAlchemy engine
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

# 8. Table name: news_titles (created if missing, replaced if present)
if df.empty:
  # if_exists='replace' would overwrite previously collected titles with an
  # empty table, so skip the write when nothing was scraped.
  print("수집된 제목이 없어 저장을 건너뜁니다.")
else:
  try:
    df.to_sql(name='news_titles', con=engine, if_exists='replace', index=False)
    print("MySQL로 전송 완료!")
  except Exception as e:
    print(f"Error 발생: {e}")

In [None]:
# MySQL3_Q 13. Preparing data for analysis (cleaning & EDA preprocessing)
# Create the data set

import os
import pandas as pd

# Four sample orders; the second one has no quantity (None) on purpose so the
# later cells have a missing value to handle.
df = pd.DataFrame({
  "날짜": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-03"],
  "고객명": ["홍길동", "김철수", "홍길동", "김영희"],
  "제품": ["A", "B", "A", "C"],
  "수량": [3, None, 2, 1],
  "가격": [10000, 12000, 10000, 15000],
  "지역": ["서울", "부산", "서울", "대전"]
})

# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, without a separate existence check.
os.makedirs('dataset', exist_ok=True)

df.to_csv("dataset/04_Q13.csv", index=False)

In [None]:
# Load the data

# Read the CSV first, then render the resulting frame.
sales_csv = pd.read_csv("dataset/04_Q13.csv")
display(sales_csv)

In [None]:
# Inspect the data structure & statistics

csv_path = "dataset/04_Q13.csv"
df = pd.read_csv(csv_path)

# Summary of the frame; info() prints its report itself.
df.info()

# Summary statistics; being the last expression, this is the cell's output.
df.describe()

In [None]:
# Missing-value handling

df = pd.read_csv("dataset/04_Q13.csv")
print(f"결측치 수:\n{df.isnull().sum()}")

# Replace a missing quantity with 0, then show the per-column null counts
# again as the cell's output.
df = df.fillna({'수량': 0})
df.isnull().sum()

In [None]:
# Removing duplicates

df = pd.read_csv("dataset/04_Q13.csv")
display(df)

# Keep the first row for each customer name, then renumber the index from 0.
first_row_per_customer = df.drop_duplicates(subset=['고객명'], keep='first')
df_cleaned = first_row_per_customer.reset_index(drop=True)
display(df_cleaned)

In [None]:
# Convert the date column to datetime & create a derived column (sales)

df = pd.read_csv("dataset/04_Q13.csv")

# Missing quantity -> 0, date strings -> datetime64.
df = df.assign(
  수량=df['수량'].fillna(0),
  날짜=pd.to_datetime(df['날짜']),
)
# Sales = quantity x price.
df['매출'] = df['수량'].mul(df['가격'])

print(df)

In [None]:
# Sales by region - visualization

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv("dataset/04_Q13.csv")

df['수량'] = df['수량'].fillna(0)
df['날짜'] = pd.to_datetime(df['날짜'])
df['매출'] = df['수량'] * df['가격']

# Total sales per region, drawn as a bar chart.
region_sales = df.groupby('지역')['매출'].sum()
region_sales.plot(kind='bar', title='지역별 매출', ylabel='총매출', rot=0)
plt.show()

In [None]:
# Compare filling the missing value with the column mean versus with 0

import pandas as pd

csv_path = "dataset/04_Q13.csv"

# Version 1: fill with the mean of the non-missing quantities.
df = pd.read_csv(csv_path)
mean_quantity = df['수량'].mean()
df['수량'] = df['수량'].fillna(mean_quantity)
display(df)

# Version 2: fill with 0.
df2 = pd.read_csv(csv_path)
df2['수량'] = df2['수량'].fillna(0)
display(df2)

In [None]:
# Top 3 products by total quantity sold
# (computed with groupby + sum of the quantity column)

import pandas as pd

df = pd.read_csv("dataset/04_Q13.csv")

df['수량'] = df['수량'].fillna(0)

# Sum the quantity per product, order from largest to smallest, keep three.
quantity_by_product = df.groupby('제품')['수량'].sum()
df_popular = quantity_by_product.sort_values(ascending=False).head(3)

display(df_popular)

In [None]:
# Load with pandas, add a sales column (quantity * price), then create the
# sample_sales table in MySQL and store the data (using sqlalchemy)

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

df = pd.read_csv('dataset/04_Q13.csv')

# Missing quantity counts as 0 units; sales = quantity x price.
df['수량'] = df['수량'].fillna(0)
df['매출'] = df['수량'].mul(df['가격'])

# An existing sample_sales table is replaced; the index is not stored.
df.to_sql(name='sample_sales', con=engine, if_exists='replace', index=False)
print("sample_sales 테이블 저장 완료")

In [None]:
# Read the sample_sales table back from MySQL.
import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

query = """
  SELECT * FROM sample_sales
"""

df = pd.read_sql(sql=query, con=engine)
print(df)

In [None]:
# MySQL3_Q 14. Practical data analysis project 1 - sales analysis
# Load the data and create the sales column

import pandas as pd

# Read the CSV and treat a missing quantity as 0 units.
df = pd.read_csv('dataset/04_Q13.csv').fillna({'수량': 0})

# Sales = quantity x price.
df['매출'] = df['수량'].mul(df['가격'])

In [None]:
# Basic analysis (total sales, average unit price, number of customers)

import pandas as pd

# Read the CSV and treat a missing quantity as 0 units.
df = pd.read_csv('dataset/04_Q13.csv').fillna({'수량': 0})

# Sales = quantity x price.
df['매출'] = df['수량'].mul(df['가격'])

# Total sales, mean of the price column, count of distinct customer names.
print(f"총 매출: {df['매출'].sum()}")
print(f"평균 단가: {df['가격'].mean()}")
print(f"고객 수: {df['고객명'].nunique()}")

In [None]:
# Sales by product and by region - visualization

import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('dataset/04_Q13.csv')

df['수량'] = df['수량'].fillna(0)
df['매출'] = df['수량'] * df['가격']

df_1 = df.groupby('제품')['매출'].sum()
df_2 = df.groupby('지역')['매출'].sum()

# One figure with two panels: bar chart on the left, pie chart on the right.
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))

ax_bar.bar(df_1.index, df_1.values, width=0.5)
ax_bar.set_title('제품별 매출')
ax_bar.set_xlabel('제품')
ax_bar.set_ylabel('매출')
# Bars sit at x = 0 .. len(df_1)-1; keep 0.6 of padding on both sides for any
# number of products (the old fixed limit assumed exactly 3).
ax_bar.set_xlim(-0.6, len(df_1) - 0.4)

ax_pie.pie(df_2.values, labels=df_2.index, autopct='%.1f%%', startangle=90)
ax_pie.set_title('지역별 매출')

fig.tight_layout()
plt.show()

In [None]:
# Sales by day of the week

import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('dataset/04_Q13.csv')

df['날짜'] = pd.to_datetime(df['날짜'])
df['수량'] = df['수량'].fillna(0)
df['매출'] = df['수량'] * df['가격']
df['요일'] = df['날짜'].dt.day_name()

# groupby sorts the day names alphabetically; put them back in calendar order
# and drop the days that have no rows in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sales_by_weekday = (
  df.groupby('요일')['매출'].sum()
  .reindex(weekday_order)
  .dropna()
)

# The result is kept under its own name so `df` stays a DataFrame (it used to
# be overwritten with the Axes object returned by plot()).
sales_by_weekday.plot(kind='bar', title='요일별 매출', xlabel='요일', ylabel='매출', rot=0)
# Bars sit at x = 0 .. n-1; keep 0.6 of padding on both sides for any number of
# weekdays (the old fixed limit assumed exactly 3).
plt.xlim(-0.6, len(sales_by_weekday) - 0.4)
plt.show()


In [None]:
# Top customers and products

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

# Read the CSV and treat a missing quantity as 0 units.
df = pd.read_csv('dataset/04_Q13.csv').fillna({'수량': 0})
df['매출'] = df['수량'].mul(df['가격'])

# Customers ranked by total sales, products ranked by total quantity.
sales_by_customer = df.groupby('고객명')['매출'].sum()
quantity_by_product = df.groupby('제품')['수량'].sum()
df_top_customer = sales_by_customer.sort_values(ascending=False)
df_top_product = quantity_by_product.sort_values(ascending=False)

print(f"TOP 고객:\n{df_top_customer}\n")
print(f"TOP 제품:\n{df_top_product}")

In [None]:
# Line chart of sales per date

import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('dataset/04_Q13.csv')

df['날짜'] = pd.to_datetime(df['날짜'])
df['수량'] = df['수량'].fillna(0)
df['매출'] = df['수량'] * df['가격']

# Kept under its own name so `df` stays the full DataFrame.
daily_sales = df.groupby('날짜')['매출'].sum()

# Title and y label added so the figure can be read on its own, matching the
# other charts in this notebook.
daily_sales.plot(kind='line', title='날짜별 매출 추이', ylabel='매출')
plt.show()

In [None]:
# Monthly sales for one region

import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('dataset/04_Q13.csv')

df['날짜'] = pd.to_datetime(df['날짜'])
df['수량'] = df['수량'].fillna(0)
df['매출'] = df['수량'] * df['가격']

# .copy() makes the filtered rows an independent frame; adding a column to a
# bare boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_seoul = df[df['지역'] == '서울'].copy()
df_seoul['월'] = df_seoul['날짜'].dt.month

monthly_sales = df_seoul.groupby('월')['매출'].sum()
monthly_sales.plot(kind='bar', title='서울 지역 월별 매출', ylabel='매출', rot=0)
plt.show()

In [None]:
# Save to MySQL (project_sales)

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

df = pd.read_csv('dataset/04_Q13.csv')

# Parsed date, zero-filled quantity, sales, weekday name and month number.
df['날짜'] = pd.to_datetime(df['날짜'])
df['수량'] = df['수량'].fillna(0)
df['매출'] = df['수량'].mul(df['가격'])
df['요일'] = df['날짜'].dt.day_name()
df['월'] = df['날짜'].dt.month

# An existing project_sales table is replaced; the index is not stored.
df.to_sql(name='project_sales', con=engine, if_exists='replace', index=False)
print("project_sales 테이블 저장 완료")

In [None]:
# Practical project extension - sales analysis dashboard
# Load the data from MySQL (project_sales)

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

# Whole project_sales table as a DataFrame.
df = pd.read_sql(sql='SELECT * FROM project_sales', con=engine)

display(df)

In [None]:
# Example of filtering by date and region

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

df = pd.read_sql(sql='SELECT * FROM project_sales', con=engine)

# Build each boolean mask once and reuse it for the combined filter.
on_20230101 = df['날짜'] == '2023-01-01'
in_seoul = df['지역'] == '서울'

df_20230101 = df[on_20230101]
df_seoul = df[in_seoul]
df_20230101_seoul = df[on_20230101 & in_seoul]

display(df_20230101)
display(df_seoul)
display(df_20230101_seoul)

In [None]:
# Summary metrics

import pandas as pd
from sqlalchemy import create_engine

# Local MySQL connection settings (schema: data_collection).
user, password = 'root', '1234'
host, port = '127.0.0.1', 3306
database = 'data_collection'

db_url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(db_url)

df = pd.read_sql(sql='SELECT * FROM project_sales', con=engine)

# Total of the sales column and the number of distinct customers / products.
total_sales = df['매출'].sum()
unique_customers, unique_products = df['고객명'].nunique(), df['제품'].nunique()

# A whole-number total is shown without the trailing ".0".
print(f"총 매출: {int(total_sales) if total_sales.is_integer() else total_sales}")
print(f"고객 수: {unique_customers}")
print(f"제품 수: {unique_products}")

In [None]:
# Visualization

import matplotlib.pyplot as plt
import matplotlib as mpl
from sqlalchemy import create_engine
import pandas as pd

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

user = 'root'
password = '1234'
host= '127.0.0.1'
port = 3306
database = 'data_collection'

engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}")

df = pd.read_sql('SELECT * FROM project_sales', engine)
display(df)

# Sales totals grouped by product, region, date and month.
df_1 = df.groupby('제품')['매출'].sum()
df_2 = df.groupby('지역')['매출'].sum()
df_3 = df.groupby('날짜')['매출'].sum()
df_4 = df.groupby('월')['매출'].sum()

# Sales by product
df_1.plot(kind='bar', title='제품별 매출', ylabel='매출', rot=0)
# Bars sit at x = 0 .. len(df_1)-1; keep 0.6 of padding on both sides for any
# number of products (the old fixed limit assumed exactly 3).
plt.xlim(-0.6, len(df_1) - 0.4)
plt.show()

# Sales share by region
df_2.plot(kind='pie', autopct='%.1f%%',title='지역별 매출 비중', startangle=90)
plt.ylabel('')  # empty string removes the y-axis label
plt.show()

# Sales by date
df_3.plot(kind='line', title='날짜별 매출 추이')
plt.show()

# Sales by month
df_4.plot(kind='bar', title='월별 매출 추이', rot=0)
plt.show()

In [None]:
# Practical analysis reports & automation
# Load the MySQL data and build the summary table

from sqlalchemy import create_engine
import pandas as pd

# This section connects through the mysql-connector driver.
db_url = "mysql+mysqlconnector://root:1234@localhost/data_collection"
engine = create_engine(db_url)
df = pd.read_sql(sql="SELECT * FROM project_sales", con=engine)

df['날짜'] = pd.to_datetime(df['날짜'])
df['월'] = df['날짜'].dt.month

# Named aggregation gives the final column names directly: total, mean and
# row count of sales per region.
summary = (
  df.groupby('지역')['매출']
  .agg(총매출='sum', 평균매출='mean', 건수='count')
  .reset_index()
)

display(summary)

In [None]:
# Save the Excel report automatically

from sqlalchemy import create_engine
import pandas as pd

# This section connects through the mysql-connector driver.
db_url = "mysql+mysqlconnector://root:1234@localhost/data_collection"
engine = create_engine(db_url)
df = pd.read_sql(sql="SELECT * FROM project_sales", con=engine)

df['날짜'] = pd.to_datetime(df['날짜'])
df['월'] = df['날짜'].dt.month

# Named aggregation gives the final column names directly: total, mean and
# row count of sales per region.
summary = (
  df.groupby('지역')['매출']
  .agg(총매출='sum', 평균매출='mean', 건수='count')
  .reset_index()
)

with pd.ExcelWriter("매출보고서.xlsx", engine='xlsxwriter') as writer:
  # Write the table to its own sheet, then fetch the xlsxwriter objects
  # needed to attach a chart to that sheet.
  summary.to_excel(writer, sheet_name="요약", index=False)
  workbook, worksheet = writer.book, writer.sheets["요약"]

  # Column chart: categories from column A (region), values from column B
  # (total sales); row 1 is the header, so the data starts at row 2.
  series_spec = {
    'name': '총매출',
    'values': f"='요약'!$B$2:$B${len(summary)+1}",
    'categories': f"='요약'!$A$2:$A${len(summary)+1}"
  }
  chart = workbook.add_chart({'type': 'column'})
  chart.add_series(series_spec)
  chart.set_title({'name': '지역별 총매출'})
  worksheet.insert_chart('F2', chart)

In [None]:
# Save the chart image used by the PDF report (지역별매출.png)

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

# Font used for the Korean labels; ASCII hyphen for minus signs.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

# This section connects through the mysql-connector driver.
db_url = "mysql+mysqlconnector://root:1234@localhost/data_collection"
engine = create_engine(db_url)
df = pd.read_sql(sql="SELECT * FROM project_sales", con=engine)

df['날짜'] = pd.to_datetime(df['날짜'])
df['월'] = df['날짜'].dt.month

# Draw on an explicit figure/axes pair, write the PNG, then show the figure.
fig, ax = plt.subplots(figsize=(8, 5))
region_sales = df.groupby("지역")["매출"].sum()
region_sales.plot(kind="bar", title="지역별 매출", ax=ax)
ax.set_ylabel("매출")
fig.tight_layout()
fig.savefig("지역별매출.png")
plt.show()

In [None]:
# Create the PDF report

from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

# Register a Korean-capable font (Malgun Gothic, loaded from the Windows font folder).
font_name = 'MalgunGothic'
pdfmetrics.registerFont(TTFont(font_name, 'C:/Windows/Fonts/malgun.ttf'))

# Create the PDF on an A4 page and select the registered font at 16 pt.
c = canvas.Canvas("매출보고서.pdf", pagesize=A4)
c.setFont(font_name, 16)

# Heading text, then the chart image below it.
c.drawString(50, 800, "매출 분석 보고서 (PDF)")
c.drawImage("지역별매출.png", 50, 450, width=500, height=300)

# Write the file.
c.save()

In [None]:
# Create the Word report

import pandas as pd
from docx import Document
from sqlalchemy import create_engine

# This section connects through the mysql-connector driver.
db_url = "mysql+mysqlconnector://root:1234@localhost/data_collection"
engine = create_engine(db_url)
df = pd.read_sql(sql="SELECT * FROM project_sales", con=engine)

df['날짜'] = pd.to_datetime(df['날짜'])
df['월'] = df['날짜'].dt.month

# Named aggregation gives the final column names directly: total, mean and
# row count of sales per region.
summary = (
  df.groupby('지역')['매출']
  .agg(총매출='sum', 평균매출='mean', 건수='count')
  .reset_index()
)

doc = Document()
doc.add_heading("매출 분석 보고서", 0)
doc.add_paragraph("지역별 매출 요약:")

# One header row with four columns, filled from the label list.
header_labels = ["지역", "총매출", "평균매출", "건수"]
table = doc.add_table(rows=1, cols=4)
for header_cell, label in zip(table.rows[0].cells, header_labels):
  header_cell.text = label

# One table row per summary row; every value is written as text.
for _, row in summary.iterrows():
  row_cells = table.add_row().cells
  for cell, val in zip(row_cells, row):
    cell.text = str(val)

doc.save("매출보고서.docx")