# 29. Capstone Project - pandas를 활용한 Web Scraping + 결과 파일 이메일 전송

### 프로야구 연도별 순위를 정리하여 이메일로 전송

- pandas read_html() 함수를 사용하여 데이터를 쉽게 웹 스크래핑  

- HTML 문서의 `<table>` 태그(요소)에 담긴 값을 가져올 수 있다. 즉, 웹페이지에 있는 표를 DataFrame으로 불러온다는 의미이다.

In [1]:
import pandas as pd
import urllib.request as req
from IPython.display import display_html

import numpy as np
from matplotlib import pyplot as plt

## 1. string 으로부터 HTML table 읽어 들이기

<img src="https://miro.medium.com/max/875/1*NtL7NT395fXaCfq8_lNhjg.png" width=400/>

HTML 문서의 `<table>` 태그(요소)에 담긴 값을 가져올 수 있다. 즉, 웹페이지에 있는 표를 DataFrame으로 불러온다는 의미이다.

In [2]:
# Sample HTML fragment holding a single <table>: one header row inside <thead>
# (date, name, year, cost, region) and three data rows inside <tbody>.
html_string = """
<table>
  <thead>
    <tr>
      <th>date</th>
      <th>name</th>
      <th>year</th>
      <th>cost</th>
      <th>region</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>2020-01-01</td>
      <td>Jenny</td>
      <td>1998</td>
      <td>0.2</td>
      <td>South</td>
    </tr>
    <tr>
      <td>2020-01-02</td>
      <td>Alice</td>
      <td>1992</td>
      <td>-1.34</td>
      <td>East</td>
    </tr>
    <tr>
      <td>2020-01-03</td>
      <td>Tomas</td>
      <td>1982</td>
      <td>1.00023</td>
      <td>South</td>
    </tr>
  </tbody>
</table>
"""

# Render the string as HTML in the notebook output.
# raw=True tells IPython the argument is already HTML data, not an object
# that still needs to be formatted.
display_html(html_string, raw=True)

date,name,year,cost,region
2020-01-01,Jenny,1998,0.2,South
2020-01-02,Alice,1992,-1.34,East
2020-01-03,Tomas,1982,1.00023,South


- 문자열로부터 table 정보 읽기

In [3]:
from io import StringIO

# Parse every <table> found in the HTML text into a list of DataFrames.
# The text is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1; a file-like object is accepted
# by old and new pandas versions alike.
dfs = pd.read_html(StringIO(html_string))
dfs

[         date   name  year     cost region
 0  2020-01-01  Jenny  1998  0.20000  South
 1  2020-01-02  Alice  1992 -1.34000   East
 2  2020-01-03  Tomas  1982  1.00023  South]

In [4]:
# read_html returns a list of DataFrames; check the container type and how
# many tables were parsed.
type(dfs), len(dfs)

(list, 1)

In [5]:
# The first (and here only) parsed table.
dfs[0]

Unnamed: 0,date,name,year,cost,region
0,2020-01-01,Jenny,1998,0.2,South
1,2020-01-02,Alice,1992,-1.34,East
2,2020-01-03,Tomas,1982,1.00023,South


In [6]:
# Each element of the returned list is a regular pandas DataFrame.
type(dfs[0])

pandas.core.frame.DataFrame

In [7]:
# Print the column names, non-null counts and the dtypes pandas inferred
# while parsing the table.
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    3 non-null      object 
 1   name    3 non-null      object 
 2   year    3 non-null      int64  
 3   cost    3 non-null      float64
 4   region  3 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 248.0+ bytes


## 2. URL로 부터 table 읽어들이기

In [8]:
from io import StringIO

url = "http://localhost:5500/babynames/baby1990.html"

# Open the response as a context manager so the underlying connection is
# closed as soon as the body has been read.
with req.urlopen(url) as res:
    html_text = res.read().decode('utf-8')

# Wrap the decoded text in StringIO: passing a literal HTML string to
# read_html is deprecated since pandas 2.1.
dfs = pd.read_html(StringIO(html_text))

In [9]:
# Report how many tables read_html found on the page.
print(f'읽어들인 총 tables: {len(dfs)}')

읽어들인 총 tables: 4


In [10]:
# Preview the table at index 2, which holds the Rank / Male name /
# Female name columns.
dfs[2].head()

Unnamed: 0,Rank,Male name,Female name
0,1,Michael,Jessica
1,2,Christopher,Ashley
2,3,Matthew,Brittany
3,4,Joshua,Amanda
4,5,Daniel,Samantha


## 3. html file 로부터 table 읽기

In [11]:
from io import StringIO

file_path = '../babynames/baby1990.html'

# Pass the encoding explicitly instead of relying on the platform default
# (e.g. cp949 on Korean Windows).
# NOTE(review): utf-8 is assumed because the same page is decoded as utf-8
# when fetched over HTTP elsewhere in this notebook -- confirm for this file.
with open(file_path, 'r', encoding='utf-8') as f:
    # Wrap the text in StringIO: passing a literal HTML string to read_html
    # is deprecated since pandas 2.1.
    dfs = pd.read_html(StringIO(f.read()))

In [12]:
# Preview the first rows of the table at index 2 (Rank / Male name /
# Female name).
dfs[2].head()

Unnamed: 0,Rank,Male name,Female name
0,1,Michael,Jessica
1,2,Christopher,Ashley
2,3,Matthew,Brittany
3,4,Joshua,Amanda
4,5,Daniel,Samantha


In [13]:
# Inspect the last rows; the final row is a "Note: ..." footnote repeated
# in every column rather than a real record.
dfs[2].tail()

Unnamed: 0,Rank,Male name,Female name
996,997,Eliezer,Asha
997,998,Jory,Jada
998,999,Misael,Leila
999,1000,Tate,Peggy
1000,"Note: Rank 1 is the most popular, rank 2 is th...","Note: Rank 1 is the most popular, rank 2 is th...","Note: Rank 1 is the most popular, rank 2 is th..."


- 마지막 record 제거

In [14]:
# Keep every row except the last one, i.e. drop the trailing footnote row.
dfs[2] = dfs[2].iloc[:-1]

In [15]:
# Check the tail again after dropping the last row.
dfs[2].tail()

Unnamed: 0,Rank,Male name,Female name
995,996,Brittany,Annemarie
996,997,Eliezer,Asha
997,998,Jory,Jada
998,999,Misael,Leila
999,1000,Tate,Peggy


## Naver 재무 데이터 불러오기 

In [16]:
# Naver Finance item page for stock code 035720 (Kakao financial statements).
url = 'https://finance.naver.com/item/main.nhn?code=035720'  # Kakao financial statements

# The page is decoded as EUC-KR; read_html fetches the URL itself and
# returns one DataFrame per table found.
dfs = pd.read_html(url, encoding='euc-kr')
# Number of tables parsed from the page.
len(dfs)

13

In [17]:
# Table at index 3 is the key financial summary (annual and quarterly
# results) shown in the output.
# NOTE(review): the index is positional and may shift if the page layout
# changes -- confirm before relying on it.
df = dfs[3]
df.head()

Unnamed: 0_level_0,주요재무정보,최근 연간 실적,최근 연간 실적,최근 연간 실적,최근 연간 실적,최근 분기 실적,최근 분기 실적,최근 분기 실적,최근 분기 실적,최근 분기 실적,최근 분기 실적
Unnamed: 0_level_1,주요재무정보,2019.12,2020.12,2021.12,2022.12(E),2021.06,2021.09,2021.12,2022.03,2022.06,2022.09(E)
Unnamed: 0_level_2,주요재무정보,IFRS연결,IFRS연결,IFRS연결,IFRS연결,IFRS연결,IFRS연결,IFRS연결,IFRS연결,IFRS연결,IFRS연결
0,매출액,30701.0,41568.0,61367.0,75781.0,13522.0,17408.0,17857.0,16517.0,18223.0,19533.0
1,영업이익,2068.0,4559.0,5949.0,7441.0,1626.0,1682.0,1066.0,1587.0,1710.0,1992.0
2,당기순이익,-3419.0,1734.0,16462.0,18346.0,3159.0,8663.0,2241.0,13221.0,1012.0,1972.0
3,영업이익률,6.73,10.97,9.69,9.82,12.03,9.66,5.97,9.61,9.38,10.2
4,순이익률,-11.14,4.17,26.82,24.21,23.36,49.76,12.55,80.05,5.55,10.1


읽어들인 table data 를 csv file 로 저장. Excel 로 확인.

In [18]:
# Save the table as CSV encoded in EUC-KR.
# NOTE(review): the encoding is presumably chosen so Excel on a Korean
# locale opens the file without garbled text -- confirm.
df.to_csv('naver_finance_카카오.csv', encoding='euc-kr')

## Pandas 를 이용한 간단한 통계 정보 작성 및 시각화
- 2015-2022 시즌의 프로야구 **연도별 순위표** 작성

In [19]:
# KBO record page on Naver Sports (no year parameter given).
url = "https://sports.news.naver.com/kbaseball/record/index?category=kbo"
# read_html fetches the URL and returns one DataFrame per table on the page.
dfs = pd.read_html(url)
# The first table is the team standings table.
dfs[0]

Unnamed: 0,순위,팀,경기수,승,패,무,승률,게임차,연속,출루율,장타율,최근 10경기
0,1,SSG,119,77,39,3,0.664,0.0,1패,0.336,0.392,4승-6패-0무
1,2,LG,114,71,42,1,0.628,4.5,6승,0.348,0.41,7승-3패-0무
2,3,키움,122,69,51,2,0.575,10.0,5승,0.335,0.366,8승-2패-0무
3,4,KT,118,65,51,2,0.56,12.0,1승,0.332,0.374,6승-4패-0무
4,5,KIA,117,58,58,1,0.5,19.0,1패,0.35,0.4,5승-5패-0무
5,6,롯데,120,53,63,4,0.457,24.0,1승,0.325,0.379,5승-5패-0무
6,7,NC,114,49,62,3,0.441,25.5,2패,0.332,0.374,3승-7패-0무
7,8,삼성,118,50,66,2,0.431,27.0,1승,0.328,0.369,6승-4패-0무
8,9,두산,115,48,65,2,0.425,27.5,2패,0.322,0.351,2승-8패-0무
9,10,한화,117,36,79,2,0.313,40.5,1승,0.32,0.358,4승-6패-0무


- 연도를 제외한 공통 url부분을 변수로 선언

In [20]:
# Common URL prefix for the standings page; the season year is appended to
# the trailing "year=" query parameter.
url = "https://sports.news.naver.com/kbaseball/record/index?category=kbo&year="

- 2015~2022 사이의 팀 순위 읽어 오기

In [21]:
# Download the standings table for each season from 2015 through 2022.
# The per-season frames are collected in a list and concatenated once at the
# end: calling pd.concat inside the loop recopies all accumulated rows on
# every pass, and seeding it with an empty DataFrame triggers a FutureWarning
# on recent pandas versions.
season_frames = []

for year in range(2015, 2023):
    # The first table on each season page is the team standings table.
    season_df = pd.read_html(url + str(year))[0]
    season_df["연도"] = str(year)   # tag every row with its season (as a string)
    season_frames.append(season_df)

# Each season keeps its own 0..n-1 row index, exactly as before.
baseball_df = pd.concat(season_frames)

baseball_df.shape

(70, 13)

In [22]:
baseball_df

Unnamed: 0,순위,팀,경기수,승,패,무,승률,게임차,연속,출루율,장타율,최근 10경기,연도
0,1,두산,144,79,65,0,0.549,0.0,2승,0.370,0.435,7승-3패-0무,2015
1,2,삼성,144,88,56,0,0.611,-9.0,3승,0.378,0.469,4승-6패-0무,2015
2,3,NC,144,84,57,3,0.596,-6.5,1패,0.367,0.455,5승-4패-1무,2015
3,4,넥센,144,78,65,1,0.545,0.5,1패,0.372,0.486,4승-6패-0무,2015
4,5,SK,144,69,73,2,0.486,9.0,1승,0.349,0.410,6승-4패-0무,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,6,SSG,144,66,64,14,0.508,7.5,1패,0.353,0.421,4승-4패-2무,2021
6,7,NC,144,67,68,9,0.496,9.0,1패,0.343,0.416,4승-5패-1무,2021
7,8,롯데,144,65,71,8,0.478,11.5,1승,0.356,0.399,4승-4패-2무,2021
8,9,KIA,144,58,76,10,0.433,17.5,2패,0.337,0.336,5승-5패-0무,2021


- 2015-2022 사이에 팀명이 바뀐 팀들의 팀명 수정

In [25]:
# Map old or differently-cased team names onto the current name so that each
# franchise appears under a single label across all seasons.
team_renames = {
    "kt": "KT",
    "SK": "SSG",
    "넥센": "키움",
}
baseball_df = baseball_df.replace(team_renames)

- 연도를 index 로 하고 column에 팀명을 배치한 pivot table  작성

In [26]:
# Reshape to one row per season and one column per team; each cell holds
# that team's final rank for the season.
annual_rank = baseball_df.pivot(index="연도", columns="팀", values="순위")
annual_rank

팀,KIA,KT,LG,NC,SSG,두산,롯데,삼성,키움,한화
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015,7,10,9,3,5,1,8,2,4,6
2016,5,10,4,2,6,1,8,9,3,7
2017,1,10,6,4,5,2,3,9,7,8
2018,5,9,8,10,1,2,7,6,4,3
2019,7,6,4,5,3,1,10,8,2,9
2020,6,3,4,1,9,2,7,8,5,10
2021,9,1,4,7,6,2,8,3,5,10


- 연도를 index 로 하고 column에 순위를 배치한 pivot table  작성

In [27]:
# Reshape to one row per season and one column per rank; each cell holds
# the name of the team that finished at that rank.
yearly_baseball_rank = baseball_df.pivot(index="연도", columns="순위", values="팀")
yearly_baseball_rank

순위,1,2,3,4,5,6,7,8,9,10
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015,두산,삼성,NC,키움,SSG,한화,KIA,롯데,LG,KT
2016,두산,NC,키움,LG,KIA,SSG,한화,롯데,삼성,KT
2017,KIA,두산,롯데,NC,SSG,LG,키움,한화,삼성,KT
2018,SSG,두산,한화,키움,KIA,삼성,롯데,LG,KT,NC
2019,두산,키움,SSG,LG,NC,KT,KIA,삼성,한화,롯데
2020,NC,두산,KT,LG,키움,KIA,롯데,삼성,SSG,한화
2021,KT,두산,삼성,LG,키움,SSG,NC,롯데,KIA,한화


2015-2021 시즌 사이에 3 위 이내에 가장 많이 들어간 팀 순위

In [33]:
# .loc slices by label and includes the end label, so ":3" selects the
# rank columns 1, 2 and 3; .values gives the team names as a 2-D array
# (one row per season).
yearly_baseball_rank.loc[:, :3].values

array([['두산', '삼성', 'NC'],
       ['두산', 'NC', '키움'],
       ['KIA', '두산', '롯데'],
       ['SSG', '두산', '한화'],
       ['두산', '키움', 'SSG'],
       ['NC', '두산', 'KT'],
       ['KT', '두산', '삼성']], dtype=object)

In [34]:
from collections import Counter

# Tally how many times each team finished in the top 3.
# ravel() flattens the (season x rank) array row by row, so teams are seen in
# the same order as a nested loop over seasons and then ranks.
count = Counter(yearly_baseball_rank.loc[:, :3].values.ravel())

# List of (team, appearances) pairs, most frequent first; teams with equal
# counts stay in first-seen order.
count.most_common()

[('두산', 7),
 ('NC', 3),
 ('삼성', 2),
 ('키움', 2),
 ('SSG', 2),
 ('KT', 2),
 ('KIA', 1),
 ('롯데', 1),
 ('한화', 1)]

In [39]:
# Save the season-by-rank table as CSV encoded in cp949; this file is the
# attachment sent by the email cell.
yearly_baseball_rank.to_csv("yearly-team-ranking.csv", encoding="cp949")

### SMTP mail sender 를 이용하여 첨부 파일로 발송

In [40]:
from email_sender import send_email

# Message subject and body (runtime text, kept as-is).
subject = '연도별 팀 순위 현황'
body = '첨부된 파일을 확인해 주세요.\n두산이 가장 강팀으로 보입니다.'

# Paths of the files to attach to the message.
files = ['yearly-team-ranking.csv']

# send_email comes from the project-local email_sender module.
send_email(subject, body, files)

성공적으로 메일 발송
