## 爬取單日大盤統計資訊
### [收盤指數資訊 - 價格指數(臺灣證券交易所)](https://www.twse.com.tw/zh/page/trading/exchange/MI_INDEX.html)

In [None]:
# Silence all warnings so that notices caused by library-version differences
# do not clutter the notebook output.
import warnings
warnings.simplefilter('ignore')

In [None]:
import requests
import json
import pandas as pd
import numpy as np

In [None]:
# Query-string parameters for the MI_INDEX report endpoint.
data = {
    'response': 'json',    # response format; the body is parsed as JSON further down
    'date': '20210607',    # day to fetch, formatted YYYYMMDD
    'type': 'IND',         # NOTE(review): presumably selects the index table - confirm against the site
}

# Pass the dict explicitly as `params` so it is encoded into the URL, and set a
# timeout: without one, requests may wait indefinitely on an unresponsive server.
res = requests.get(
    'http://www.tse.com.tw/exchangeReport/MI_INDEX',
    params=data,
    timeout=30,
)

# Fail here on an HTTP error (4xx/5xx) rather than later while parsing an error page.
res.raise_for_status()

print(res.text)

In [None]:
# Parse the response body (JSON text) into Python objects.
jres = json.loads(res.text)

# Echo the parsed payload to inspect its structure.
jres

In [None]:
jres['stat']    # a reply of 'OK' means the market was open on that day

In [None]:
# Rows of the table stored under 'data1'; its column names are under 'fields1'.
jres['data1']

In [None]:
# Turn the 'data1' rows into a DataFrame, using 'fields1' as the header.
df_temp = pd.DataFrame(data=jres['data1'], columns=jres['fields1'])

# Echo the single-day table.
df_temp

## 時間序列處理
### 注意：星期一是 0 到 星期日是 6

In [None]:
from datetime import datetime

# weekday() numbers the days Monday=0 through Sunday=6;
# 2021-06-07 is a Monday, so this evaluates to 0.
datetime(year=2021, month=6, day=7).weekday()

### timedelta
#### 參數可放：days(第一個位置參數), seconds, microseconds, milliseconds, minutes, hours, weeks（注意：timedelta 不支援 months）

In [None]:
from datetime import timedelta

# A duration of exactly one day. `days` is also the first positional
# parameter, so timedelta(1) means the same thing.
timedelta(days = 1)

In [None]:
# Subtracting a timedelta from a datetime gives the earlier datetime (here 2021-06-06).
datetime(2021, 6, 7) - timedelta(1)

### datetime轉string: strftime(datetime, format)

In [None]:
# Format a datetime as a compact YYYYMMDD string (the bound-method form of
# datetime.strftime(dt, fmt)); evaluates to '20210607'.
datetime(2021, 6, 7).strftime('%Y%m%d')

## 創建儲存爬取多日大盤指數的DataFrame

In [None]:
# Column layout of the multi-day table: one column per entry of the
# single-day table's '指數' column, followed by a trailing 'date' column.
column_list = [*df_temp['指數'], 'date']

# Start with an empty frame; rows are appended by the crawl loop.
df = pd.DataFrame(columns=column_list)

df

## 爬取多日證交所大盤指數
### 注意：使用 time.sleep() 避免頻繁爬取被拒絕存取

In [None]:
import time
import random

# Reference date for the crawl. The crawl loop steps one day back before each
# request, so the first day actually fetched is the day before this date.
crawl_date = datetime(year=2021, month=6, day=7)

crawl_date

In [None]:
# User-Agent header sent with every request (a desktop-browser string).
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

# Walk backwards one calendar day at a time for 30 days, starting with the day
# before `crawl_date`, and append one row to `df` for each day that has data.
for _ in range(30):

    crawl_date -= timedelta(1)

    crawl_date_str = crawl_date.strftime('%Y%m%d')

    try:
        # Let requests build the query string, and bound the wait with a
        # timeout: without one a stalled connection would hang the crawl.
        res = requests.get(
            'http://www.tse.com.tw/exchangeReport/MI_INDEX',
            params={'response': 'json', 'date': crawl_date_str, 'type': 'IND'},
            headers=headers,
            timeout=30,
        )
        res.raise_for_status()
        jres = json.loads(res.text)

    except (requests.RequestException, ValueError) as err:
        # Network failure, HTTP error status, or a body that is not valid JSON
        # (json.JSONDecodeError is a ValueError). Report it and move on so a
        # single bad day does not abort the remaining days.
        print(crawl_date_str, ': request failed -', err)

    else:
        # The exchange replied with data for this day.
        if jres.get('stat') == 'OK':

            print(crawl_date_str, ': crawling data...')

            # Convert the returned JSON table into a DataFrame.
            df_temp = pd.DataFrame(jres['data1'], columns = jres['fields1'])

            # Take the '漲跌百分比(%)' column of the day's table and store it as one
            # row of the combined DataFrame, with the date string appended last.
            # NOTE: values are matched to df's columns by position, so this
            # relies on every day's table listing the same rows in the same order.
            row_data = list(df_temp['漲跌百分比(%)'])

            row_data.append(crawl_date_str)

            df.loc[len(df)] = row_data

        else:

            print(crawl_date_str, ': no data')

    # Sleep a few seconds (5-10 s or longer is recommended) before fetching the
    # next day, so that frequent requests do not get this IP blocked by the exchange.
    time.sleep(random.uniform(5, 10))

### 修改pandas顯示設定

In [None]:
# Show up to 80 columns / 80 rows before pandas truncates the output.
# Use the exact registered option names: the previous spellings with a dot
# ('display.max.columns') only worked because pandas falls back to regex
# matching, where '.' happens to match the underscore.
pd.set_option('display.max_columns', 80)

pd.set_option('display.max_rows', 80)

In [None]:
# Show the collected table: one row per crawled day that returned data.
df

## 資料預處理
### 取代

In [None]:
# Inspect the column dtypes before cleaning.
# NOTE(review): presumably all object (strings taken from the JSON) - confirm.
df.dtypes

In [None]:
# Swap the '--' placeholder for 0 so the later conversion to float succeeds.
# NOTE(review): if '--' means "no value", NaN may be a better stand-in than 0,
# which counts as a real 0% change in the statistics - confirm intent.
df = df.replace(to_replace='--', value=0)

In [None]:
# Show the table again after the '--' placeholders were replaced.
df

### 時間序列處理：datetime_index

In [None]:
# Use the 'date' values as the row index. Because a Series (not a column
# name) is passed as the key, the 'date' column itself stays in the frame;
# it is deleted in a later cell.
df = df.set_index(keys=df['date'], drop=True)

df.head()

In [None]:
# Check the index class before converting it to datetimes.
type(df.index)

In [None]:
# Parse the YYYYMMDD strings in the index into datetimes.
df.index = pd.to_datetime(df.index, format = '%Y%m%d')

# Check the index class again after the conversion.
type(df.index)

In [None]:
# The dates now live in the index, so remove the redundant 'date' column (in place).
del df['date']

In [None]:
# Show the table, now indexed by date and without the 'date' column.
df

### 資料型態轉換

In [None]:
# Column dtypes before the numeric conversion.
df.dtypes

In [None]:
# Convert every column to float so the values can be plotted and analysed numerically.
df = df.astype(float)

# Confirm the dtypes after the conversion.
df.dtypes

## 視覺化

In [None]:
# Plotting setup: import matplotlib and configure fonts so that Chinese
# labels and minus signs render in the figures.
import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
from matplotlib.font_manager import FontProperties # font handling for Chinese text

#plt.rcParams['font.family'] = 'SimHei' # display Chinese ('SimHei' for MacOS)
#plt.rcParams['font.family'] = 'DFKai-SB' # display Chinese (for Win10)
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly

plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei']  # step 1: replace the sans-serif font

# Render figures inline in the Jupyter Notebook
%matplotlib inline

In [None]:
# Line plot of the '寶島股價指數' column over the date index.
df['寶島股價指數'].plot(figsize = (10, 8))

In [None]:
# Label slices in .loc include both endpoints, so this plots every column
# from '水泥窯製類指數' through '油電燃氣類指數' as separate lines.
df.loc[:, '水泥窯製類指數':'油電燃氣類指數'].plot(figsize = (20, 16))

In [None]:
# Inspect the date index (rows were appended while crawling backwards in time).
df.index

In [None]:
# Grouped bar chart of the first 7 rows and first 5 columns. The crawl walked
# backwards in time, so the first rows are the most recent days collected.
df.iloc[:7, :5].plot.bar(figsize=(20, 16))

In [None]:
# Scatter plot of two columns against each other, one point per day.
df.plot.scatter(x='半導體類指數', y='光電類指數', figsize=(10, 8))

## 相關性分析

In [None]:
# Pairwise correlation between all columns of the table.
df.corr()

In [None]:
# Column range '水泥窯製類指數' through '油電燃氣類指數' (both endpoints included).
df.loc[:, '水泥窯製類指數':'油電燃氣類指數']

In [None]:
# Correlation matrix restricted to that column range.
df.loc[:, '水泥窯製類指數':'油電燃氣類指數'].corr()

In [None]:
# Keep the correlation matrix of that column range for the heatmap.
corr = df.loc[:, '水泥窯製類指數':'油電燃氣類指數'].corr()

### Seaborn: https://seaborn.pydata.org/

In [None]:
import seaborn as sns

# Large square canvas so the annotated cells stay readable.
plt.figure(figsize = (20, 20))

# Heatmap of the correlation matrix: square cells, with each value printed in its cell.
sns.heatmap(corr, square = True, annot = True)

plt.show()

## 異常值偵測
### [數據可視化圖表——箱形圖](https://kknews.cc/tech/9n58a58.html)

In [None]:
# One box per column in the range, to spot unusually large daily changes.
df.loc[:,'水泥窯製類指數':'油電燃氣類指數'].boxplot(figsize = (20, 10), rot = 90, fontsize = 16) # rot: rotation angle of the x-axis labels

In [None]:
# Rows (days) where the '汽車類指數' column is below -2. The table holds the
# '漲跌百分比(%)' values, so these are days with a change below -2%.
df[df['汽車類指數'] < -2]

## 統計分析

In [None]:
# Subset of columns from '水泥窯製類指數' through '油電燃氣類指數' (inclusive).
df_stock_index = df.loc[:, '水泥窯製類指數':'油電燃氣類指數']

# Summary statistics (count, mean, std, min, quartiles, max) per column.
df_stock_index.describe()

In [None]:
# Mean of each column over the crawled days.
df_stock_index.mean()

In [None]:
# Bar chart of the per-column means.
df_stock_index.mean().plot.bar(figsize=(15, 8), fontsize=16)