In [1]:
import requests
import pandas as pd
import json
from dotenv import load_dotenv
import os
from datetime import datetime
import re
from sqlalchemy import create_engine, exc
from zoneinfo import ZoneInfo
from io import StringIO
from google.cloud import storage


# mrt_usage_history
# Fetch the CSV index that lists a download URL for every month of data.
# Each URL in that index returns exactly one month's data.


def E_mrt_usage_history_csvfilelist():
    """Download the index CSV that lists one download URL per record.

    The response text is parsed by hand: the first non-blank line is the
    header, every following non-blank line is one record, and the first
    field of every line is dropped. All values are kept as strings.

    Returns:
        pd.DataFrame: one row per record, columns taken from the header
        (minus its first field), indexed 0..n-1.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the response body contains no lines at all.
    """
    url = "https://data.taipei/api/dataset/63f31c7e-7fc3-418b-bd82-b95158755b4d/resource/eb481f58-1238-4cff-8caa-fa7bb20cb4f4/download"
    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(url=url, timeout=60)
    response.raise_for_status()

    # splitlines() handles \r\n, \n and \r endings alike; blank lines (for
    # example the one produced by a trailing newline) are skipped.
    lines = [line for line in response.text.splitlines() if line.strip()]
    if not lines:
        raise ValueError("csv file list response is empty")

    col_name = lines[0].split(",")
    # Drop the first field of every record, exactly like the header.
    rows = [line.split(",")[1:] for line in lines[1:]]
    # Build the frame once instead of concatenating one frame per row.
    url_df = pd.DataFrame(rows, columns=col_name[1:])
    print("E_mrt_usage_history_csvfilelist finished")
    return (url_df)


# def T_mrt_usage_history_one_month_apply_reduce(url):
#     response = requests.get(url=url)
#     StringIO_df = StringIO(response.content.decode("utf-8-sig"))
#     df = pd.read_csv(StringIO_df)
#     pattern = re.compile(r"[A-Za-z]+")
#     df["進站"] = df["進站"].str.replace(pattern, "", regex=True)
#     df["出站"] = df["出站"].str.replace(pattern, "", regex=True)
#     df_enter = pd.DataFrame(df.groupby(["日期", "時段", "進站"])[
#                             "人次"].sum()).reset_index(drop=False)
#     df_out = pd.DataFrame(df.groupby(["日期", "時段", "出站"])[
#         "人次"].sum()).reset_index(drop=False)
#     df_enter.rename(columns={
#         "日期": "date",
#         "時段": "hour",
#         "進站": "mrt_station_name",
#         "人次": "enter_count"
#     }, inplace=True)

#     df_out.rename(columns={
#         "日期": "date",
#         "時段": "hour",
#         "出站": "mrt_station_name",
#         "人次": "exit_count"
#     }, inplace=True)
#     df = df_enter.merge(df_out,
#                         left_on=["date", "hour", "mrt_station_name"],
#                         right_on=["date", "hour", "mrt_station_name"],
#                         how="outer")
#     print("T_mrt_usage_history_one_month finished")
#     return (df)


def T_mrt_usage_history_one_month(url: str):
    """Download one CSV file and return it cleaned, with English column names.

    The body is decoded as UTF-8 (a leading BOM is dropped), ASCII letters
    are stripped from the two station-name columns, and the five source
    columns are renamed to ``date``, ``hour``, ``mrt_station_name_enter``,
    ``mrt_station_name_exit`` and ``visitors_num``.

    Args:
        url: download link of the CSV file.

    Returns:
        pd.DataFrame: the cleaned table.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the file lacks one of the expected source columns.
    """
    column_map = {
        "日期": "date",
        "時段": "hour",
        "進站": "mrt_station_name_enter",
        "出站": "mrt_station_name_exit",
        "人次": "visitors_num"
    }
    # Fail on the HTTP error itself rather than on a confusing parse or
    # column error caused by reading an error page as CSV.
    response = requests.get(url=url, timeout=120)
    response.raise_for_status()
    StringIO_df = StringIO(response.content.decode("utf-8-sig"))
    df = pd.read_csv(StringIO_df)

    missing = [col for col in column_map if col not in df.columns]
    if missing:
        raise KeyError(
            f"missing expected columns {missing}; got {list(df.columns)}")

    # Remove every run of ASCII letters from the station names.
    pattern = re.compile(r"[A-Za-z]+")
    for station_col in ("進站", "出站"):
        df[station_col] = df[station_col].str.replace(
            pattern, "", regex=True)

    df = df.rename(columns=column_map)
    print("T_mrt_usage_history finished")
    return (df)


def T_mrt_usage_history_one_month_recuce(df: pd.DataFrame):
    """Collapse origin/destination rows into per-station hourly totals.

    ``visitors_num`` is summed twice: once per (date, hour, entry station)
    and once per (date, hour, exit station). The two sets of totals are
    outer-joined on date, hour and station name, so a station that shows up
    on only one side keeps a missing value for the other side.

    Args:
        df: frame with columns ``date``, ``hour``,
            ``mrt_station_name_enter``, ``mrt_station_name_exit`` and
            ``visitors_num``.

    Returns:
        pd.DataFrame with columns ``date``, ``hour``, ``mrt_station_name``,
        ``visitors_num_enter`` and ``visitors_num_exit``.
    """
    enter_keys = ["date", "hour", "mrt_station_name_enter"]
    exit_keys = ["date", "hour", "mrt_station_name_exit"]

    enter_totals = df.groupby(enter_keys)["visitors_num"].sum().reset_index()
    exit_totals = df.groupby(exit_keys)["visitors_num"].sum().reset_index()

    merged = pd.merge(
        enter_totals,
        exit_totals,
        how="outer",
        left_on=enter_keys,
        right_on=exit_keys,
        suffixes=["_enter", "_exit"],
    )

    # Take the exit-side station name, falling back to the entry-side one.
    station_name = merged["mrt_station_name_exit"].combine_first(
        merged["mrt_station_name_enter"])
    merged["mrt_station_name"] = station_name

    wanted_columns = [
        "date",
        "hour",
        "mrt_station_name",
        "visitors_num_enter",
        "visitors_num_exit",
    ]
    return merged.loc[:, wanted_columns]


def L_mrt_usage_history(df: pd.DataFrame):
    """Append ``df`` to the ``mrt_usage_history`` table in MySQL.

    Connects to database ``group2_db`` on ``localhost:3306`` through
    ``mysql+pymysql``; the user name and password are read from the
    environment variables ``ANDY_USERNAME_SQL`` and ``ANDY_PASSWORD_SQL``.

    Args:
        df: rows to append; written without the index.

    Returns:
        The string ``"L_mrt_usage_history finished"`` on success, ``None``
        when credentials are missing or the load fails (the reason is
        printed).
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    username_sql = os.getenv("ANDY_USERNAME_SQL")
    password_sql = os.getenv("ANDY_PASSWORD_SQL")
    if not username_sql or password_sql is None:
        # Otherwise the literal text "None" would be sent as the credentials.
        print("loading to sql fail: ANDY_USERNAME_SQL / ANDY_PASSWORD_SQL is not set")
        return None

    # server = "host.docker.internal:3306"  # for Docker
    server = "localhost:3306"
    db_name = "group2_db"
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    db_url = (
        f"mysql+pymysql://{quote(username_sql, safe='')}:"
        f"{quote(password_sql, safe='')}@{server}/{db_name}"
    )

    engine = None
    try:
        engine = create_engine(db_url)
        with engine.connect() as conn:
            df.to_sql(
                name="mrt_usage_history",
                con=conn,
                if_exists="append",
                index=False
            )
    except Exception as err:
        # Still best-effort, but report the cause, and no longer swallow
        # KeyboardInterrupt / SystemExit.
        print(f"loading to sql fail: {type(err).__name__}: {err}")
        return None
    finally:
        # Release the connection pool created for this single load.
        if engine is not None:
            engine.dispose()

    print("L_mrt_usage_history finished")
    return ("L_mrt_usage_history finished")





In [2]:
# Fetch the index table of download links; the loop cell further down reads
# its "年月" and "URL" columns.
url_df = E_mrt_usage_history_csvfilelist()

E_mrt_usage_history_csvfilelist finished


In [7]:
# Display the index table of download links for a visual check.
url_df

Unnamed: 0,年月,URL
0,201701,http://tcgmetro.blob.core.windows.net/stationo...
1,201702,http://tcgmetro.blob.core.windows.net/stationo...
2,201703,http://tcgmetro.blob.core.windows.net/stationo...
3,201704,http://tcgmetro.blob.core.windows.net/stationo...
4,201705,http://tcgmetro.blob.core.windows.net/stationo...
...,...,...
82,202311,http://tcgmetro.blob.core.windows.net/stationo...
83,202312,http://tcgmetro.blob.core.windows.net/stationo...
84,202401,http://tcgmetro.blob.core.windows.net/stationo...
85,202402,http://tcgmetro.blob.core.windows.net/stationo...


In [8]:
# url_df = E_mrt_usage_history_csvfilelist()
# for i in range(0, 2):
# For every row of url_df: download that month's CSV, save the cleaned full
# table, then save the per-station hourly aggregate. A failing month is
# reported and skipped so the remaining months are still processed.

# NOTE(review): absolute, machine-specific path; consider moving it to a
# configurable constant.
file_save_path = r"C:\TIR101_Group2\DA\data\MRT\mrt_usage_history"
full_dir = os.path.join(file_save_path, "full")
reduce_dir = os.path.join(file_save_path, "reduce")
# Make sure both output folders exist before the first write.
os.makedirs(full_dir, exist_ok=True)
os.makedirs(reduce_dir, exist_ok=True)

for month, url in zip(url_df["年月"], url_df["URL"]):
    filename_full = f"{month}_full_mrt_usage_history.csv"
    filename_reduce = f"{month}_reduce_mrt_usage_history.csv"
    filename_full_save_path = os.path.join(full_dir, filename_full)
    filename_reduce_save_path = os.path.join(reduce_dir, filename_reduce)
    try:
        print(f"{month} prepare to download")

        T_df = T_mrt_usage_history_one_month(url=url)

        print(f"{month} download finished")
        print(f"{month} saving csv")

        T_df.to_csv(filename_full_save_path, encoding="utf-8-sig", index=False)

        print(f"{month} csv has been saved")
        print(f"{month} reducing(group by)")

        reduce_df = T_mrt_usage_history_one_month_recuce(T_df)
        # Write the aggregated frame; writing T_df here would just duplicate
        # the full file under the "reduce" name.
        reduce_df.to_csv(filename_reduce_save_path,
                         encoding="utf-8-sig", index=False)
        print(f"{month} reducing file has been saved")
    except Exception as err:
        # Report the cause. `except Exception` (not a bare except) lets a
        # kernel interrupt stop the loop.
        print(f"ERROR {month} fail: {type(err).__name__}: {err}")
        continue
    

E_mrt_usage_history_csvfilelist finished
201701 prepare to download
T_mrt_usage_history finished
201701 download finished
201701 saving csv
201701 csv has been saved
201701 reducing(group by)
201701 reducing file has been saved
201702 prepare to download
ERROR 201702 fail
201703 prepare to download
