# 5章 分析システムを構築する10本ノック

### ノック41：基本的なフォルダを生成しよう

In [1]:
import os
# Root of the working tree; the input, output and master folders live under it.
data_dir = "data"
input_dir, output_dir, master_dir = (
  os.path.join(data_dir, sub_dir)
  for sub_dir in ("0_input", "10_output", "99_master")
)
print(input_dir)

data/0_input


In [2]:
# Create each working folder; exist_ok=True makes re-running the cell harmless.
for folder in (input_dir, output_dir, master_dir):
  os.makedirs(folder, exist_ok=True)

### ノック42：入力データのチェック機構を作ろう

In [3]:
import pandas as pd
# Master file names, resolved against master_dir when loading.
m_area_file, m_store_file = "m_area.csv", "m_store.csv"

m_area_path = os.path.join(master_dir, m_area_file)
m_store_path = os.path.join(master_dir, m_store_file)

m_area = pd.read_csv(m_area_path)
m_store = pd.read_csv(m_store_path)

# Preview the first rows of the area master.
m_area.head(3)

Unnamed: 0,area_cd,wide_area,narrow_area
0,TK,東京,東京
1,KN,神奈川,神奈川
2,CH,千葉,千葉


In [4]:
# Load the main order data for the target month (YYYYMM).
tg_ym = "202007"
target_file = f"tbl_order_{tg_ym}.csv"
target_path = os.path.join(input_dir, target_file)
target_data = pd.read_csv(target_path)

In [5]:
import datetime
# Verify that every order in the file falls inside the target month:
# both the earliest and the latest accept date must format to tg_ym.
order_accept_dates = pd.to_datetime(target_data["order_accept_date"])
max_date = order_accept_dates.max()
min_date = order_accept_dates.min()
max_str_date = max_date.strftime("%Y%m")
min_str_date = min_date.strftime("%Y%m")
if not (min_str_date == tg_ym == max_str_date):
  raise Exception("日付が一致しません")
print("日付が一致しました")

日付が一致しました


In [6]:
def calc_delta(t):
  """Return the elapsed time in minutes between a pair of timestamps.

  Args:
    t: A two-element sequence ``(start, end)`` whose difference exposes
      ``total_seconds()`` (for example two datetimes).

  Returns:
    ``end - start`` expressed in minutes as a float; negative when ``end``
    precedes ``start``.
  """
  started_at, finished_at = t
  elapsed = finished_at - started_at
  seconds = elapsed.total_seconds()
  return seconds/60

def init_tran_df(trg_df, store_master=None, area_master=None):
  """Clean an order table and enrich it with master data and derived columns.

  Steps, in order:
    1. Drop rows whose ``store_id`` is 999 (the maintenance store).
    2. Left-join the store master on ``store_id`` and the area master on
       ``area_cd``.
    3. Add ``takeout_name`` and ``status_name`` labels for the codes that
       have no master table.
    4. Add ``order_date``, ``order_accept_datetime``, ``delivered_datetime``
       and ``delta`` (minutes from order acceptance to delivery).

  Args:
    trg_df: Order DataFrame. The columns ``store_id``, ``takeout_flag``,
      ``status``, ``order_accept_date`` and ``delivered_date`` are read.
    store_master: Store master DataFrame with ``store_id`` and ``area_cd``.
      Defaults to the module-level ``m_store``.
    area_master: Area master DataFrame with ``area_cd``. Defaults to the
      module-level ``m_area``.

  Returns:
    A new DataFrame; the input frame is not modified.
  """
  if store_master is None:
    store_master = m_store
  if area_master is None:
    area_master = m_area

  # Remove the maintenance store's records.
  trg_df = trg_df.loc[trg_df['store_id'] != 999]

  trg_df = pd.merge(trg_df, store_master, on='store_id', how='left')
  trg_df = pd.merge(trg_df, area_master, on='area_cd', how='left')

  # Labels for codes that are not backed by a master table. Codes missing
  # from these mappings are left without a label.
  takeout_names = {0: 'デリバリー', 1: 'お持ち帰り'}
  for flag, label in takeout_names.items():
    trg_df.loc[trg_df['takeout_flag'] == flag, 'takeout_name'] = label

  status_names = {0: '受付', 1: 'お支払済', 2: 'お渡し済', 9: 'キャンセル'}
  for status, label in status_names.items():
    trg_df.loc[trg_df['status'] == status, 'status_name'] = label

  # Parse each timestamp column once and reuse the result.
  accepted_at = pd.to_datetime(trg_df['order_accept_date'])
  delivered_at = pd.to_datetime(trg_df['delivered_date'])

  trg_df['order_date'] = accepted_at.dt.date
  trg_df['order_accept_datetime'] = accepted_at
  trg_df['delivered_datetime'] = delivered_at

  # Minutes until delivery, computed column-wise instead of via a
  # row-by-row apply.
  trg_df['delta'] = (delivered_at - accepted_at).dt.total_seconds() / 60

  return trg_df

# Initialize the current month's order data
target_data = init_tran_df(target_data)

### ノック43：レポーティング（本部向け）を関数化してみよう

In [7]:
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import PatternFill, Border, Side, Font

def get_rank_df(target_data, store_master=None):
  """Build the per-store sales ranking.

  Only orders with ``status`` 1 or 2 are counted. ``total_amount`` is summed
  per ``store_id`` and the stores are ordered from highest to lowest total.

  Args:
    target_data: Order DataFrame with ``store_id``, ``status`` and
      ``total_amount`` columns.
    store_master: Store master DataFrame with a ``store_id`` column, joined
      to attach store attributes. Defaults to the module-level ``m_store``.

  Returns:
    DataFrame with ``store_id``, ``total_amount`` and the store master's
    columns, sorted by ``total_amount`` in descending order.
  """
  if store_master is None:
    store_master = m_store

  completed = target_data.loc[target_data['status'].isin([1, 2])]
  sales = (
    completed.groupby('store_id')['total_amount']
    .sum()
    .sort_values(ascending=False)
    .reset_index()  # make store_id an explicit column to merge on
  )

  # A left merge keeps the descending sales order of `sales`.
  return pd.merge(sales, store_master, on='store_id', how='left')

def get_cancel_rank_df(target_data, store_master=None):
  """Build the per-store cancellation-rate ranking.

  The rate is ``cancelled / (status in 1, 2, 9) * 100`` per ``store_id``,
  where ``status == 9`` marks a cancelled order. Stores with no cancelled
  orders get a rate of 0 rather than a missing value, so they rank first
  in the ascending order.

  Args:
    target_data: Order DataFrame with ``store_id`` and ``status`` columns.
    store_master: Store master DataFrame with a ``store_id`` column, joined
      to attach store attributes. Defaults to the module-level ``m_store``.

  Returns:
    DataFrame with ``store_id``, ``cancel_rate`` (percent) and the store
    master's columns, sorted by ``cancel_rate`` in ascending order.
  """
  if store_master is None:
    store_master = m_store

  # Denominator: every order that reached status 1, 2 or 9.
  counted = target_data.loc[target_data['status'].isin([1, 2, 9])]
  order_cnt = counted.groupby('store_id').size()

  # Numerator: cancelled orders. Stores without any cancellation are absent
  # from the group result, so align to the denominator and fill with 0;
  # otherwise the division would yield NaN for them.
  cancel_cnt = (
    counted.loc[counted['status'] == 9]
    .groupby('store_id')
    .size()
    .reindex(order_cnt.index, fill_value=0)
  )

  cancel_rate = (cancel_cnt / order_cnt) * 100
  cancel_df = cancel_rate.rename('cancel_rate').reset_index()
  cancel_df = pd.merge(cancel_df, store_master, on='store_id', how='left')
  cancel_df = cancel_df.sort_values('cancel_rate', ascending=True)

  return cancel_df

In [8]:
def data_export(df, ws, row_start, col_start):
  """Write a DataFrame onto a worksheet as a bordered table.

  The column header goes on ``row_start`` and the data rows follow below it,
  starting at column ``col_start``. The DataFrame index is not written.
  Every written cell gets a thin border; header cells additionally get a
  solid fill and a bold white font.

  Args:
    df: DataFrame to write.
    ws: openpyxl worksheet that receives the cells.
    row_start: 1-based row number of the header row.
    col_start: 1-based column number of the first column.
  """
  # Style objects are built once and shared by all cells.
  thin_edge = Side(style='thin', color='008080')
  cell_border = Border(top=thin_edge, bottom=thin_edge, left=thin_edge, right=thin_edge)
  header_fill = PatternFill(patternType='solid', fgColor='008080')
  header_font = Font(bold=True, color='FFFFFF')

  table_rows = dataframe_to_rows(df, index=False, header=True)

  for row_offset, record in enumerate(table_rows):
    sheet_row = row_start + row_offset
    is_header = row_offset == 0
    for col_offset, item in enumerate(record):
      target = ws.cell(sheet_row, col_start + col_offset)
      target.value = item
      target.border = cell_border
      if is_header:
        target.fill = header_fill
        target.font = header_font

In [9]:
# Report generation for headquarters
def make_report_hq(target_data, output_folder, report_ym=None):
  """Create the headquarters summary workbook and save it as an .xlsx file.

  The sheet contains a title, the grand total of ranked sales, the sales
  ranking table and the cancellation-rate ranking table. The file is saved
  as ``report_hq_<report_ym>.xlsx`` inside ``output_folder``.

  Args:
    target_data: Order DataFrame passed on to ``get_rank_df`` and
      ``get_cancel_rank_df``.
    output_folder: Existing folder to write the workbook into.
    report_ym: Month label used in the title and the file name. Defaults to
      the module-level ``max_str_date`` so existing calls behave as before;
      pass it explicitly when reporting on a different month's data.
  """
  if report_ym is None:
    report_ym = max_str_date

  rank = get_rank_df(target_data)
  cancel_rank = get_cancel_rank_df(target_data)

  # Build the workbook
  wb = openpyxl.Workbook()
  ws = wb.active
  ws.title = 'サマリーレポート（本部向け）'

  # Report title
  cell = ws.cell(1, 1)
  cell.value = f'本部向け {report_ym}月度 サマリーレポート'
  cell.font = Font(bold=True, color='008080', size=20)

  # Grand total of the ranked sales, with thousands separators
  cell = ws.cell(3, 6)
  cell.value = f"{rank['total_amount'].sum():,}"
  cell.font = Font(bold=True, color='008080', size=20)

  # Sales ranking: caption, then the table directly below it
  cell = ws.cell(5, 2)
  cell.value = '売り上げランキング'
  cell.font = Font(bold=True, color='008080', size=16)
  data_export(rank, ws, 6, 2)

  # Cancellation-rate ranking: caption, then the table directly below it
  cell = ws.cell(5, 8)
  cell.value = 'キャンセル率ランキング'
  cell.font = Font(bold=True, color='008080', size=16)
  data_export(cancel_rank, ws, 6, 8)

  wb.save(os.path.join(output_folder, f'report_hq_{report_ym}.xlsx'))