In [None]:
#########################################################################################################
#  RUT-SOM-DATA-PT-06-2020-U-C-Team5                                                      July 28, 2020 #
#     Project1                                                                                          #
#    program-name = censusdata_extraction                                                               #
#00    > read in US Census bureau data file and key files for Monthly Retail Trade and                  #
#        Food Sales (MRTS), merge files on key values.                                                  #                       
#01    > keep only non adjusted sales (seasonally_adj="no"), monthly sales(SM) and Monthly Percentage   #
#        Change(MPCSM) for electronic shopping and mail order(4541) and MRTFS excluding                 #
#        automotive and fuel sales (44W72) between 2010 and 2020 (inclusive).                           #
#      > create complete df and write to csv.                                                           #
#02    > create two output csv files, summarizing monthly sales and annual sales for both ecomm         #
#        and instore purchases.                                                                         #
#########################################################################################################

In [9]:
####################################################
#00     I/O and Table S/U                          #
#   a- import code.                                #
#   b- associate files to variables.               #  
#    - read input files into df's.                 #
####################################################

#a
import pandas as pd
import numpy as np
import requests
import json
import warnings; warnings.simplefilter('ignore')

#b
data_in = "datain/data3.csv"
# category_code holds numeric-looking codes ("4541") next to alphanumeric ones ("44W72")
# and is compared against string literals later on, so it is read as text explicitly
# instead of leaving the type to read_csv's inference.
data_df = pd.read_csv(data_in, dtype={"category_code": str})


In [10]:
###########################################################################################
#01      Create Complete Dataframe                                                        #
#   a- keep the seasonally_adj == "no" rows with data type SM or MPCSM for category       #
#      codes 44W72 and 4541.                                                              #
#   b- derive an integer "year" and a month-name "month" from the "time" column.          #
#   c- build complete_df, order it by category and data type, and write it to csv.        #
###########################################################################################

#a
wanted_rows = (data_df["category_code"].isin(["44W72", "4541"]) &
               data_df["data_type_code"].isin(["SM", "MPCSM"]) &
               (data_df["seasonally_adj"] == "no"))
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments below are real writes rather than chained assignments on a slice.
data_df = data_df[wanted_rows].copy()

#b
month_names = {"01":"Jan","02":"Feb","03":"Mar","04":"Apr","05":"May","06":"Jun",
               "07":"Jul","08":"Aug","09":"Sep","10":"Oct","11":"Nov","12":"Dec"}

# Split on the first "-" only; expand=True returns the two pieces as columns 0 and 1,
# which replaces tuple-unpacking the .str accessor (not supported by current pandas).
time_parts = data_df["time"].str.split("-", n=1, expand=True)
data_df["year"] = time_parts[0].astype(int)
# Assign the mapped values back instead of replace(..., inplace=True) on a column
# selection, which does not reliably modify the parent frame.
data_df["month"] = time_parts[1].replace(month_names)
data_df["cell_value"] = data_df["cell_value"].astype(float)

#c
complete_df = pd.DataFrame({"month": data_df["month"], "year": data_df["year"], "amount": data_df["cell_value"],
              "dt_code": data_df["data_type_code"], "cat_code": data_df["category_code"]})

# Descending sort on both keys, then a fresh 0..n-1 index for the sorted rows.
complete_df = (complete_df
               .sort_values(["cat_code", "dt_code"], ascending=False)
               .reset_index(drop=True))
complete_df.to_csv("Data/censusdata_complete.csv", index=False, header=True)

In [21]:
###############################################################
#02      Monthly and Annual Summary                           #
#   a- split df into e-comm and in store for monthly sales.   #   THIS CODE ASSUMES CODE 44000 OR 44W72 IS TOTAL RETAIL
#   b- create monthly df of sales information.                #   (in-store = total - e-comm).
#   c- aggregate data and create annual summary df.           #
#   d- write out df's to csv files.                           #
###############################################################

#a
sm_rows = complete_df[complete_df["dt_code"] == "SM"]
ecomm_sales = sm_rows[sm_rows["cat_code"] == "4541"].reset_index(drop=True)
total_sales = sm_rows[sm_rows["cat_code"] == "44W72"].reset_index(drop=True)

#b
# Pair the two series on (year, month) instead of on row position: positional pairing
# silently mixes up months as soon as one series is missing a row or is ordered differently.
# validate="one_to_one" raises if either series has a duplicated (year, month), which would
# otherwise double-count sales. The inner join keeps rows in the order of ecomm_sales.
paired = ecomm_sales[["year", "month", "amount"]].merge(
    total_sales[["year", "month", "amount"]],
    on=["year", "month"], how="inner", validate="one_to_one",
    suffixes=("_ecomm", "_total"))

ecomm_amount = paired["amount_ecomm"]
total_amount = paired["amount_total"]
instore_amount = total_amount - ecomm_amount

monthly_sales_both = pd.DataFrame({"month": paired["month"], "year": paired["year"],
                       "ecomm sales(MIL$)": ecomm_amount.astype(int),
                       "instore sales(MIL$)": instore_amount.astype(int),
                       "ecomm %": (ecomm_amount / total_amount * 100).round(2),
                       "instore %": (instore_amount / total_amount * 100).round(2),
                       "total sales": total_amount.astype(int)})

#c
# Annual sums are taken over the paired months so both columns cover the same months.
annual_sums = paired.groupby("year")[["amount_ecomm", "amount_total"]].sum().astype(int)
ecomm_annual_sales = annual_sums["amount_ecomm"]
total_annual_sales = annual_sums["amount_total"]
instore_annual_sales = total_annual_sales - ecomm_annual_sales
ecom_per = (ecomm_annual_sales / total_annual_sales * 100).round(2)
instore_per = (instore_annual_sales / total_annual_sales * 100).round(2)
# Take the year labels straight from the group index; averaging the year column would
# turn them into floats and write "2010.0" style values to the csv.
years = annual_sums.index.to_series()

yearly_sales_both = pd.DataFrame({"year": years, "ecomm sales(MIL$)": ecomm_annual_sales,
                                  "instore sales(MIL$)": instore_annual_sales, "ecomm %": ecom_per,
                                  "instore %": instore_per, "total sales": total_annual_sales})

#d
monthly_sales_both.to_csv("Data/censusdata_monthly_sum.csv", index=False, header=True)
yearly_sales_both.to_csv("Data/censusdata_yearly_sum.csv", index=False, header=True)

In [None]:
###############################################################
#02      Monthly and Annual Summary                           #
#   a- split df into e-comm and in store for monthly sales.   #   THIS CODE ASSUMES CODE 44000 OR 44W72 DOES NOT INCLUDE
#   b- create monthly df of sales information.                #   ECOMM AND TOTAL RETAIL IS THE SUM WITH 4541.
#   c- aggregate data and create annual summary df.           #
#   d- write out df's to csv files.                           #
###############################################################
# NOTE(review): this cell writes to the same two csv paths as the preceding summary cell,
# so running the whole notebook leaves only this cell's version on disk - confirm which
# assumption about 44W72 is the intended one.

#a
sm_rows = complete_df[complete_df["dt_code"] == "SM"]
ecomm_sales = sm_rows[sm_rows["cat_code"] == "4541"].reset_index(drop=True)
comm_sales = sm_rows[sm_rows["cat_code"] == "44W72"].reset_index(drop=True)

#b
# Pair the two series on (year, month) instead of on row position: positional pairing
# silently mixes up months as soon as one series is missing a row or is ordered differently.
# validate="one_to_one" raises if either series has a duplicated (year, month), which would
# otherwise double-count sales. The inner join keeps rows in the order of ecomm_sales.
paired = ecomm_sales[["year", "month", "amount"]].merge(
    comm_sales[["year", "month", "amount"]],
    on=["year", "month"], how="inner", validate="one_to_one",
    suffixes=("_ecomm", "_comm"))

ecomm_amount = paired["amount_ecomm"]
comm_amount = paired["amount_comm"]
combined_amount = ecomm_amount + comm_amount

monthly_sales_both = pd.DataFrame({"month": paired["month"], "year": paired["year"],
                       "ecomm sales(MIL$)": ecomm_amount.astype(int), "instore sales(MIL$)": comm_amount.astype(int),
                       "ecomm %": (ecomm_amount / combined_amount * 100).round(2),
                       "total sales": combined_amount.astype(int)})

#c
# Annual sums are taken over the paired months so both columns cover the same months.
annual_sums = paired.groupby("year")[["amount_ecomm", "amount_comm"]].sum().astype(int)
ecomm_annual_sales = annual_sums["amount_ecomm"]
comm_annual_sales = annual_sums["amount_comm"]
total_annual_sales = ecomm_annual_sales + comm_annual_sales
ecom_per = (ecomm_annual_sales / total_annual_sales * 100).round(2)
# Take the year labels straight from the group index; averaging the year column would
# turn them into floats and write "2010.0" style values to the csv.
years = annual_sums.index.to_series()

yearly_sales_both = pd.DataFrame({"year": years, "ecomm sales(MIL$)": ecomm_annual_sales, "instore sales(MIL$)": comm_annual_sales,
                                  "ecomm %": ecom_per, "total sales": total_annual_sales})

#d
monthly_sales_both.to_csv("Data/censusdata_monthly_sum.csv", index=False, header=True)
yearly_sales_both.to_csv("Data/censusdata_yearly_sum.csv", index=False, header=True)