In [1]:
# Dependencies
import json
import requests
from config import api_key
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

In [2]:
#API documentation - https://www.census.gov/data/developers/guidance/api-user-guide.html
#                    https://api.census.gov/data/timeseries/eits/mrts/variables.html
#                    https://api.census.gov/data/timeseries/eits.html
query_url = "https://api.census.gov/data/timeseries/eits/mrts?get=data_type_code,time_slot_id,seasonally_adj,category_code,cell_value&time=from+2010+to+2020&key="+api_key

In [3]:
#Displays if the user is getting a sucessful response from the API. 
print(requests.get(query_url))

<Response [200]>


In [4]:
#Data is pulled and displayed as JSON. Data pulled contains information from 2010 up until the most recently uploaded 2020 data
#User should input the following line prior to opening the notebook: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
response = requests.get(query_url).json()
print(response)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
#Save the JSON as DataFrame for Pandas manipulation
response_df = pd.DataFrame(response[1:], columns = response[0])
response_df.head()

Unnamed: 0,data_type_code,time_slot_id,seasonally_adj,category_code,cell_value,time
0,MPCIM,721,no,44000,-0.8,2010-01
1,IR,721,no,44000,1.51,2010-01
2,SM,721,no,44000,279044.0,2010-01
3,MPCSM,721,no,44000,-23.1,2010-01
4,IM,721,no,44000,420088.0,2010-01


In [6]:
#Checking for incomplete data
response_df.count()

data_type_code    75440
time_slot_id      75440
seasonally_adj    75440
category_code     75440
cell_value        75440
time              75440
dtype: int64

In [7]:
#Cleaning up to DataFrame
data_df=response_df[((response_df["category_code"] == "44W72") | (response_df["category_code"] == "4541")) &
                 ((response_df["data_type_code"] == "SM") | (response_df["data_type_code"] == "MPCSM")) &
                 (response_df["seasonally_adj"] == "no")]

data_df["year"], data_df["month"] = data_df['time'].str.split('-', 1).str
data_df["year"] = data_df["year"].astype(int)


data_df["month"].replace({"01":"Jan","02":"Feb","03":"Mar","04":"Apr","05":"May","06":"Jun","07":"Jul","08":"Aug",
                          "09":"Sep","10":"Oct","11":"Nov","12":"Dec"}, inplace=True)

data_df["mon/yy"] = ""
data_df["mon/yy"] = data_df["month"] + "/" + data_df["year"].astype(str)

complete_df = pd.DataFrame({"month": data_df["month"], "year": data_df["year"], "amount": data_df["cell_value"],
              "dt_code": data_df["data_type_code"], "cat_code": data_df["category_code"], "monthyear" : data_df["mon/yy"]})

complete_df.reset_index(inplace=True,drop=True)
complete_df.sort_values(["cat_code", "dt_code"], inplace=True,ascending=False)
complete_df.reset_index(inplace=True, drop=True)
complete_df.to_csv("censusdata_complete2.csv", index=False, header=True)
complete_df

Unnamed: 0,month,year,amount,dt_code,cat_code,monthyear
0,Jan,2010,19565,SM,4541,Jan/2010
1,Feb,2010,18419,SM,4541,Feb/2010
2,Mar,2010,21629,SM,4541,Mar/2010
3,Apr,2010,20385,SM,4541,Apr/2010
4,May,2010,20021,SM,4541,May/2010
...,...,...,...,...,...,...
495,Jan,2020,-21.9,MPCSM,44W72,Jan/2020
496,Feb,2020,-1.1,MPCSM,44W72,Feb/2020
497,Mar,2020,5.6,MPCSM,44W72,Mar/2020
498,Apr,2020,-14.0,MPCSM,44W72,Apr/2020


In [8]:
#Ecommerce Monthly Sales
ecomm_sales = complete_df[(complete_df["cat_code"] == "4541") & (complete_df["dt_code"] == "SM")]
ecomm_sales.reset_index(inplace=True,drop=True)
ecomm_sales

Unnamed: 0,month,year,amount,dt_code,cat_code,monthyear
0,Jan,2010,19565,SM,4541,Jan/2010
1,Feb,2010,18419,SM,4541,Feb/2010
2,Mar,2010,21629,SM,4541,Mar/2010
3,Apr,2010,20385,SM,4541,Apr/2010
4,May,2010,20021,SM,4541,May/2010
...,...,...,...,...,...,...
120,Jan,2020,56350,SM,4541,Jan/2020
121,Feb,2020,53979,SM,4541,Feb/2020
122,Mar,2020,62588,SM,4541,Mar/2020
123,Apr,2020,69952,SM,4541,Apr/2020


In [9]:
#In Store Monthly Sales
total_sales = complete_df[(complete_df["cat_code"] == "44W72")& (complete_df["dt_code"] == "SM")]
total_sales.reset_index(inplace=True,drop=True)
total_sales

Unnamed: 0,month,year,amount,dt_code,cat_code,monthyear
0,Jan,2010,230542,SM,44W72,Jan/2010
1,Feb,2010,227215,SM,44W72,Feb/2010
2,Mar,2010,255997,SM,44W72,Mar/2010
3,Apr,2010,253668,SM,44W72,Apr/2010
4,May,2010,261194,SM,44W72,May/2010
...,...,...,...,...,...,...
120,Jan,2020,349654,SM,44W72,Jan/2020
121,Feb,2020,345746,SM,44W72,Feb/2020
122,Mar,2020,365250,SM,44W72,Mar/2020
123,Apr,2020,314130,SM,44W72,Apr/2020


In [10]:
#Monthly Sales information for both Datasets
monthly_sales_both = pd.DataFrame({"month": ecomm_sales["month"], "year": ecomm_sales["year"], 
                       "ecomm sales(MIL$)": ecomm_sales["amount"], 
                       "instore sales(MIL$)": (total_sales["amount"].astype(int) - ecomm_sales["amount"].astype(int)),
                       "ecomm %":  round(ecomm_sales["amount"].astype(int) / total_sales["amount"].astype(int) * 100,2),
                       "instore %": round((total_sales["amount"].astype(int) - ecomm_sales["amount"].astype(int)) / total_sales["amount"].astype(int) * 100,2),
                       "total sales": total_sales["amount"]})
monthly_sales_both

Unnamed: 0,month,year,ecomm sales(MIL$),instore sales(MIL$),ecomm %,instore %,total sales
0,Jan,2010,19565,210977,8.49,91.51,230542
1,Feb,2010,18419,208796,8.11,91.89,227215
2,Mar,2010,21629,234368,8.45,91.55,255997
3,Apr,2010,20385,233283,8.04,91.96,253668
4,May,2010,20021,241173,7.67,92.33,261194
...,...,...,...,...,...,...,...
120,Jan,2020,56350,293304,16.12,83.88,349654
121,Feb,2020,53979,291767,15.61,84.39,345746
122,Mar,2020,62588,302662,17.14,82.86,365250
123,Apr,2020,69952,244178,22.27,77.73,314130


In [11]:
#Change 'amount' Datatype in Ecommerce Monthly Sales DataFrame
ecomm_sales["amount"]=ecomm_sales["amount"].astype(int)
ecomm_sales.dtypes

month        object
year          int64
amount        int64
dt_code      object
cat_code     object
monthyear    object
dtype: object

In [12]:
#Ecommerce Annual Sales
ecomm_annual_sales = ecomm_sales.groupby("year").agg({'amount': "sum"})
ecomm_annual_sales

Unnamed: 0_level_0,amount
year,Unnamed: 1_level_1
2010,263488
2011,294697
2012,328655
2013,350487
2014,386065
2015,433987
2016,488840
2017,551298
2018,611743
2019,705534


In [13]:
#Change 'amount' Datatype in In Store Monthly Sales DataFrame
total_sales["amount"]=total_sales["amount"].astype(int)
total_sales.dtypes

month        object
year          int64
amount        int64
dt_code      object
cat_code     object
monthyear    object
dtype: object

In [14]:
#In Store Annual Sales
total_annual_sales = total_sales.groupby("year").agg({'amount': "sum"})
total_annual_sales

Unnamed: 0_level_0,amount
year,Unnamed: 1_level_1
2010,3093706
2011,3251907
2012,3384477
2013,3492856
2014,3656015
2015,3811348
2016,3946780
2017,4112980
2018,4306377
2019,4479461


In [15]:
#Aggregate Data and create annual summary DataFrame
ecomm_annual_sales = (ecomm_sales.groupby("year")["amount"].sum())
total_annual_sales = (total_sales.groupby("year")["amount"].sum())
instore_annual_sales = total_annual_sales - ecomm_annual_sales
ecom_per = round((ecomm_annual_sales / total_annual_sales) * 100,2)
instore_per = round((instore_annual_sales / total_annual_sales) * 100,2)
years = ecomm_sales.groupby('year')["year"].mean()

yearly_sales_both = pd.DataFrame({"year": years, "ecomm sales(MIL$)": ecomm_annual_sales,
                                  "instore sales(MIL$)": instore_annual_sales, "ecomm %": ecom_per, 
                                  "instore %": instore_per, "total sales": total_annual_sales})
yearly_sales_both

Unnamed: 0_level_0,year,ecomm sales(MIL$),instore sales(MIL$),ecomm %,instore %,total sales
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,2010,263488,2830218,8.52,91.48,3093706
2011,2011,294697,2957210,9.06,90.94,3251907
2012,2012,328655,3055822,9.71,90.29,3384477
2013,2013,350487,3142369,10.03,89.97,3492856
2014,2014,386065,3269950,10.56,89.44,3656015
2015,2015,433987,3377361,11.39,88.61,3811348
2016,2016,488840,3457940,12.39,87.61,3946780
2017,2017,551298,3561682,13.4,86.6,4112980
2018,2018,611743,3694634,14.21,85.79,4306377
2019,2019,705534,3773927,15.75,84.25,4479461


In [16]:
#Write DataFrame to CSV files
monthly_sales_both.to_csv("censusdata_monthly_sum.csv", index=False, header=True)
yearly_sales_both.to_csv("censusdata_yearly_sum.csv", index=False, header=True)