In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import os

# Parent directory to read in datasets.
# Kept relative to the notebook's working directory; the trailing slash is required
# because later cells build the per-week folder path by plain string concatenation.
source_data_folder = 'source_data/pulse_survey/'

In [2]:
# Setup that has to exist before any data file is read
# Lookup from three-letter month abbreviation ("Mmm") to zero-padded 2 digit month
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_dict = {
    abbreviation: f'{month_number:02d}'
    for month_number, abbreviation in enumerate(month_abbreviations, start=1)
}

# Empty dataframe with the national_state_sector columns - the loop below populates it
columns = [
    'ST',
    'NAICS_SECTOR',
    'INSTRUMENT_ID',
    'QUESTION',
    'ANSWER_ID',
    'ANSWER_TEXT',
    'ESTIMATE_PERCENTAGE',
    'SE',
    'START_DATE',
    'END_DATE',
]
national_state_sector_df = pd.DataFrame(columns=columns)

In [3]:
# Loop through each week's folder, read in its 'national_state_sector' file(s) and stack
# them into a single dataframe.
# File names look like 'national_state_sector_26Apr20_02May20.xlsx': the survey window is
# encoded as DDMmmYY_DDMmmYY immediately after the prefix and an underscore.
file_prefix = 'national_state_sector'


def _ddmmmyy_to_date(token):
    """Convert a 'DDMmmYY' token (e.g. '26Apr20') to an 'MM/DD/YYYY' string.

    The month abbreviation is resolved through ``month_dict``; an unknown
    abbreviation raises ``KeyError``. The century is assumed to be 20xx.
    """
    day, month_abbr, short_year = token[0:2], token[2:5], token[5:7]
    return f'{month_dict[month_abbr]}/{day}/20{short_year}'


# Collect each file's frame and concatenate once at the end: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop re-copied every accumulated row each time.
weekly_frames = []
for i in range(1, 37):
    data_folder = f'{source_data_folder}week{i}/'
    # iterate through each week's folder
    # NOTE: approach taken from https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory
    # sorted() because os.listdir returns entries in arbitrary order
    for filename in sorted(os.listdir(data_folder)):
        if not filename.startswith(file_prefix):
            continue
        print(f'----- Reading in filename: {filename} -----')

        # Parse file name for start date and end date
        date_stamp = filename[len(file_prefix) + 1:]
        start_date = _ddmmmyy_to_date(date_stamp[0:7])
        end_date = _ddmmmyy_to_date(date_stamp[8:15])

        # Read in dataframe to a current_df
        current_df = pd.read_excel(data_folder + filename)

        # Add in start and end date columns
        current_df['START_DATE'] = start_date
        current_df['END_DATE'] = end_date

        weekly_frames.append(current_df)

# Rebuild the consolidated frame from scratch so re-running this cell does not duplicate rows.
# If no matching file was found, the empty frame created in the cell above is left untouched.
if weekly_frames:
    national_state_sector_df = pd.concat(weekly_frames)

national_state_sector_df.head()

----- Reading in filename: national_state_sector_26Apr20_02May20.xlsx -----
----- Reading in filename: national_state_sector_03May20_09May20.xlsx -----
----- Reading in filename: national_state_sector_10May20_16May20.xlsx -----
----- Reading in filename: national_state_sector_17May20_23May20.xlsx -----
----- Reading in filename: national_state_sector_24May20_30May20.xlsx -----
----- Reading in filename: national_state_sector_31May20_06Jun20.xlsx -----
----- Reading in filename: national_state_sector_07Jun20_13Jun20.xlsx -----
----- Reading in filename: national_state_sector_14Jun20_20Jun20.xlsx -----
----- Reading in filename: national_state_sector_21Jun20_27Jun20.xlsx -----
----- Reading in filename: national_state_sector_09Aug20_15Aug20.xlsx -----
----- Reading in filename: national_state_sector_16Aug20_22Aug20.xlsx -----
----- Reading in filename: national_state_sector_23Aug20_29Aug20.xlsx -----
----- Reading in filename: national_state_sector_30Aug20_05Sep20.xlsx -----
----- Readin

Unnamed: 0,ST,NAICS_SECTOR,INSTRUMENT_ID,QUESTION,ANSWER_ID,ANSWER_TEXT,ESTIMATE_PERCENTAGE,SE,START_DATE,END_DATE
0,-,-,1,"Overall, how has this business been affected b...",1,Large negative effect,51.4%,0.21%,04/26/2020,05/02/2020
1,-,-,1,"Overall, how has this business been affected b...",2,Moderate negative effect,38.5%,0.27%,04/26/2020,05/02/2020
2,-,-,1,"Overall, how has this business been affected b...",3,Little or no effect,7.6%,0.29%,04/26/2020,05/02/2020
3,-,-,1,"Overall, how has this business been affected b...",4,Moderate positive effect,1.7%,0.08%,04/26/2020,05/02/2020
4,-,-,1,"Overall, how has this business been affected b...",5,Large positive effect,0.8%,0.06%,04/26/2020,05/02/2020


In [6]:
# Row count of the consolidated dataframe
national_state_sector_df.shape[0]

634197

In [7]:
# Clean consolidated dataframe

#1.) Drop values not populated
# A '-' in ST or NAICS_SECTOR marks the field as unpopulated; keep only rows where both are filled in
has_state = national_state_sector_df['ST'].ne('-')
has_sector = national_state_sector_df['NAICS_SECTOR'].ne('-')
national_state_sector_df = national_state_sector_df.loc[has_state & has_sector]

In [8]:
# Row count of the dataframe at this point in the notebook
national_state_sector_df.shape[0]

430832