In [110]:
import os
import pandas as pd
from datetime import *
import numpy as np
import matplotlib.pyplot as plt

### Step 1: Create a function that stores the file paths in the folder

In [36]:
def get_filepaths(directory):

    '''
    Read files from a path, and return file names
    ---------------------------------------------

    input:
        directory: string of directory (path)

    return:
        a list of file names

    '''

    file_paths = [] # List which will store all of the full filepaths.

    for filename in os.scandir(directory): # os.scandir() returns an iterator of DirEntry objects
        if filename.is_file(): # if the object is a file
            file_paths.append(filename.path) # append the file name to the list

    return file_paths

In [37]:
# directory paths (change paths if needed)
data_dir_age = '/Users/kazi0302/Desktop/SDSU/MSCSDS/COVID-19-Modeling-and-Parameter-Fitting/Data/Data_unscrambled/Data_by_age'
data_dir_date = '/Users/kazi0302/Desktop/SDSU/MSCSDS/COVID-19-Modeling-and-Parameter-Fitting/Data/Data_unscrambled/Data_by_date'
data_dir_race = '/Users/kazi0302/Desktop/SDSU/MSCSDS/COVID-19-Modeling-and-Parameter-Fitting/Data/Data_unscrambled/Data_by_race'

# store file paths from each catogory
files_age = get_filepaths(data_dir_age)
files_date = get_filepaths(data_dir_date)
files_race = get_filepaths(data_dir_race)

### Step 2: Create a function that reads each file stored and extract the information needed

In [142]:
def get_data(filename):

    '''
    Read data file, extract corresponding rows and store each region
    ----------------------------------------------------------------

    input:
        filename: string of file name

    return:
        dictionaries of dataframes of each region
    
    '''

    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.xlsx'):
        df = pd.read_excel(filename)
    else:
        print('File type not supported')

    central = []
    east = []
    north_central = []
    north_coastal = []
    north_inland = []
    south = []

    for index, row in df.iterrows():
        temp = [] # date, raw, total

        # central region data
        if row[0] == 'Central Region' and row[1] and row[2]:
            temp.append(row[-1])
            temp.append(row[1])
            temp.append(row[2])
            central.append(temp)

        # east region data
        elif row[0] == 'East Region':
            temp.append(row[-1])
            temp.append(row[1])
            temp.append(row[2])
            east.append(temp)

        # north central region data
        elif row[0] == 'North Central Region':
            temp.append(row[-1])
            temp.append(row[1])
            temp.append(row[2])
            north_central.append(temp)

        # north coastal region data
        elif row[0] == 'North Coastal Region':
            temp.append(row[-1])
            temp.append(row[1])
            temp.append(row[2])
            north_coastal.append(temp)

        # north inland region data
        elif row[0] == 'North Inland Region':
            temp.append(row[-1])
            temp.append(row[1])
            temp.append(row[2])
            north_inland.append(temp)

        # south region data
        elif row[0] == 'South Region':
            temp.append(row[-1])
            temp.append(row[1])
            temp.append(row[2])
            south.append(temp)

        else:
            pass

    return central, east, north_central, north_coastal, north_inland, south

    # new_central = pd.DataFrame(central, index=['date', 'raw', 'total'])
    # new_east = pd.DataFrame(east, index=['date', 'raw', 'total'])
    # new_north_central = pd.DataFrame(north_central, index=['date', 'raw', 'total'])
    # new_north_coastal = pd.DataFrame(north_coastal, index=['date', 'raw', 'total'])
    # new_north_inland = pd.DataFrame(north_inland, index=['date', 'raw', 'total'])
    # new_south = pd.DataFrame(south, index=['date', 'raw', 'total'])


### Step 3: Convert the data into Pandas dataframe

In [145]:
data_central = pd.DataFrame(index=['date', 'raw', 'total'])

for file in files_date:
    
    central, east, north_central, north_coastal, north_inland, south = get_data(file)

    print(central)

[['9/1/2020', 8215.0, 1594.7]]
[['11/11/2020', 13035.0, 2530.4]]
[['4/23/2020', 598.0, 115.491]]
[['6/15/2020', 2138.0, 412.91]]
[['6/22/2020', 2514.0, 485.5]]
[['5/18/2020', 1276.0, 246.431]]
[['7/28/2020', 6066.0, 1177.5]]
[['9/24/2020', 10249.0, 1989.6]]
[['9/13/2020', 9443.0, 1833.1]]
[['12/9/2020', 18983.0, 3685.0]]
[['11/3/2020', 12341.0, 2395.7]]
[['5/15/2020', 1184.0, 228.664]]
[['4/19/2020', 520.0, 100.427]]
[[Timestamp('2020-06-17 00:00:00'), 2222.0, 429.1]]
[['6/7/2020', 1874.0, 361.922]]
[['10/29/2020', 12001.0, 2329.7]]
[['5/10/2020', 1043.0, 201.433]]
[['6/18/2020', 2278.0, 439.9]]
[['10/1/2020', 10596.0, 2056.9]]
[['4/3/2020', 296.0, 57.1659]]
[['8/2/2020', 6450.0, 1252.1]]
[['10/24/2020', 11703.0, 2271.8]]
[['10/13/2020', 11177.0, 2169.7]]
[['5/2/2020', 818.0, 157.979]]
[['4/11/20', 424.0, 81.8863]]
[['8/27/2020', 7930.0, 1539.4]]
[['12/4/2020', 17637.0, 3423.7]]
[['8/10/2020', 7032.0, 1365.1]]
[['7/25/2020', 5793.0, 1118.8]]
[['4/30/2020', 771.0, 148.902]]
[['9/29/2020

  warn("""Cannot parse header or footer so it will be ignored""")


[['11/14/2020', 13464.0, 2613.7]]
[['5/28/2020', 1553.0, 299.928]]
[['10/30/2020', 12117.0, 2352.2]]
[['10/9/2020', 11002.0, 2135.7]]
[['6/27/2020', 2999.0, 579.2]]
[['4/13/2020', 438.0, 84.59]]
[['6/10/2020', 1965.0, 379.497]]
[['7/31/2020', 6307.0, 1224.3]]
[[Timestamp('2020-05-06 00:00:00'), 918.0, 177.292]]
[['9/4/2020', 8605.0, 1670.4]]
[['7/8/2020', 4051.0, 782.4]]
[['5/21/2020', 1366.0, 263.813]]
[['5/12/2020', 1098.0, 212.055]]
[['11/6/2020', 12616.0, 2449.0]]
[['4/29/2020', 739.0, 142.722]]
[[Timestamp('2020-05-20 00:00:00'), 1340.0, 258.792]]
[['9/16/2020', 9664.0, 1876.0]]
[['9/21/2020', 10028.0, 1946.7]]
[['8/18/2020', 7451.0, 1446.4]]
[['4/7/2020', 368.0, 71.0711]]
[['9/26/2020', 10384.0, 2015.8]]
[['8/28/2020', 7979.0, 1548.9]]
[['11/1/2020', 12236.0, 2375.3]]
[['6/5/2020', 1809.0, 349.369]]
[[Timestamp('2020-04-14 00:00:00'), 457.0, 88.26]]
[['9/3/2020', 8460.0, 1642.3]]
[['4/15/2020', 470.0, 90.77]]
[['4/22/2020', 560.0, 108.152]]
[['5/6/2020', 918.0, 177.292]]
[['11/13

In [140]:
data_central

date
raw
total
