In [1]:
import os
import pandas as pd

# Consolidate the daily report CSVs into one wide table of US confirmed
# cases: one row per 'Province_State' and one 'Confirmed MM-DD-YYYY' column
# per report date. The result is printed and written to
# 'consolidated_covid_data_us.csv' in the current working directory.

# Directory containing the CSV files
data_dir = '../johns_hopkins_github_data_pull/covid_reports_2020'

# List all CSV files in the directory. os.listdir() returns names in
# arbitrary order, so sort them to make the row order of combined_df
# reproducible from run to run.
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

# Fail early with a message that names the directory; without this check
# pd.concat() below raises a bare "No objects to concatenate" ValueError.
if not csv_files:
    raise ValueError(f'No CSV files found in {data_dir!r}')

# Initialize an empty list to hold individual DataFrames
dataframes = []

# Load each CSV file into a DataFrame
for file in csv_files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    # Extract the date from the filename and create a new column for it
    date = file.split('.')[0]  # Assumes filename is of the form 'MM-DD-YYYY.csv'
    # An explicit format makes a mis-named file raise instead of being
    # parsed as some other date.
    df['Date'] = pd.to_datetime(date, format='%m-%d-%Y')
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Keep only the rows whose 'Country_Region' is 'US'.
# NOTE(review): a report whose header lacks 'Country_Region' gets NaN in that
# column after the concat, so all of its rows are dropped here without any
# warning -- confirm every input file uses this column name.
us_data = combined_df[combined_df['Country_Region'] == 'US']

# Pivot to one row per state and one column per report date. aggfunc='sum'
# adds up 'Confirmed' over all rows sharing the same state and date.
pivot_df = us_data.pivot_table(index=['Province_State'], columns='Date', values='Confirmed', aggfunc='sum')

# Flatten the columns and rename them to include the date in the format 'Confirmed MM-DD-YYYY'
pivot_df.columns = [f'Confirmed {col.strftime("%m-%d-%Y")}' for col in pivot_df.columns]

# Reset index to include 'Province_State' as a column
final_df = pivot_df.reset_index()

# Display the final DataFrame
print(final_df)

# Save the final DataFrame to a CSV file (overwrites any existing file)
final_df.to_csv('consolidated_covid_data_us.csv', index=False)


              Province_State  Confirmed 04-01-2020  Confirmed 05-01-2020  \
0                    Alabama                  1171                  7440   
1                     Alaska                   132                   364   
2                    Arizona                  1530                  7969   
3                   Arkansas                   584                  3337   
4                 California                  9262                 52542   
5                   Colorado                  7604                 20228   
6                Connecticut                  3557                 28764   
7                   Delaware                   368                  4918   
8           Diamond Princess                    49                    49   
9       District of Columbia                   586                  4658   
10                   Florida                  6956                 34728   
11                   Georgia                  4638                 27489   
12          