In [14]:
import pandas as pd
import os

# Define the file paths using relative paths
us_data_path = 'consolidated_covid_data_us_with_lat_long_.csv'
world_data_path = 'consolidated_covid_data_world2us.csv'
output_path = 'merged_covid_data_2.csv'

# Rows whose 'Province_State' appears in this list are removed from the merged output
exclude_states = ['American Samoa', 'Northern Mariana Islands', 'Puerto Rico', 'Guam']


def merge_on_state(us_data, world_data, key='Province_State'):
    """Left-merge ``world_data`` onto ``us_data`` using the ``key`` column.

    pd.merge renames every non-key column that exists in both frames to
    ``<name>_x`` / ``<name>_y``, which would make a later selection by the
    original column names raise KeyError. To keep the original names, columns
    of ``world_data`` that ``us_data`` already has are dropped before merging,
    so the values from ``us_data`` (the left table) are the ones kept.

    Parameters:
        us_data: left DataFrame; every one of its rows is kept.
        world_data: right DataFrame; contributes only columns missing from ``us_data``.
        key: name of the join column present in both frames.

    Returns:
        A new DataFrame with the columns of ``us_data`` followed by the
        columns that only ``world_data`` has.
    """
    shared_columns = [
        col for col in world_data.columns
        if col != key and col in us_data.columns
    ]
    world_only = world_data.drop(columns=shared_columns)
    return pd.merge(us_data, world_only, on=key, how='left')


def sort_confirmed_columns(columns):
    """Return the unique names in ``columns`` ordered by the date they contain.

    The token after the first space of each name is parsed with the format
    '%m-%d-%Y' (e.g. 'Confirmed 01-31-2021'). Duplicates are removed with
    dict.fromkeys so the first occurrence wins and the input order is kept
    for names whose dates compare equal.

    Raises:
        IndexError: if a name contains no space.
        ValueError: if the token after the space is not a '%m-%d-%Y' date.
    """
    unique_columns = list(dict.fromkeys(columns))
    return sorted(
        unique_columns,
        key=lambda name: pd.to_datetime(name.split(' ')[1], format='%m-%d-%Y'),
    )


# Print the current working directory
print("Current working directory:", os.getcwd())

# Check if the files exist; report every missing input, not just the first one
missing_paths = [path for path in (us_data_path, world_data_path) if not os.path.exists(path)]
if missing_paths:
    for path in missing_paths:
        print(f"File not found: {path}")
else:
    # Load the data
    us_data = pd.read_csv(us_data_path)
    world_data = pd.read_csv(world_data_path)

    # Merge the data on 'Province_State' (US rows are kept, world columns are added)
    merged_data = merge_on_state(us_data, world_data)

    # Remove specific rows based on 'Province_State' column
    merged_data = merged_data[~merged_data['Province_State'].isin(exclude_states)]

    # Find the confirmed columns from both files
    confirmed_columns_us = [col for col in us_data.columns if col.startswith('Confirmed')]
    confirmed_columns_world = [col for col in world_data.columns if col.startswith('Confirmed')]

    # Combine the 'Confirmed' columns, avoiding duplicates, in date order
    combined_confirmed_columns = sort_confirmed_columns(confirmed_columns_world + confirmed_columns_us)

    # Keep required columns and add the combined 'Confirmed' columns
    result = merged_data[['Province_State', 'Lat', 'Long_'] + combined_confirmed_columns]

    # Save the result to a new CSV file
    result.to_csv(output_path, index=False)
    print(f"Merged data saved to: {output_path}")


Current working directory: /Users/stephenloucel/Desktop/DataClass/Classwork/Project-3/Stephen/johns_hopkins_data_merge
Merged data saved to: merged_covid_data_2.csv
