In [44]:
import pandas as pd

# Tidy the monthly confirmed-case CSV into a long table with one row per
# (Location, month) and the columns Location, Lat, Long, date, cases.

# Step 1: Read the data from the confirmed CSV file
confirmed_file = 'rawData/monthly_confirmed.csv'
df_confirmed = pd.read_csv(confirmed_file)

# Step 2: Replace missing "Province/State" values with the sentinel 'unknown'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update df_confirmed when pandas Copy-on-Write is on.
df_confirmed["Province/State"] = df_confirmed["Province/State"].fillna("unknown")

# Step 3: Melt the data to unpivot the date columns. Every column that is not
# listed in id_vars becomes a (date, cases) pair on its own row.
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
df_confirmed_melted = df_confirmed.melt(id_vars=id_vars, var_name="date", value_name="cases")

# Steps 4-5: Replace the "/" in the two location column names with "_"
df_confirmed_melted = df_confirmed_melted.rename(
    columns={'Country/Region': 'Country_Region', 'Province/State': 'Province_State'}
)

# Step 6: Convert the "date" labels (parsed with '%B/%y', i.e. full month name
# and two-digit year) to 'YYYY-MM' strings, which also sort chronologically.
df_confirmed_melted['date'] = (
    pd.to_datetime(df_confirmed_melted['date'], format='%B/%y').dt.strftime('%Y-%m')
)

# Combining province/state with country/region: rows without a province keep
# the bare country name, all others become "<country>, <province>".
# Vectorised with Series.where instead of a Python-level iterrows() loop.
province = df_confirmed_melted["Province_State"]
country = df_confirmed_melted["Country_Region"]
combined_values = country.where(
    province == "unknown",
    country.astype(str) + ", " + province.astype(str),
)
df_confirmed_melted["Location"] = combined_values

# Step 7: Sort by location and month, then keep only the output columns
confirmed_clean_df = df_confirmed_melted.sort_values(by=["Location", "date"]).reset_index(drop=True)
confirmed_clean_df = confirmed_clean_df[['Location', 'Lat', 'Long', 'date', 'cases']]

# Display the first 100 rows of the final data frame
confirmed_clean_df.head(100)

Unnamed: 0,Location,Lat,Long,date,cases
0,Afghanistan,33.93911,67.709953,2020-01,0.0
1,Afghanistan,33.93911,67.709953,2020-02,30.0
2,Afghanistan,33.93911,67.709953,2020-03,1141.0
3,Afghanistan,33.93911,67.709953,2020-04,26928.0
4,Afghanistan,33.93911,67.709953,2020-05,222720.0
...,...,...,...,...,...
95,Algeria,28.03390,1.659600,2021-08,5788899.0
96,Algeria,28.03390,1.659600,2021-09,6016238.0
97,Algeria,28.03390,1.659600,2021-10,6358229.0
98,Algeria,28.03390,1.659600,2021-11,6249508.0


In [45]:
import pandas as pd
# Tidy the monthly deaths CSV into a long table with one row per
# (Location, month) and the columns Location, Lat, Long, date, cases.

# Step 1: Read the data from the deaths CSV file
deaths_file = 'rawData/monthly_deaths.csv'
df_deaths = pd.read_csv(deaths_file)

# Step 2: Replace missing "Province/State" values with the sentinel 'unknown'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update df_deaths when pandas Copy-on-Write is on.
df_deaths["Province/State"] = df_deaths["Province/State"].fillna("unknown")

# Step 3: Melt the data to unpivot the date columns. Every column that is not
# listed in id_vars becomes a (date, cases) pair on its own row. The value
# column is named "cases" so the output schema matches the other data sets.
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
df_deaths_melted = df_deaths.melt(id_vars=id_vars, var_name="date", value_name="cases")

# Steps 4-5: Replace the "/" in the two location column names with "_"
df_deaths_melted = df_deaths_melted.rename(
    columns={'Country/Region': 'Country_Region', 'Province/State': 'Province_State'}
)

# Step 6: Convert the "date" labels (parsed with '%B/%y', i.e. full month name
# and two-digit year) to 'YYYY-MM' strings, which also sort chronologically.
df_deaths_melted['date'] = (
    pd.to_datetime(df_deaths_melted['date'], format='%B/%y').dt.strftime('%Y-%m')
)

# Combining province/state with country/region: rows without a province keep
# the bare country name, all others become "<country>, <province>".
# Vectorised with Series.where instead of a Python-level iterrows() loop.
province = df_deaths_melted["Province_State"]
country = df_deaths_melted["Country_Region"]
combined_values = country.where(
    province == "unknown",
    country.astype(str) + ", " + province.astype(str),
)
df_deaths_melted["Location"] = combined_values

# Step 7: Sort by location and month, then keep only the output columns
deaths_clean_df = df_deaths_melted.sort_values(by=["Location", "date"]).reset_index(drop=True)
deaths_clean_df = deaths_clean_df[['Location', 'Lat', 'Long', 'date', 'cases']]

# Display the first 100 rows of the final data frame
deaths_clean_df.head(100)

Unnamed: 0,Location,Lat,Long,date,cases
0,Afghanistan,33.93911,67.709953,2020-01,0
1,Afghanistan,33.93911,67.709953,2020-02,0
2,Afghanistan,33.93911,67.709953,2020-03,21
3,Afghanistan,33.93911,67.709953,2020-04,812
4,Afghanistan,33.93911,67.709953,2020-05,4924
...,...,...,...,...,...
95,Algeria,28.03390,1.659600,2021-08,149055
96,Algeria,28.03390,1.659600,2021-09,168391
97,Algeria,28.03390,1.659600,2021-10,181913
98,Algeria,28.03390,1.659600,2021-11,179783


In [46]:
import pandas as pd
# Tidy the monthly recovery CSV into a long table with one row per
# (Location, month) and the columns Location, Lat, Long, date, cases.

# Step 1: Read the data from the recovery CSV file
recovery_file = 'rawData/monthly_recovery.csv'
df_recovery = pd.read_csv(recovery_file)

# Step 2: Replace missing "Province/State" values with the sentinel 'unknown'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update df_recovery when pandas Copy-on-Write is on.
df_recovery["Province/State"] = df_recovery["Province/State"].fillna("unknown")

# Step 3: Melt the data to unpivot the date columns. Every column that is not
# listed in id_vars becomes a (date, cases) pair on its own row. The value
# column is named "cases" so the output schema matches the other data sets.
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
df_recovery_melted = df_recovery.melt(id_vars=id_vars, var_name="date", value_name="cases")

# Steps 4-5: Replace the "/" in the two location column names with "_"
df_recovery_melted = df_recovery_melted.rename(
    columns={'Country/Region': 'Country_Region', 'Province/State': 'Province_State'}
)

# Step 6: Convert the "date" labels (parsed with '%B/%y', i.e. full month name
# and two-digit year) to 'YYYY-MM' strings, which also sort chronologically.
df_recovery_melted['date'] = (
    pd.to_datetime(df_recovery_melted['date'], format='%B/%y').dt.strftime('%Y-%m')
)

# Combining province/state with country/region: rows without a province keep
# the bare country name, all others become "<country>, <province>".
# Vectorised with Series.where instead of a Python-level iterrows() loop.
province = df_recovery_melted["Province_State"]
country = df_recovery_melted["Country_Region"]
combined_values = country.where(
    province == "unknown",
    country.astype(str) + ", " + province.astype(str),
)
df_recovery_melted["Location"] = combined_values

# Step 7: Sort by location and month, then keep only the output columns
recovery_clean_df = df_recovery_melted.sort_values(by=["Location", "date"]).reset_index(drop=True)
recovery_clean_df = recovery_clean_df[['Location', 'Lat', 'Long', 'date', 'cases']]

# Display the first 100 rows of the final data frame
recovery_clean_df.head(100)

Unnamed: 0,Location,Lat,Long,date,cases
0,Afghanistan,33.93911,67.709953,2020-01,0
1,Afghanistan,33.93911,67.709953,2020-02,0
2,Afghanistan,33.93911,67.709953,2020-03,26
3,Afghanistan,33.93911,67.709953,2020-04,2927
4,Afghanistan,33.93911,67.709953,2020-05,24129
...,...,...,...,...,...
95,Antarctica,-71.94990,23.347000,2020-01,0
96,Antarctica,-71.94990,23.347000,2020-02,0
97,Antarctica,-71.94990,23.347000,2020-03,0
98,Antarctica,-71.94990,23.347000,2020-04,0


In [47]:
# Serialise each cleaned frame to a JSON string: a list of row objects
# (orient='records'), pretty-printed with a two-space indent.
confirmed_json, deaths_json, recovery_json = (
    clean_frame.to_json(orient='records', indent=2)
    for clean_frame in (confirmed_clean_df, deaths_clean_df, recovery_clean_df)
)

In [50]:
# Write the three JSON strings to disk: one file per data set holding the raw
# JSON text, plus a combined file that declares each data set as a JavaScript
# `var` so it can be loaded with a plain <script> tag.
import os

# Define the file paths where the JSON data will be saved
confirmed_path = 'JSON Data/confirmed_data.js'
deaths_path = 'JSON Data/deaths_data.js'
recovery_path = 'JSON Data/recovery_data.js'
combined_path = 'JSON Data/combined_data.js'

# Create the output folder(s) first; open(..., 'w') raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout).
for out_path in (confirmed_path, deaths_path, recovery_path, combined_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Write the JSON data to the files. The encoding is given explicitly so the
# output does not depend on the platform's default text encoding.
with open(confirmed_path, 'w', encoding='utf-8') as js_file:
    js_file.write(confirmed_json)

with open(deaths_path, 'w', encoding='utf-8') as js_file:
    js_file.write(deaths_json)

with open(recovery_path, 'w', encoding='utf-8') as js_file:
    js_file.write(recovery_json)

# Combined file: three JavaScript variable declarations, one per data set
with open(combined_path, 'w', encoding='utf-8') as js_file:
    js_file.write(f"var confirmed_json = {confirmed_json}\n")
    js_file.write(f"var deaths_json = {deaths_json}\n")
    js_file.write(f"var recovery_json = {recovery_json}\n")