In [2]:
import pandas as pd

# Step 1: Read the confirmed-cases CSV. Every column other than the four
# identifier columns listed in `id_vars` below is treated as a date column.
confirmed_file = 'rawData/time_series_covid19_confirmed_global.csv'
df_confirmed = pd.read_csv(confirmed_file)

# Step 2: Replace missing "Province/State" values with the 'unknown' sentinel.
# The result is assigned back to the column instead of calling
# `fillna(inplace=True)` on the column selection: that chained in-place form
# is deprecated and does not update the parent frame under pandas
# Copy-on-Write, which would leave NaN provinces in the data.
df_confirmed["Province/State"] = df_confirmed["Province/State"].fillna("unknown")

# Step 3: Unpivot the date columns into (date, cases) rows, drop rows that
# have no coordinates, and rename the identifier columns to names without "/".
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
df_confirmed_melted = (
    df_confirmed
    .melt(id_vars=id_vars, var_name="date", value_name="cases")
    .dropna(subset=["Lat", "Long"])
    .rename(columns={
        "Country/Region": "Country_Region",
        "Province/State": "Province_State",
    })
)

# Step 4: Normalise the date strings from 'm/d/yy' to 'YYYY-MM-DD'.
df_confirmed_melted["date"] = (
    pd.to_datetime(df_confirmed_melted["date"], format="%m/%d/%y")
    .dt.strftime("%Y-%m-%d")
)

# Step 5: Build "Location": the bare country when the province is the
# 'unknown' sentinel, otherwise "<country>, <province>". This is done
# column-wise rather than row by row with iterrows().
province = df_confirmed_melted["Province_State"]
country = df_confirmed_melted["Country_Region"]
df_confirmed_melted["Location"] = country.where(
    province == "unknown", country + ", " + province
)

# Step 6: Sort by location and date, then keep only the output columns.
daily_confirmed_clean_df = (
    df_confirmed_melted
    .sort_values(by=["Location", "date"])
    .reset_index(drop=True)
    [["Location", "Lat", "Long", "date", "cases"]]
)

# Display the first rows of the final data frame
daily_confirmed_clean_df.head(100)

Unnamed: 0,Location,Lat,Long,date,cases
0,Afghanistan,33.93911,67.709953,2020-01-22,0
1,Afghanistan,33.93911,67.709953,2020-01-23,0
2,Afghanistan,33.93911,67.709953,2020-01-24,0
3,Afghanistan,33.93911,67.709953,2020-01-25,0
4,Afghanistan,33.93911,67.709953,2020-01-26,0
...,...,...,...,...,...
95,Afghanistan,33.93911,67.709953,2020-04-26,1463
96,Afghanistan,33.93911,67.709953,2020-04-27,1531
97,Afghanistan,33.93911,67.709953,2020-04-28,1703
98,Afghanistan,33.93911,67.709953,2020-04-29,1827


In [3]:
import pandas as pd
# Same cleaning pipeline as the confirmed-cases cell, applied to the deaths CSV.

# Step 1: Read the deaths CSV. Every column other than the four identifier
# columns listed in `id_vars` below is treated as a date column.
deaths_file = 'rawData/time_series_covid19_deaths_global.csv'
df_deaths = pd.read_csv(deaths_file)

# Step 2: Replace missing "Province/State" values with the 'unknown' sentinel.
# The result is assigned back to the column instead of calling
# `fillna(inplace=True)` on the column selection: that chained in-place form
# is deprecated and does not update the parent frame under pandas
# Copy-on-Write, which would leave NaN provinces in the data.
df_deaths["Province/State"] = df_deaths["Province/State"].fillna("unknown")

# Step 3: Unpivot the date columns into (date, cases) rows, drop rows that
# have no coordinates, and rename the identifier columns to names without "/".
# The value column is called "cases" (not "deaths") so that all exported
# frames share the same column layout.
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
df_deaths_melted = (
    df_deaths
    .melt(id_vars=id_vars, var_name="date", value_name="cases")
    .dropna(subset=["Lat", "Long"])
    .rename(columns={
        "Country/Region": "Country_Region",
        "Province/State": "Province_State",
    })
)

# Step 4: Normalise the date strings from 'm/d/yy' to 'YYYY-MM-DD'.
df_deaths_melted["date"] = (
    pd.to_datetime(df_deaths_melted["date"], format="%m/%d/%y")
    .dt.strftime("%Y-%m-%d")
)

# Step 5: Build "Location": the bare country when the province is the
# 'unknown' sentinel, otherwise "<country>, <province>". This is done
# column-wise rather than row by row with iterrows().
province = df_deaths_melted["Province_State"]
country = df_deaths_melted["Country_Region"]
df_deaths_melted["Location"] = country.where(
    province == "unknown", country + ", " + province
)

# Step 6: Sort by location and date, then keep only the output columns.
daily_deaths_clean_df = (
    df_deaths_melted
    .sort_values(by=["Location", "date"])
    .reset_index(drop=True)
    [["Location", "Lat", "Long", "date", "cases"]]
)

# Display the first rows of the final data frame
daily_deaths_clean_df.head(100)

Unnamed: 0,Location,Lat,Long,date,cases
0,Afghanistan,33.93911,67.709953,2020-01-22,0
1,Afghanistan,33.93911,67.709953,2020-01-23,0
2,Afghanistan,33.93911,67.709953,2020-01-24,0
3,Afghanistan,33.93911,67.709953,2020-01-25,0
4,Afghanistan,33.93911,67.709953,2020-01-26,0
...,...,...,...,...,...
95,Afghanistan,33.93911,67.709953,2020-04-26,49
96,Afghanistan,33.93911,67.709953,2020-04-27,50
97,Afghanistan,33.93911,67.709953,2020-04-28,60
98,Afghanistan,33.93911,67.709953,2020-04-29,60


In [4]:
import pandas as pd
# Same cleaning pipeline as the confirmed-cases cell, applied to the recovered CSV.

# Step 1: Read the recovered CSV. Every column other than the four identifier
# columns listed in `id_vars` below is treated as a date column.
recovery_file = 'rawData/time_series_covid19_recovered_global.csv'
df_recovery = pd.read_csv(recovery_file)

# Step 2: Replace missing "Province/State" values with the 'unknown' sentinel.
# The result is assigned back to the column instead of calling
# `fillna(inplace=True)` on the column selection: that chained in-place form
# is deprecated and does not update the parent frame under pandas
# Copy-on-Write, which would leave NaN provinces in the data.
df_recovery["Province/State"] = df_recovery["Province/State"].fillna("unknown")

# Step 3: Unpivot the date columns into (date, cases) rows, drop rows that
# have no coordinates, and rename the identifier columns to names without "/".
# The value column is called "cases" (not "recovery") so that all exported
# frames share the same column layout.
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
df_recovery_melted = (
    df_recovery
    .melt(id_vars=id_vars, var_name="date", value_name="cases")
    .dropna(subset=["Lat", "Long"])
    .rename(columns={
        "Country/Region": "Country_Region",
        "Province/State": "Province_State",
    })
)

# Step 4: Normalise the date strings from 'm/d/yy' to 'YYYY-MM-DD'.
df_recovery_melted["date"] = (
    pd.to_datetime(df_recovery_melted["date"], format="%m/%d/%y")
    .dt.strftime("%Y-%m-%d")
)

# Step 5: Build "Location": the bare country when the province is the
# 'unknown' sentinel, otherwise "<country>, <province>". This is done
# column-wise rather than row by row with iterrows().
province = df_recovery_melted["Province_State"]
country = df_recovery_melted["Country_Region"]
df_recovery_melted["Location"] = country.where(
    province == "unknown", country + ", " + province
)

# Step 6: Sort by location and date, then keep only the output columns.
daily_recovery_clean_df = (
    df_recovery_melted
    .sort_values(by=["Location", "date"])
    .reset_index(drop=True)
    [["Location", "Lat", "Long", "date", "cases"]]
)

# Display the first rows of the final data frame
daily_recovery_clean_df.head(100)

Unnamed: 0,Location,Lat,Long,date,cases
0,Afghanistan,33.93911,67.709953,2020-01-22,0
1,Afghanistan,33.93911,67.709953,2020-01-23,0
2,Afghanistan,33.93911,67.709953,2020-01-24,0
3,Afghanistan,33.93911,67.709953,2020-01-25,0
4,Afghanistan,33.93911,67.709953,2020-01-26,0
...,...,...,...,...,...
95,Afghanistan,33.93911,67.709953,2020-04-26,207
96,Afghanistan,33.93911,67.709953,2020-04-27,220
97,Afghanistan,33.93911,67.709953,2020-04-28,228
98,Afghanistan,33.93911,67.709953,2020-04-29,252


In [5]:
# Serialise each cleaned frame to a JSON string: one object per row
# (orient='records'), pretty-printed with a two-space indent.
confirmed_json, deaths_json, recovery_json = (
    frame.to_json(orient='records', indent=2)
    for frame in (
        daily_confirmed_clean_df,
        daily_deaths_clean_df,
        daily_recovery_clean_df,
    )
)

In [7]:
# Destination files for the serialised JSON (written with a .js extension)
confirmed_path = '../database/jsonData/daily_confirmed_data.js'
deaths_path = '../database/jsonData/daily_deaths_data.js'
recovery_path = '../database/jsonData/daily_recovery_data.js'

# Write each JSON string to its destination, overwriting any existing file
for target, payload in (
    (confirmed_path, confirmed_json),
    (deaths_path, deaths_json),
    (recovery_path, recovery_json),
):
    with open(target, 'w') as js_file:
        js_file.write(payload)