Legacy functions for RDBS interaction

In [2]:
import pymysql
# Connection settings for the MySQL instance used by the handler functions below.
# NOTE(review): the host, user name and password are hard-coded in this cell;
# consider loading them from environment variables or a secrets manager so they
# are not stored (and shared) together with the notebook.
endpoint = 'ta10projectinstance1.c0m59tujlbqm.ap-southeast-2.rds.amazonaws.com'
username = 'admin'
password = 'fit5120ta10'
database_name = 'fit5120_i3_schema'
# NOTE(review): client_flag is assigned here but is not passed to pymysql.connect below.
client_flag = 'CLIENT.MULTI_STATEMENTS'

# Module-level connection object referenced by every handler function and by query().
connection = pymysql.connect(host = endpoint, user = username, passwd = password, db = database_name)

# Function used to clear all entries in a single table, assuming no indexes or foreign keys are present
def truncate_handler(table):
    """Remove every row from ``table`` by issuing a ``TRUNCATE`` statement.

    Table names cannot be sent as bound parameters, so the name is validated
    and back-tick quoted before it is placed into the statement.

    Args:
        table: table name, optionally schema-qualified (``schema.table``) and
            optionally already wrapped in back-ticks.

    Returns:
        The string ``'table <table> has been truncated'``.

    Raises:
        ValueError: if ``table`` contains anything other than letters, digits,
            ``_`` or ``$`` (after removing back-ticks), or has more than one dot.
    """
    import re

    # Accept `name`, name, schema.name and `schema`.`name`; reject everything else
    # so arbitrary SQL can never be smuggled in through the table argument.
    parts = [part.strip().strip('`') for part in str(table).split('.')]
    if len(parts) > 2 or not all(re.fullmatch(r'[0-9A-Za-z_$]+', part) for part in parts):
        raise ValueError('invalid table name: {!r}'.format(table))
    quoted_name = '.'.join('`{}`'.format(part) for part in parts)

    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute("TRUNCATE {}".format(quoted_name))
    connection.commit()
    return 'table {} has been truncated'.format(table)

def country_region_handler(values):
    """Insert one row into the ``country_region`` table and commit it.

    Args:
        values: sequence ``(country_region_id, country_code, region_name)``;
            the driver binds it to the statement placeholders.

    Returns:
        The string ``'import successful <values>'``.
    """
    statement = """INSERT INTO country_region (country_region_id, country_code, region_name) VALUES (%s, %s, %s)"""
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(statement, values)
    connection.commit()
    return 'import successful {}'.format(values)

def country_handler(values):
    """Insert one row into the ``country`` table and commit it.

    Args:
        values: sequence ``(country_name, country_code)``; the driver binds it
            to the statement placeholders.

    Returns:
        The string ``'import successful <values>'``.
    """
    statement = """INSERT INTO country (country_name, country_code) VALUES (%s, %s)"""
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(statement, values)
    connection.commit()
    return 'import successful {}'.format(values)

def symptom_handler(values):
    """Insert one row into the ``symptom`` table and commit it.

    Args:
        values: sequence ``(symp_id, symp_name)``; the driver binds it to the
            statement placeholders.

    Returns:
        The string ``'import successful <values>'``.
    """
    statement = """INSERT INTO symptom (symp_id, symp_name) VALUES (%s, %s)"""
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(statement, values)
    connection.commit()
    return 'import successful {}'.format(values)

def case_details_handler(values):
    """Insert one row into the ``case_details`` table and commit it.

    Args:
        values: sequence ``(case_id, symp_id)``; the driver binds it to the
            statement placeholders.

    Returns:
        The string ``'import successful <values>'``.
    """
    statement = """INSERT INTO case_details (case_id, symp_id) VALUES (%s, %s)"""
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(statement, values)
    connection.commit()
    return 'import successful {}'.format(values)

def case_handler(values):
    """Insert one row into the ``case`` table and commit it.

    Args:
        values: sequence ``(case_id, case_status, case_date, case_gender,
            country_region_id, case_death_date)``; the driver binds it to the
            statement placeholders.

    Returns:
        The string ``'import successful <values>'``.
    """
    statement = """INSERT INTO `case` (case_id, case_status, case_date, case_gender, country_region_id, case_death_date) VALUES (%s, %s, %s, %s, %s, %s)"""
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(statement, values)
    connection.commit()
    return 'import successful {}'.format(values)

def prediction_handler(values):
    """Insert one row into the ``predictions`` table and commit it.

    Args:
        values: sequence ``(country_region_id, pred_date, pred_count)``; the
            driver binds it to the statement placeholders.

    Returns:
        The string ``'predictions table has been updated'``.
    """
    statement = """INSERT INTO predictions (country_region_id, pred_date, pred_count) VALUES (%s, %s, %s)"""
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(statement, values)
    connection.commit()
    return 'predictions table has been updated'

def query(query, params=None):
    """Run an arbitrary SQL statement on the shared connection and return its rows.

    Args:
        query: SQL text to execute.
        params: optional sequence or mapping of values bound to ``%s`` /
            ``%(name)s`` placeholders by the driver. When left as ``None`` the
            statement is sent as written, so literal ``%`` characters (for
            example in ``LIKE 'OWID%'``) need no escaping.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement (an empty
        sequence for statements that produce no result set).
    """
    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        rows = cursor.fetchall()
    # Commit so that DELETE/UPDATE statements issued through this helper are
    # persisted, and so the next read starts from a fresh transaction snapshot.
    connection.commit()
    return rows

Functions for data wrangling and extraction. These MUST BE UPLOADED TO AWS LAMBDA.

In [3]:
import pandas as pd
import datetime as dt
# Function used to load the new data source
def load_data(source='https://raw.githubusercontent.com/owid/monkeypox/main/owid-monkeypox-data.csv'):
    """Load the OWID monkeypox CSV and prepare it for the case import.

    Args:
        source: anything ``pandas.read_csv`` accepts (URL, path or file-like
            object). Defaults to the OWID monkeypox data set on GitHub.

    Returns:
        DataFrame with the ``date`` column converted to ``datetime.date``
        objects and the rows whose ``iso_code`` is ``'OWID_WRL'`` (the global
        aggregate) removed. The original index labels are kept.
    """
    df = pd.read_csv(source)
    # Converts the date column to plain datetime.date values so it can be compared with dt.date
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date
    # Drops the rows related to the global count
    is_country_row = df['iso_code'] != 'OWID_WRL'
    return df.loc[is_country_row]

# Function used to check if a death has occurred, to populate the case_death_date column
def death_date_check(row):
    """Return the date to store as ``case_death_date`` for one exploded case row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing
            ``'new_deaths'`` and ``'date'``.

    Returns:
        ``row['date']`` when ``new_deaths`` is a positive number, otherwise
        ``None``. Missing (NaN/None) and non-positive counts are treated as
        "no death" so they never produce a death date.
    """
    deaths = row['new_deaths']
    if pd.isna(deaths) or deaths <= 0:
        return None
    return row['date']

# Builds the country_region_id for a row; the source data is country level only
def country_region_id_gen(row):
    """Return the row's ``iso_code`` with the suffix ``"_0"`` appended."""
    return row['iso_code'] + "_0"

# Builds one case_id per row of the frame, continuing after the OWID ids already stored
def case_id_gen(df):
    """Return a list of ``'OWID_<n>'`` identifiers, one for each row of ``df``.

    Numbering starts at the count of rows in the ``case`` table whose
    ``case_id`` matches ``'OWID%'`` (fetched through ``query``) and increases
    by one per row.
    """
    stored_count = query("SELECT count(case_id) FROM `case` WHERE case_id LIKE 'OWID%'")[0][0]
    row_total = df.shape[0]
    identifiers = []
    for offset in range(row_total):
        identifiers.append('OWID_' + str(stored_count + offset))
    return identifiers

# Adds new rows based on the number of cases which have occurred on a given day
def explode_rows(df):
    """Expand daily aggregate rows into one row per individual case.

    Args:
        df: DataFrame with the columns ``location``, ``iso_code``, ``date``,
            ``new_deaths`` and ``new_cases``.

    Returns:
        DataFrame with the same five columns (in that order) where every row
        stands for a single case (``new_cases == 1``). A source row is repeated
        ``int(new_cases)`` times; rows with ``new_cases`` of 1 or less (or NaN)
        yield exactly one output row. ``new_deaths`` is 1 for the first
        ``int(new_deaths)`` cases of a source row and 0 for the rest, so the
        deaths of a day are spread over at most that many cases instead of
        being copied onto every case. A missing death count is treated as 0.
    """
    columns = ['location', 'iso_code', 'date', 'new_deaths', 'new_cases']
    records = []
    source_rows = zip(df['location'], df['iso_code'], df['date'], df['new_deaths'], df['new_cases'])
    for place, iso, case_date, new_deaths, new_cases in source_rows:
        case_count = int(new_cases) if new_cases > 1 else 1
        death_count = 0 if pd.isna(new_deaths) else int(new_deaths)
        for position in range(case_count):
            # Flag a death on the first `death_count` cases only.
            died = 1 if position < death_count else 0
            records.append((place, iso, case_date, died, 1))
    # Built once from the collected records; also yields the right columns when empty.
    return pd.DataFrame(records, columns=columns)



# Function used to extract the latest cases from the new dataset which have not been inserted into the rdbs yet
def first_push_cases(df, last_entry):
    """Build the rows for the ``case`` table from all data after ``last_entry``.

    Args:
        df: source DataFrame with at least the columns ``location``,
            ``iso_code``, ``date``, ``new_deaths`` and ``new_cases``.
        last_entry: only rows whose ``date`` is strictly greater than this
            value are used.

    Returns:
        DataFrame with the columns ``case_id``, ``case_status``, ``case_date``,
        ``case_gender``, ``country_region_id`` and ``case_death_date``; one row
        per individual case. An empty frame with those columns is returned when
        no row has new cases after ``last_entry``.
    """
    case_columns = ['case_id', 'case_status', 'case_date', 'case_gender', 'country_region_id', 'case_death_date']
    # Selects all cases that have occurred from a given date onwards
    df = df.loc[df['date'] > last_entry]
    # Drops all columns which are not required to reduce computational loading
    df = df[['location', 'iso_code', 'date', 'new_deaths', 'new_cases']]
    # Drops all rows which have no new cases
    df = df.loc[df['new_cases'] > 0]
    # Nothing to import: row-wise apply() on an empty frame returns a DataFrame,
    # which cannot be assigned to a single column, so stop here.
    if df.empty:
        return pd.DataFrame(columns=case_columns)
    # Generates new rows for each new case
    df = explode_rows(df)
    # Creates a new column for case_death_date
    df['case_death_date'] = df.apply(death_date_check, axis = 1)
    # Creates a new column for case_gender (not available in the source data)
    df['case_gender'] = None
    # Creates the country_region_id
    df['country_region_id'] = df.apply(country_region_id_gen, axis = 1)
    # Sorts by date then country_region_id (both descending) before the case_id is assigned
    df = df.sort_values(['date', 'country_region_id'], ascending=False)
    df['case_id'] = case_id_gen(df)
    # Populates the status column
    df['case_status'] = 'confirmed'
    # Renames all columns to the appropriate column names for importing
    df = df.rename({'date': 'case_date'}, axis = 1)
    # Selects the columns of the case table, in insert order
    return df[case_columns]

# Works out, per country, how many cases from the data source are still missing in the rdbs
def new_cases():
    """Compare the case totals of the data source with those stored in the rdbs.

    Returns:
        A tuple ``(mpx_data, new_case)``: ``mpx_data`` is the loaded data
        restricted to dates after 2022-09-21, and ``new_case`` maps an
        ``iso_code`` to the number of cases (as ``int``) that the source reports
        beyond the count stored for that country. Countries without stored rows
        appear with their full non-zero total; countries whose stored count is
        not exceeded are left out.
    """
    # Number of OWID cases already stored, keyed by country code
    stored_counts = dict(query("""
    SELECT c.country_code, count(*) FROM
    `country_region` AS c JOIN `case` AS c1 ON c.country_region_id = c1.country_region_id
    WHERE c1.case_id LIKE 'OWID%'
    GROUP BY c.country_code"""))
    source = load_data()
    recent = source.loc[source['date'] > dt.date(2022, 9, 21)]
    # Total of new_cases per iso_code in the recent data
    reported = recent[['iso_code', 'new_cases']].groupby('iso_code').agg('sum').to_dict()['new_cases']
    missing = {}
    for iso, total in reported.items():
        if iso in stored_counts:
            if total > stored_counts[iso]:
                missing[iso] = int(total - stored_counts[iso])
        elif total != 0:
            missing[iso] = int(total)
    return recent, missing

# Function used to get all new rows to be added to the rdbs
def update_case(df, update_entries):
    """Build the ``case`` table rows for the cases that are missing in the rdbs.

    For every country in ``update_entries`` the most recent rows of ``df`` with
    new cases are consumed (newest date first) until the requested number of
    cases has been collected.

    Args:
        df: source DataFrame with at least the columns ``location``,
            ``iso_code``, ``date``, ``new_deaths`` and ``new_cases``.
        update_entries: dict mapping ``iso_code`` to the number of cases still
            to be added. It is modified in place: each value is reduced by the
            number of cases collected, and stays above 0 only when ``df`` does
            not hold enough cases for that country.

    Returns:
        A tuple ``(cases, update_entries)`` where ``cases`` has the columns
        ``case_id``, ``case_status``, ``case_date``, ``case_gender``,
        ``country_region_id`` and ``case_death_date`` (empty when there is
        nothing to add).
    """
    case_columns = ['case_id', 'case_status', 'case_date', 'case_gender', 'country_region_id', 'case_death_date']
    # Drops all columns which aren't required
    df = df[['location', 'iso_code', 'date', 'new_deaths', 'new_cases']]
    add_df = {'location':[], 'iso_code':[], 'date': [], 'new_deaths': [], 'new_cases':[]}
    for item in update_entries.keys():
        temp_df = df.loc[df['iso_code'] == item]
        temp_df = temp_df.loc[temp_df['new_cases'] > 0]
        temp_df = temp_df.sort_values('date', ascending = False)
        for location, iso, case_date, new_deaths, day_cases in temp_df.itertuples(index=False, name=None):
            # Stop as soon as the requested number of cases has been collected;
            # iterating the rows also ends cleanly when the data runs out first.
            if not update_entries[item] > 0:
                break
            add_df['location'].append(location)
            add_df['iso_code'].append(iso)
            add_df['date'].append(case_date)
            add_df['new_deaths'].append(new_deaths)
            if update_entries[item] < day_cases:
                # Only part of this day's cases is still missing
                add_df['new_cases'].append(update_entries[item])
                update_entries[item] = 0
            else:
                update_entries[item] = update_entries[item] - day_cases
                add_df['new_cases'].append(day_cases)
    update_df = pd.DataFrame(add_df)
    # Removes all cases which have 0
    if not update_df.empty:
        update_df = update_df.loc[update_df['new_cases'] > 0]
    # Nothing to add: row-wise apply() on an empty frame returns a DataFrame,
    # which cannot be assigned to a single column, so return early.
    if update_df.empty:
        return pd.DataFrame(columns=case_columns), update_entries
    # Explodes the rows to make one row equal one case
    df = explode_rows(update_df)
    # Checks for deaths for a given update
    df['case_death_date'] = df.apply(death_date_check, axis = 1)
    # Creates a new column for case_gender (not available in the source data)
    df['case_gender'] = None
    # Creates the country_region_id
    df['country_region_id'] = df.apply(country_region_id_gen, axis = 1)
    # Sorts by date then country_region_id (both descending) before the case_id is assigned
    df = df.sort_values(['date', 'country_region_id'], ascending=False)
    df['case_id'] = case_id_gen(df)
    # Populates the status column
    df['case_status'] = 'confirmed'
    # Renames all columns to the appropriate column names for importing
    df = df.rename({'date': 'case_date'}, axis = 1)
    # Selects the columns of the case table, in insert order
    df = df[case_columns]
    return df, update_entries

In [4]:
#query("SELECT count(*) FROM `case` WHERE case_id LIKE 'OWID_%'")

((0,),)

In [7]:
#query("DELETE FROM `case` WHERE case_id LIKE 'OWID_%'")

FUNCTION FOR INITIAL DATA POPULATION. DOES NOT NEED TO BE UPLOADED.

In [5]:
def main(max_rows=501):
    """Populate the ``case`` table with the cases reported after 2022-09-21.

    Args:
        max_rows: maximum number of rows to insert; ``None`` inserts every row.
            The default of 501 keeps the cut-off this cell has always used
            (it stopped after the row with position 500).

    Each row is inserted through ``case_handler``. ``case_gender`` is sent as
    NULL unless it is a string. ``case_death_date`` is sent as NULL only when
    it is missing, so ``datetime.date`` values produced by the wrangling step
    are stored instead of being discarded.
    """
    # Loads the new data
    mpx_data = load_data()
    # Loads all data from the date onwards where the deprecated data source stopped
    mpx_latest_cases = first_push_cases(mpx_data, dt.date(2022, 9, 21))
    # Uploads the rows of the case table one by one
    for row_number, record in enumerate(mpx_latest_cases.itertuples(index=False, name=None)):
        if max_rows is not None and row_number >= max_rows:
            break
        case_id, case_status, case_date, case_gender, country_region_id, case_death_date = record
        gender = case_gender if isinstance(case_gender, str) else None
        # pd.isna covers None, NaN and NaT while keeping real dates and strings
        death_date = None if pd.isna(case_death_date) else case_death_date
        case_handler((case_id, case_status, case_date, gender, country_region_id, death_date))
main()

row 0 added
row 1 added
row 2 added
row 3 added
row 4 added
row 5 added
row 6 added
row 7 added
row 8 added
row 9 added
row 10 added
row 11 added
row 12 added
row 13 added
row 14 added
row 15 added
row 16 added
row 17 added
row 18 added
row 19 added
row 20 added
row 21 added
row 22 added
row 23 added
row 24 added
row 25 added
row 26 added
row 27 added
row 28 added
row 29 added
row 30 added
row 31 added
row 32 added
row 33 added
row 34 added
row 35 added
row 36 added
row 37 added
row 38 added
row 39 added
row 40 added
row 41 added
row 42 added
row 43 added
row 44 added
row 45 added
row 46 added
row 47 added
row 48 added
row 49 added
row 50 added
row 51 added
row 52 added
row 53 added
row 54 added
row 55 added
row 56 added
row 57 added
row 58 added
row 59 added
row 60 added
row 61 added
row 62 added
row 63 added
row 64 added
row 65 added
row 66 added
row 67 added
row 68 added
row 69 added
row 70 added
row 71 added
row 72 added
row 73 added
row 74 added
row 75 added
row 76 added
row 77 ad

FUNCTIONS TO BE UPLOADED TO AWS LAMBDA FOR UPDATES

In [6]:
# Function used to update the rdbs
def update_main():
    """Insert every case that the data source reports but the rdbs lacks.

    The missing cases are determined with ``new_cases`` and turned into
    ``case`` table rows with ``update_case``; each row is then inserted through
    ``case_handler``. ``case_gender`` is sent as NULL unless it is a string.
    ``case_death_date`` is sent as NULL only when it is missing, so
    ``datetime.date`` values are stored instead of being discarded.

    Returns:
        The string ``'rdbs has been updated!'``.
    """
    latest_data, new_entries = new_cases()
    update_df, updated_entries = update_case(latest_data, new_entries)
    for record in update_df.itertuples(index=False, name=None):
        case_id, case_status, case_date, case_gender, country_region_id, case_death_date = record
        gender = case_gender if isinstance(case_gender, str) else None
        # pd.isna covers None, NaN and NaT while keeping real dates and strings
        death_date = None if pd.isna(case_death_date) else case_death_date
        case_handler((case_id, case_status, case_date, gender, country_region_id, death_date))
    return 'rdbs has been updated!'
update_main()

row 0 added
row 1 added
row 2 added
row 3 added
row 4 added
row 5 added
row 6 added
row 7 added
row 8 added
row 9 added
row 10 added
row 11 added
row 12 added
row 13 added
row 14 added
row 15 added
row 16 added
row 17 added
row 18 added
row 19 added
row 20 added
row 21 added
row 22 added
row 23 added
row 24 added
row 25 added
row 26 added
row 27 added
row 28 added
row 29 added
row 30 added
row 31 added
row 32 added
row 33 added
row 34 added
row 35 added
row 36 added
row 37 added
row 38 added
row 39 added
row 40 added
row 41 added
row 42 added
row 43 added
row 44 added
row 45 added
row 46 added
row 47 added
row 48 added
row 49 added
row 50 added
row 51 added
row 52 added
row 53 added
row 54 added
row 55 added
row 56 added
row 57 added
row 58 added
row 59 added
row 60 added
row 61 added
row 62 added
row 63 added
row 64 added
row 65 added
row 66 added
row 67 added
row 68 added
row 69 added
row 70 added
row 71 added
row 72 added
row 73 added
row 74 added
row 75 added
row 76 added
row 77 ad

'rdbs has been updated!'

In [47]:
update_df

Unnamed: 0,case_id,case_status,case_date,case_gender,country_region_id,case_death_date
3812,OWID_1501,confirmed,2022-10-06,,PRI_0,
2748,OWID_1502,confirmed,2022-10-06,,JPN_0,
2688,OWID_1503,confirmed,2022-10-06,,GTM_0,
2689,OWID_1504,confirmed,2022-10-06,,GTM_0,
2531,OWID_1505,confirmed,2022-10-06,,FRA_0,
...,...,...,...,...,...,...
146,OWID_7018,confirmed,2022-09-22,,AUT_0,
147,OWID_7019,confirmed,2022-09-22,,AUT_0,
134,OWID_7020,confirmed,2022-09-22,,AUS_0,
135,OWID_7021,confirmed,2022-09-22,,AUS_0,


In [None]:

# Loads the new data
mpx_data = load_data()
# Loads all data from the date onwards where the deprecated data source stopped
mpx_latest_cases = first_push_cases(mpx_data, dt.date(2022, 9, 21))
# Finds how many cases per country are not in the rdbs yet
# (new_cases takes no arguments and returns the recent data plus the per-country counts)
latest_data, new_entries = new_cases()
# Obtains the case rows which are not in the rdbs
mpx_update_case, updated_entries = update_case(latest_data, new_entries)

In [134]:
query("DELETE FROM `case` WHERE case_id LIKE 'OWID%'")

()

In [164]:
# Scratch version of the per-country comparison: counts the OWID cases stored
# in the rdbs, sums new_cases per iso_code after 2022-09-21 in the source data,
# and collects the difference for every country where the source has more.
mpx_data = load_data()
# Stored OWID case count per country_code
current_entries = dict(query("""
SELECT c.country_code, count(*) FROM
`country_region` AS c JOIN `case` AS c1 ON c.country_region_id = c1.country_region_id
WHERE c1.case_id LIKE 'OWID%'
GROUP BY c.country_code"""))
mpx_data = mpx_data.loc[mpx_data['date'] > dt.date(2022, 9, 21)]
# Sum of new_cases per iso_code in the filtered source data
temp = mpx_data[['iso_code', 'new_cases']].groupby('iso_code').agg('sum').to_dict()
temp = temp['new_cases']
new_case = dict()
for item in temp.keys():
    # Country already stored: keep only the surplus reported by the source
    if item in current_entries.keys() and temp[item] > current_entries[item]:
        new_case[item] = int(temp[item] - current_entries[item])
    # Country not stored yet: every reported case is new
    elif item not in current_entries.keys() and temp[item] != 0:
        new_case[item] = int(temp[item])
new_case

{'ARG': 131,
 'AUS': 6,
 'AUT': 11,
 'BEL': 18,
 'BIH': 3,
 'BOL': 58,
 'BRA': 1010,
 'CAF': 1,
 'CAN': 37,
 'CHE': 13,
 'CHL': 187,
 'COD': 16,
 'COL': 800,
 'CRI': 2,
 'CUB': 2,
 'CUW': 1,
 'CZE': 2,
 'DEU': 46,
 'DNK': 4,
 'ECU': 49,
 'EGY': 1,
 'ESP': 126,
 'FIN': 7,
 'FRA': 56,
 'GBR': 69,
 'GHA': 19,
 'GRC': 13,
 'GTM': 17,
 'HND': 2,
 'HUN': 1,
 'IRL': 17,
 'ISL': 4,
 'ISR': 4,
 'ITA': 14,
 'JAM': 1,
 'JPN': 1,
 'LVA': 1,
 'MEX': 412,
 'NGA': 123,
 'NOR': 2,
 'NZL': 4,
 'PAN': 3,
 'PER': 496,
 'POL': 22,
 'PRI': 23,
 'PRT': 18,
 'PRY': 1,
 'ROU': 1,
 'SDN': 10,
 'SLV': 5,
 'SVN': 1,
 'SWE': 12,
 'THA': 2,
 'UKR': 2,
 'URY': 2,
 'USA': 1632,
 'VNM': 1}

In [153]:
current_entries

{'BOL': 3,
 'BRA': 118,
 'CAF': 1,
 'CHL': 62,
 'DEU': 9,
 'FRA': 44,
 'GTM': 2,
 'JPN': 1,
 'LBN': 3,
 'MEX': 189,
 'NZL': 2,
 'PRI': 1,
 'TUR': 11,
 'URY': 1,
 'USA': 1051,
 'VEN': 3}

Main function (to be run on a daily basis to keep rdbs up to date)

In [60]:
def main(max_rows=501):
    """Replace the OWID rows of the ``case`` table with freshly generated ones.

    The existing OWID rows are deleted before the new rows are built, because
    the case ids are numbered from the count of OWID rows found in the table;
    building first would start the numbering at the old row count.

    Args:
        max_rows: maximum number of rows to insert; ``None`` inserts every row.
            The default of 501 keeps the cut-off this cell has always used
            (it stopped after the row with position 500).

    Each row is inserted through ``case_handler`` and reported with a
    ``row <n> added`` line. ``case_gender`` is sent as NULL unless it is a
    string. ``case_death_date`` is sent as NULL only when it is missing, so
    ``datetime.date`` values are stored instead of being discarded.
    """
    # Loads the new data
    mpx_data = load_data()
    # Removes all previous entries from OWID before the new case ids are generated
    query("DELETE FROM `case` WHERE case_id LIKE 'OWID%'")
    # Loads all data from the date onwards where the deprecated data source stopped
    mpx_latest_cases = first_push_cases(mpx_data, dt.date(2022, 9, 21))
    # Uploads the rows of the case table one by one
    for row_number, record in enumerate(mpx_latest_cases.itertuples(index=False, name=None)):
        if max_rows is not None and row_number >= max_rows:
            break
        case_id, case_status, case_date, case_gender, country_region_id, case_death_date = record
        gender = case_gender if isinstance(case_gender, str) else None
        # pd.isna covers None, NaN and NaT while keeping real dates and strings
        death_date = None if pd.isna(case_death_date) else case_death_date
        case_handler((case_id, case_status, case_date, gender, country_region_id, death_date))
        print('row {} added'.format(row_number))
main()

TypeError: new_cases() takes 0 positional arguments but 2 were given

In [55]:
main()

TypeError: new_cases() takes 0 positional arguments but 2 were given

In [75]:
# Scratch cell: repeats the filtering and row explosion on the raw data so the
# intermediate result (df1) can be inspected.
mpx_data = load_data()
# keeps only the rows dated after 2022-09-21
df1 = mpx_data.loc[mpx_data['date'] > dt.date(2022, 9, 21)]
# drops all columns which are not required to reduce computational loading
df1 = df1[['location', 'iso_code', 'date', 'new_deaths', 'new_cases']]
# Drops all rows which have no new cases
df1 = df1.loc[df1['new_cases'] > 0]
# Generates new rows for each new case
df1 = explode_rows(df1)

In [84]:
mpx_data

Unnamed: 0,location,iso_code,date,total_cases,total_deaths,new_cases,new_deaths,new_cases_smoothed,new_deaths_smoothed,new_cases_per_million,total_cases_per_million,new_cases_smoothed_per_million,new_deaths_per_million,total_deaths_per_million,new_deaths_smoothed_per_million
0,Andorra,AND,2022-07-25,2.0,0.0,2.0,0.0,0.29,0.0,25.306,25.306,3.669,0.0,0.0,0.0
1,Andorra,AND,2022-07-26,3.0,0.0,1.0,0.0,0.43,0.0,12.653,37.958,5.441,0.0,0.0,0.0
2,Andorra,AND,2022-07-27,3.0,0.0,0.0,0.0,0.43,0.0,0.000,37.958,5.441,0.0,0.0,0.0
3,Andorra,AND,2022-07-28,3.0,0.0,0.0,0.0,0.43,0.0,0.000,37.958,5.441,0.0,0.0,0.0
4,Andorra,AND,2022-07-29,3.0,0.0,0.0,0.0,0.43,0.0,0.000,37.958,5.441,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,Venezuela,VEN,2022-10-02,5.0,0.0,0.0,0.0,0.00,0.0,0.000,0.177,0.000,0.0,0.0,0.0
9318,Venezuela,VEN,2022-10-03,5.0,0.0,0.0,0.0,0.00,0.0,0.000,0.177,0.000,0.0,0.0,0.0
9319,Venezuela,VEN,2022-10-04,5.0,0.0,0.0,0.0,0.00,0.0,0.000,0.177,0.000,0.0,0.0,0.0
9320,Venezuela,VEN,2022-10-05,8.0,0.0,3.0,0.0,0.43,0.0,0.106,0.284,0.015,0.0,0.0,0.0


In [76]:
df1.loc[df1['iso_code'] == 'AUS']

Unnamed: 0,location,iso_code,date,new_deaths,new_cases
131,Australia,AUS,2022-09-22,0.0,1
132,Australia,AUS,2022-09-22,0.0,1
133,Australia,AUS,2022-09-22,0.0,1
134,Australia,AUS,2022-09-27,0.0,1
135,Australia,AUS,2022-10-04,0.0,1
136,Australia,AUS,2022-10-04,0.0,1


In [60]:
temp = mpx_data.loc[mpx_data['location'] == 'Australia']

In [69]:
temp.loc[temp['date'] > dt.date(2022, 9, 22),'new_cases']

344    0.0
345    0.0
346    0.0
347    0.0
348    1.0
349    0.0
350    0.0
351    0.0
352    0.0
353    0.0
354    0.0
355    2.0
Name: new_cases, dtype: float64

In [61]:
temp.new_cases.sum()

138.0

In [42]:
# new_cases() takes no arguments; first_push_cases builds the case rows after a given date
mpx_latest_cases = first_push_cases(mpx_data, dt.date(2022, 9, 22))

In [88]:
# Loads the new data
mpx_data = load_data()
# Loads all data from the date onwards where the deprecated data source stopped
# (new_cases() takes no arguments; first_push_cases builds the case rows after a given date)
mpx_latest_cases = first_push_cases(mpx_data, dt.date(2022, 9, 21))

In [89]:
mpx_latest_cases

Unnamed: 0,case_id,case_status,case_date,case_gender,country_region_id,case_death_date
4085,OWID_7023,confirmed,2022-10-06,,PRI_0,
2787,OWID_7024,confirmed,2022-10-06,,LBN_0,
2788,OWID_7025,confirmed,2022-10-06,,LBN_0,
2789,OWID_7026,confirmed,2022-10-06,,LBN_0,
2785,OWID_7027,confirmed,2022-10-06,,JPN_0,
...,...,...,...,...,...,...
138,OWID_14041,confirmed,2022-09-22,,AUT_0,
139,OWID_14042,confirmed,2022-09-22,,AUT_0,
131,OWID_14043,confirmed,2022-09-22,,AUS_0,
132,OWID_14044,confirmed,2022-09-22,,AUS_0,


In [87]:
mpx_latest_cases

Unnamed: 0,case_id,case_status,case_date,case_gender,country_region_id,case_death_date
4085,OWID_7023,confirmed,2022-10-06,,PRI_0,
2787,OWID_7024,confirmed,2022-10-06,,LBN_0,
2788,OWID_7025,confirmed,2022-10-06,,LBN_0,
2789,OWID_7026,confirmed,2022-10-06,,LBN_0,
2785,OWID_7027,confirmed,2022-10-06,,JPN_0,
...,...,...,...,...,...,...
138,OWID_14041,confirmed,2022-09-22,,AUT_0,
139,OWID_14042,confirmed,2022-09-22,,AUT_0,
131,OWID_14043,confirmed,2022-09-22,,AUS_0,
132,OWID_14044,confirmed,2022-09-22,,AUS_0,


In [44]:
mpx_latest_cases.loc[mpx_latest_cases['country_region_id'] == 'AUS_0']

Unnamed: 0,case_id,case_status,case_date,case_gender,country_region_id,case_death_date
3752,OWID_1070,confirmed,2022-10-06,,PRI_0,
2595,OWID_1071,confirmed,2022-10-06,,LBN_0,
2596,OWID_1072,confirmed,2022-10-06,,LBN_0,
2597,OWID_1073,confirmed,2022-10-06,,LBN_0,
2593,OWID_1074,confirmed,2022-10-06,,JPN_0,
...,...,...,...,...,...,...
56,OWID_7755,confirmed,2022-09-23,,ARG_0,
57,OWID_7756,confirmed,2022-09-23,,ARG_0,
58,OWID_7757,confirmed,2022-09-23,,ARG_0,
59,OWID_7758,confirmed,2022-09-23,,ARG_0,


In [74]:
query("DELETE FROM `case` WHERE case_id LIKE 'OWID%'")
query("SELECT count(*) FROM `case` WHERE country_region_id LIKE 'AUS%'")

((132,),)

In [106]:
current_entries = query("""
SELECT c.country_code, count(*) FROM
`country_region` AS c JOIN `case` AS c1 ON c.country_region_id = c1.country_region_id
WHERE c1.case_id LIKE 'OWID%'
GROUP BY c.country_code""")
dict(current_entries)

{'ARG': 131,
 'AUS': 6,
 'AUT': 11,
 'BEL': 18,
 'BIH': 3,
 'BOL': 61,
 'BRA': 1128,
 'CAF': 2,
 'CHE': 13,
 'CHL': 249,
 'COD': 16,
 'COL': 800,
 'CRI': 2,
 'CUB': 2,
 'CUW': 1,
 'CZE': 2,
 'DEU': 55,
 'DNK': 4,
 'ECU': 49,
 'EGY': 1,
 'ESP': 126,
 'FIN': 7,
 'FRA': 100,
 'GBR': 69,
 'GHA': 19,
 'GRC': 13,
 'GTM': 19,
 'HND': 2,
 'HUN': 1,
 'IRL': 17,
 'ISL': 4,
 'ISR': 4,
 'ITA': 14,
 'JAM': 1,
 'JPN': 2,
 'LBN': 3,
 'LVA': 1,
 'MEX': 601,
 'NGA': 123,
 'NOR': 2,
 'NZL': 6,
 'PAN': 3,
 'PER': 496,
 'POL': 22,
 'PRI': 24,
 'PRT': 18,
 'PRY': 1,
 'ROU': 1,
 'SDN': 10,
 'SLV': 5,
 'SVN': 1,
 'SWE': 12,
 'THA': 2,
 'TUR': 11,
 'UKR': 2,
 'URY': 3,
 'USA': 2683,
 'VEN': 3}

In [81]:
query("SELECT MAX(STR_TO_DATE(case_date, '%Y-%m-%d')) FROM `case` WHERE case_id NOT LIKE 'OWID%'")

((datetime.date(2022, 9, 22),),)