In [None]:
!pip install pycountry

In [50]:
!pip install Nominatim

Collecting Nominatim
  Downloading https://files.pythonhosted.org/packages/59/f2/d47726f804208804f7f295e20a9d2ee4ea925fb6462481223464026bcf66/nominatim-0.1.tar.gz
Building wheels for collected packages: Nominatim
  Building wheel for Nominatim (setup.py): started
  Building wheel for Nominatim (setup.py): finished with status 'done'
  Created wheel for Nominatim: filename=nominatim-0.1-cp37-none-any.whl size=2368 sha256=a868d462f656d6d0ac027438661887cd3ce2dd7f39cf9a5f7044b213d8a07928
  Stored in directory: C:\Users\kfrid\AppData\Local\pip\Cache\wheels\d5\66\ed\e7476981dc30210b6b5ce7c25b054e8db35d44fdd2198003d4
Successfully built Nominatim
Installing collected packages: Nominatim
Successfully installed Nominatim-0.1


In [52]:
!pip install geopy

Collecting geopy
  Downloading https://files.pythonhosted.org/packages/53/fc/3d1b47e8e82ea12c25203929efb1b964918a77067a874b2c7631e2ec35ec/geopy-1.21.0-py2.py3-none-any.whl (104kB)
Collecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.21.0


In [10]:
!pip install feather-format

Collecting feather-format
  Downloading https://files.pythonhosted.org/packages/08/55/940b97cc6f19a19f5dab9efef2f68a0ce43a7632f858b272391f0b851a7e/feather-format-0.4.0.tar.gz
Collecting pyarrow>=0.4.0 (from feather-format)
  Downloading https://files.pythonhosted.org/packages/a7/51/504108cc176b6625abc439570ac722dbab831974947dd7c34c8812c709e9/pyarrow-0.16.0-cp37-cp37m-win_amd64.whl (20.4MB)
Building wheels for collected packages: feather-format
  Building wheel for feather-format (setup.py): started
  Building wheel for feather-format (setup.py): finished with status 'done'
  Created wheel for feather-format: filename=feather_format-0.4.0-cp37-none-any.whl size=3012 sha256=cc76d8cd883831048d06b5d601476aaa1f494d7e8e7c9551a281949636b7b78b
  Stored in directory: C:\Users\kfrid\AppData\Local\pip\Cache\wheels\85\7d\12\2dfa5c0195f921ac935f5e8f27deada74972edc0ae9988a9c1
Successfully built feather-format
Installing collected packages: pyarrow, feather-format
Successfully installed feather-forma

In [1]:
# convert_updates.py
# This program reads in the csv updates for Coronavirus and converts the data into a dataframe
# with info about confirmed cases and overall deaths by country. It stores the
# dataframe into a feather file so that it can be retrieved later, and it adds
# the name of the feather file to a list so that all updates can be easily 
# converted into one main dataframe.
#
# Author: Kristen Friday
# Date: April 21, 2020


import pandas as pd
import plotly.express as px
import pycountry
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import feather

# function that takes in url and returns dataframe from csv
def csv_to_dataframe(url):
    """Load a daily-report CSV and keep only the columns used downstream.

    Parameters
    ----------
    url : str or file-like
        Anything ``pandas.read_csv`` accepts (URL, local path, open buffer).

    Returns
    -------
    pandas.DataFrame
        The columns ``Country_Region``, ``Lat``, ``Long_``, ``Confirmed`` and
        ``Deaths``, in that order. Malformed CSV lines are dropped.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from the CSV.
    """
    import inspect

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`. Use whichever keyword the installed pandas
    # understands; 'warn' keeps the old "drop the line but report it" behaviour.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        bad_line_policy = {'on_bad_lines': 'warn'}
    else:
        bad_line_policy = {'error_bad_lines': False}

    # put csv info into dataframe
    df = pd.read_csv(url, **bad_line_policy)
    df = df.loc[:, ['Country_Region', 'Lat', 'Long_', 'Confirmed', 'Deaths']]
    return df


# function that loops through country column in dataframe
# creates new dataframe with only one row for each country
def modify_countries(df, date):
    """Collapse a per-region report into one row per country.

    Rows are visited in order. The first row seen for a country supplies its
    latitude and longitude; ``Confirmed`` and ``Deaths`` are summed over all
    rows of that country. Countries appear in the result in the order in which
    they first occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Country_Region``, ``Lat``, ``Long_``,
        ``Confirmed`` and ``Deaths``. Any index is accepted; rows are read
        positionally.
    date : object
        Value repeated in the ``Date`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``Country_Region``, ``Latitude``, ``Longitude``,
        ``Confirmed`` and ``Deaths`` with a fresh default index.
    """
    # country -> [latitude, longitude, confirmed total, deaths total].
    # A dict gives constant-time membership tests and remembers insertion
    # order, so no separate list of country names is needed.
    totals = {}

    # Iterate over the column values directly instead of `df.at[i, ...]`:
    # label-based lookups with 0..n-1 only work on a default RangeIndex.
    rows = zip(df['Country_Region'], df['Lat'], df['Long_'],
               df['Confirmed'], df['Deaths'])
    for country, lat, lon, confirmed, deaths in rows:
        if country in totals:
            # country already documented: add to its running sums
            entry = totals[country]
            entry[2] += confirmed
            entry[3] += deaths
        else:
            # first occurrence: its coordinates represent the whole country
            totals[country] = [lat, lon, confirmed, deaths]

    entries = list(totals.values())
    # create new dataframe with only one row per country
    update_new_df = pd.DataFrame({
        'Date': [date] * len(entries),
        'Country_Region': list(totals.keys()),
        'Latitude': [entry[0] for entry in entries],
        'Longitude': [entry[1] for entry in entries],
        'Confirmed': [entry[2] for entry in entries],
        'Deaths': [entry[3] for entry in entries],
    })
    return update_new_df


# function reverse geocodes the longitudes and latitudes in order to obtain 2 letter iso code
def country_to_alpha2(df):
    """Reverse geocode each row's coordinates into a country name and code.

    For every row the ``Latitude``/``Longitude`` pair is sent to Nominatim.
    The returned country name overwrites ``Country_Region`` and the returned
    country code is stored in a new ``2 Char ISO Codes`` column. Rows whose
    coordinates are missing, whose latitude is 0, or for which the geocoder
    yields no country get ``'Not a country'`` and the placeholder ``'XX'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Country_Region``, ``Latitude`` and ``Longitude``.
        The frame is modified in place. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Country_Region`` rewritten and
        ``2 Char ISO Codes`` added.
    """
    # show user that geocoding is in process
    print('Loading ISO Codes...')

    # The rate-limited geocoder is built once, on first use. Creating a new
    # RateLimiter for every row (as was done before) resets its timer each
    # time, so the one-second delay between requests was never applied.
    reverse = None

    country_names = []
    codes_alpha2 = []

    # read the coordinates positionally so that any index works
    for latitude, longitude in zip(df['Latitude'], df['Longitude']):
        # defaults for rows that cannot be resolved to a country
        exact_country = 'Not a country'
        code_2 = 'XX'

        # determine if the coordinates can point to a real country
        has_coordinates = not (pd.isna(latitude) or pd.isna(longitude) or latitude == 0)
        if has_coordinates:
            if reverse is None:
                # set unique user_agent name to access python package
                locator = Nominatim(user_agent='khf_geocode', timeout=5)
                # limit the rate of geocoding so code doesn't throw error
                reverse = RateLimiter(locator.reverse, min_delay_seconds=1)

            # reverse geocode the latitudes and longitudes to get exact country name
            address = reverse(str(latitude) + ', ' + str(longitude), language='en')

            # The geocoder returns None when it finds nothing (and, with
            # RateLimiter's default settings, presumably also when a request
            # keeps failing), so guard before touching `.raw`.
            details = {}
            if address is not None:
                details = address.raw.get('address', {})

            # only accept the result when both name and code are present
            if 'country' in details and 'country_code' in details:
                exact_country = details['country']
                code_2 = details['country_code']

        country_names.append(exact_country)
        codes_alpha2.append(code_2)

    # Assign plain lists: they are matched to rows by position, whereas a
    # pd.Series would be aligned on index labels and could yield NaN.
    df['Country_Region'] = country_names
    df['2 Char ISO Codes'] = codes_alpha2

    # inform user that process is done
    print('Process Complete')

    return df


# function adds alpha3 iso codes to dataframe
def country_to_alpha3(df):
    """Add 3-letter ISO codes and remove rows that are not real countries.

    Each value of the ``2 Char ISO Codes`` column is looked up in pycountry.
    The matching alpha-3 code is written to a new ``3 Char ISO Codes`` column;
    rows with no match receive ``'XXX'`` and are left out of the returned
    frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``2 Char ISO Codes`` column. The new column is added
        to ``df`` in place. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 2-letter code resolved to a country, with
        their original index labels.
    """
    # alpha3 country codes, one per row
    codes_alpha3 = []
    # True for rows that resolved to a real country, one flag per row
    is_country = []

    for alpha2_code in df['2 Char ISO Codes']:
        country = None
        # 'XX' is the placeholder this file uses for rows without a country,
        # and non-string values (e.g. NaN) cannot be codes, so neither needs
        # a lookup.
        if isinstance(alpha2_code, str) and alpha2_code.upper() != 'XX':
            try:
                country = pycountry.countries.get(alpha_2=alpha2_code.upper())
            except KeyError:
                # NOTE(review): some pycountry releases appear to raise
                # KeyError for unknown codes instead of returning None.
                country = None

        if country is None:
            codes_alpha3.append('XXX')
            is_country.append(False)
        else:
            codes_alpha3.append(country.alpha_3)
            is_country.append(True)

    # Assign a plain list: it is matched to rows by position, whereas a
    # pd.Series would be aligned on index labels.
    df['3 Char ISO Codes'] = codes_alpha3

    # Filter all non-country rows in a single step. Dropping them one at a
    # time by position removes the wrong rows, because every drop shifts the
    # positions of the rows that follow.
    keep_rows = pd.Series(is_country, dtype=bool).to_numpy()
    return df[keep_rows]


# function saves dataframe to a feather file so that the csv doesn't have to be re-processed every time
def save_dataframe(df, file_name):
    """Write ``df`` to ``file_name`` in feather format and report success.

    The index is replaced by a fresh default one before writing (the old
    index is discarded, not kept as a column); ``df`` itself is not modified.
    """
    # presumably the feather writer needs a default index -- hence the reset
    df.reset_index(drop=True).to_feather(file_name)
    print('Dataframe successfully saved')
    
    
# function appends feather file to the file containing names of all updates
def update_file_list(file_name):
    """Record ``file_name`` as a new line at the end of ``Corona_updates.txt``.

    The list file is opened in append mode in the current working directory,
    so it is created on first use and earlier entries are preserved.
    """
    with open('Corona_updates.txt', mode='a') as updates_log:
        entry = file_name + '\n'
        updates_log.write(entry)
         
            
# Interactive driver: prompts for a CSV url, a date and an output file name,
# then runs the full csv -> per-country dataframe -> feather pipeline.
if __name__ == '__main__':
    # step 1: ask where the daily report lives and load the needed columns
    url = input('Enter url of Corona Virus csv file: \n')
    print('')
    
    update_df = csv_to_dataframe(url)
    print(update_df)
    
    # step 2: the date is typed in by the user and stamped on every row
    date = input('Enter the date of the update (XX-XX-XXXX): \n')
    print('')
    # collapse the per-region rows into one row per country
    update_new_df = modify_countries(update_df, date)
    print(update_new_df)
    print('')
    
    # step 3: reverse geocode coordinates to country names and 2 letter codes
    update_new_df = country_to_alpha2(update_new_df)
    print('')
    
    # step 4: add 3 letter codes; rows that are not countries are removed
    update_new_df = country_to_alpha3(update_new_df)
    print(update_new_df)
    print('')
    
    # step 5: persist the result as a feather file chosen by the user
    save_file = input('Enter a feather file name to save dataframe: \n')
    print('')
    save_dataframe(update_new_df, save_file)
    
    # step 6: remember the file name in the running list of updates
    update_file_list(save_file)



Enter url of Corona Virus csv file: 
https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/04-25-2020.csv

     Country_Region        Lat       Long_  Confirmed  Deaths
0                US  34.223334  -82.461707         24       0
1                US  30.295065  -92.414197        130       7
2                US  37.767072  -75.632346        146       3
3                US  43.452658 -116.241552        650      15
4                US  41.330756  -94.471059          1       0
...             ...        ...         ...        ...     ...
3129      Australia -27.469800  153.025100       1026       6
3130      Australia -34.928500  138.600700        438       4
3131      Australia -42.882100  147.327200        207      10
3132      Australia -37.813600  144.963100       1346      16
3133      Australia -31.950500  115.860500        549       8

[3134 rows x 5 columns]
Enter the date of the update (XX-XX-XXXX): 
04-25-2020

          