In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import lightgbm as lgb
from tqdm import tqdm_notebook
import warnings
warnings.simplefilter("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the raw COVID-19 records from a CSV expected in the working directory.
# Later cells rely on its ObservationDate, Country/Region and Confirmed columns.
covid = pd.read_csv("covid_19_data.csv")

In [3]:
# Parse ObservationDate using an explicit month/day/4-digit-year format so the
# date comparisons in later cells operate on Timestamps rather than strings.
covid['ObservationDate'] = pd.to_datetime(covid['ObservationDate'], format='%m/%d/%Y')

In [4]:
# Keep a single row per (date, country).
# Rows without confirmed cases are removed *before* de-duplicating: otherwise
# the first-listed province of a country could report 0 cases, be kept as that
# day's representative row, and later be discarded by a `Confirmed > 0` filter,
# hiding a country-day that did have confirmed cases in another province.
covid2 = (
    covid[covid['Confirmed'] > 0]
    .drop_duplicates(['ObservationDate', 'Country/Region'])
)


#.drop_duplicates(subset= ['Country/Region'], keep='first')

In [5]:
# Retain only the rows that report at least one confirmed case.
covid2 = covid2.loc[covid2['Confirmed'] > 0]

In [6]:
covid2['Country/Region'].value_counts()

Mainland China          33
Japan                   33
US                      33
Taiwan                  33
South Korea             33
Thailand                33
Macau                   33
Hong Kong               32
Vietnam                 32
Singapore               32
France                  31
Nepal                   30
Australia               30
Malaysia                30
Canada                  29
Sri Lanka               28
Cambodia                28
Germany                 27
Finland                 26
United Arab Emirates    26
Philippines             25
India                   25
Sweden                  24
Russia                  24
Italy                   24
UK                      24
Spain                   23
Belgium                 20
Others                  17
Egypt                   10
Iran                     5
Lebanon                  3
Israel                   3
Ivory Coast              1
Name: Country/Region, dtype: int64

In [7]:
# First remaining row of every country (i.e. its earliest row in covid2),
# without the "Others" pseudo-country, re-indexed from 0.
covid3 = (
    covid2
    .drop_duplicates("Country/Region")
    .loc[lambda frame: frame["Country/Region"] != "Others"]
    .reset_index(drop=True)
)
covid3

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,21,2020-01-22,Macau,Macau,1/22/2020 17:00,1.0,0.0,0.0
2,29,2020-01-22,Taiwan,Taiwan,1/22/2020 17:00,1.0,0.0,0.0
3,32,2020-01-22,Washington,US,1/22/2020 17:00,1.0,0.0,0.0
4,36,2020-01-22,,Japan,1/22/2020 17:00,2.0,0.0,0.0
5,37,2020-01-22,,Thailand,1/22/2020 17:00,2.0,0.0,0.0
6,38,2020-01-22,,South Korea,1/22/2020 17:00,1.0,0.0,0.0
7,51,2020-01-23,Hong Kong,Hong Kong,1/23/20 17:00,2.0,0.0,0.0
8,77,2020-01-23,,Singapore,1/23/20 17:00,1.0,0.0,0.0
9,80,2020-01-23,,Vietnam,1/23/20 17:00,2.0,0.0,0.0


In [8]:
# Reference tables, both read from CSV files in the working directory.
# `countries` holds per-country metadata; `countries_loc` holds one coordinate
# pair per country name, with the file's first column used as the index.
countries = pd.read_csv("countries and continents.csv")
countries_loc = pd.read_csv("Countries Longitude and Latitude.csv", index_col=0)


In [9]:
countries.head()

Unnamed: 0,name,official_name_en,official_name_fr,ISO3166-1-Alpha-2,ISO3166-1-Alpha-3,M49,ITU,MARC,WMO,DS,...,ISO4217-currency_minor_unit,ISO4217-currency_name,ISO4217-currency_numeric_code,is_independent,Capital,Continent,TLD,Languages,Geoname ID,EDGAR
0,,Channel Islands,Îles Anglo-Normandes,,,830,,,,,...,,,,,,,,,,
1,,Sark,Sercq,,,680,,,,,...,,,,,,,,,,
2,Afghanistan,Afghanistan,Afghanistan,AF,AFG,4,AFG,af,AF,AFG,...,2.0,Afghani,971.0,Yes,Kabul,AS,.af,"fa-AF,ps,uz-AF,tk",1149361.0,B2
3,Albania,Albania,Albanie,AL,ALB,8,ALB,aa,AB,AL,...,2.0,Lek,8.0,Yes,Tirana,EU,.al,"sq,el",783754.0,B3
4,Algeria,Algeria,Algérie,DZ,DZA,12,ALG,ae,AL,DZ,...,2.0,Algerian Dinar,12.0,Yes,Algiers,AF,.dz,ar-DZ,2589581.0,B4


In [10]:
countries_loc.head()

Unnamed: 0,longitude,latitude,name
0,33.791638,-84.389488,
1,33.791638,-84.389488,
2,33.93911,67.709953,Afghanistan
3,41.153332,20.168331,Albania
4,28.033886,1.659626,Algeria


In [11]:
# Give the name column the same label as in the COVID table, then discard
# every row that has a missing value.
countries_loc2 = (
    countries_loc
    .rename(columns={"name": "Country/Region"})
    .dropna()
)

In [12]:
covid3.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,21,2020-01-22,Macau,Macau,1/22/2020 17:00,1.0,0.0,0.0
2,29,2020-01-22,Taiwan,Taiwan,1/22/2020 17:00,1.0,0.0,0.0
3,32,2020-01-22,Washington,US,1/22/2020 17:00,1.0,0.0,0.0
4,36,2020-01-22,,Japan,1/22/2020 17:00,2.0,0.0,0.0


In [13]:
# Look up how the coordinates table spells names containing "Iv".
# `na=False` treats missing names as non-matches, yielding a boolean mask directly.
countries_loc2[countries_loc2['Country/Region'].str.contains("Iv", na=False)]

Unnamed: 0,longitude,latitude,Country/Region
61,7.539989,-5.54708,Côte d’Ivoire


In [14]:
# Rename countries in the coordinates table to the spelling used by the COVID
# data, so that both tables can be matched on 'Country/Region'.
name_fixes = {
    'China': 'Mainland China',
    'Côte d’Ivoire': 'Ivory Coast',
}
for old_name, new_name in name_fixes.items():
    countries_loc2.loc[countries_loc2['Country/Region'] == old_name, 'Country/Region'] = new_name

In [15]:
countries_loc2

Unnamed: 0,longitude,latitude,Country/Region
2,33.939110,67.709953,Afghanistan
3,41.153332,20.168331,Albania
4,28.033886,1.659626,Algeria
5,-14.270972,-170.132217,American Samoa
6,42.506285,1.521801,Andorra
...,...,...,...
246,24.215527,-12.885834,Western Sahara
247,15.552727,48.516388,Yemen
248,-13.133897,27.849332,Zambia
249,-19.015438,29.154857,Zimbabwe


In [16]:
# Attach coordinates to each country's first-confirmed row. With a left join,
# countries absent from the coordinates table are kept with NaN coordinates.
covid4 = covid3.merge(countries_loc2, on='Country/Region', how='left')
covid4

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,longitude,latitude
0,1,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,35.86166,104.195397
1,21,2020-01-22,Macau,Macau,1/22/2020 17:00,1.0,0.0,0.0,22.198745,113.543873
2,29,2020-01-22,Taiwan,Taiwan,1/22/2020 17:00,1.0,0.0,0.0,23.69781,120.960515
3,32,2020-01-22,Washington,US,1/22/2020 17:00,1.0,0.0,0.0,40.760537,-73.97889
4,36,2020-01-22,,Japan,1/22/2020 17:00,2.0,0.0,0.0,36.204824,138.252924
5,37,2020-01-22,,Thailand,1/22/2020 17:00,2.0,0.0,0.0,15.870032,100.992541
6,38,2020-01-22,,South Korea,1/22/2020 17:00,1.0,0.0,0.0,35.907757,127.766922
7,51,2020-01-23,Hong Kong,Hong Kong,1/23/20 17:00,2.0,0.0,0.0,22.396428,114.109497
8,77,2020-01-23,,Singapore,1/23/20 17:00,1.0,0.0,0.0,1.352083,103.819836
9,80,2020-01-23,,Vietnam,1/23/20 17:00,2.0,0.0,0.0,14.058324,108.277199


In [17]:
?great_circle

Object `great_circle` not found.


In [19]:
from geopy.distance import great_circle
def dist_to_confirmed(lon, lat, confirmed_loc):
    """Distances in kilometres from one point to every location in ``confirmed_loc``.

    Parameters
    ----------
    lon, lat :
        Coordinates of the origin, handed to ``great_circle`` in this order.
    confirmed_loc :
        Table whose ``.values`` yields rows of exactly three items: the two
        coordinates (same order as ``lon``, ``lat``) and the country name.

    Returns
    -------
    numpy.ndarray
        One distance per row of ``confirmed_loc``, in row order; an empty
        array when there are no rows.

    NOTE(review): geopy points are (latitude, longitude). The coordinates
    table displayed in this notebook appears to hold latitudes in its
    ``longitude`` column (e.g. Afghanistan: 33.94), so the (lon, lat) order
    used here seems to build valid points -- confirm against the source CSV.
    """
    origin = (lon, lat)
    return np.array([
        great_circle(origin, (other_lon, other_lat)).kilometers
        for other_lon, other_lat, _country in confirmed_loc.values
    ])

def calc_confirmed_radius(lon, lat, country, confirmed_loc):
    """Summarise how close one country is to the already-confirmed countries.

    Parameters
    ----------
    lon, lat :
        Coordinates of the country, forwarded to ``dist_to_confirmed``.
    country :
        Country name, stored unchanged under the ``"Country/Region"`` key.
    confirmed_loc :
        Coordinates table of the confirmed countries, forwarded to
        ``dist_to_confirmed``.

    Returns
    -------
    dict
        ``"Country/Region"``, the mean distance (``avg_dist_to_confirmed``)
        and the number of confirmed locations strictly closer than 1000,
        2000 and 3000 km (``confirmed_1k``, ``confirmed_2k``, ``confirmed_3k``).
    """
    dists = dist_to_confirmed(lon, lat, confirmed_loc)
    return {
        "Country/Region": country,
        'avg_dist_to_confirmed': np.mean(dists),
        'confirmed_1k': (dists < 1000).sum(),
        'confirmed_2k': (dists < 2000).sum(),
        'confirmed_3k': (dists < 3000).sum(),
    }
    
    

#a = dist_to_confirmed(33.939110, 67.709953, confirmed_loc)

In [20]:
countries_loc2.head()

Unnamed: 0,longitude,latitude,Country/Region
2,33.93911,67.709953,Afghanistan
3,41.153332,20.168331,Albania
4,28.033886,1.659626,Algeria
5,-14.270972,-170.132217,American Samoa
6,42.506285,1.521801,Andorra


In [21]:
# Build one labelled table per observation date.
# For each date, every country without a confirmed case so far becomes a row
# with proximity features to the confirmed countries and a label `y` saying
# whether its first case is reported on the following day.
dates = pd.date_range("2020-01-22", "2020-02-22")

all_data = dict()
for date in dates:
    # Countries whose first confirmed row is dated on or before `date`.
    confirmed_countries = covid4[covid4['ObservationDate'] <= date]['Country/Region']

    confirmed_loc = countries_loc2[countries_loc2['Country/Region'].isin(confirmed_countries)]

    # Candidate rows. `.copy()` gives an independent frame, so the column
    # assignments below never write into (or warn about) a slice of
    # `countries_loc2`.
    data = countries_loc2[~countries_loc2['Country/Region'].isin(confirmed_countries)].copy()
    next_confirmed = covid4[covid4['ObservationDate'] == date + pd.Timedelta(1, 'D')]['Country/Region']

    # Nothing new on the following day: there is no positive label to learn from.
    if next_confirmed.shape[0] == 0:
        continue

    data['y'] = 0
    data.loc[data['Country/Region'].isin(next_confirmed), 'y'] = 1
    data['date'] = date

    feature_rows = [
        calc_confirmed_radius(lon, lat, country, confirmed_loc)
        for lon, lat, country in data[['longitude', 'latitude', 'Country/Region']].values
    ]

    features = pd.DataFrame(feature_rows)
    # Join on the country name explicitly instead of on whatever columns the
    # two frames happen to share.
    data = data.merge(features, on='Country/Region', how='left')

    all_data[date] = data

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

features = ['avg_dist_to_confirmed', 'confirmed_1k', 'confirmed_2k', 'confirmed_3k']
sorted_date = sorted(all_data.keys())

# How many top-ranked countries each strategy may flag per evaluation day.
TOP_N = 20

# Walk-forward evaluation: fit on the table of one date, score the table of the
# next available date, and count how many of the countries that actually got
# their first confirmed case appear in each strategy's top-TOP_N list.
#   Baseline : smallest average distance to the confirmed countries
#   LR       : highest logistic-regression probability
#   Rank Avg : weighted blend of the two rankings
correct = {'Total': 0, 'Baseline': 0, 'LR': 0, 'Rank Avg': 0}
for i in range(len(sorted_date) - 1):
    date = sorted_date[i]
    next_date = sorted_date[i + 1]

    Xtrain, ytrain = all_data[date][features], all_data[date]['y']
    yval = all_data[next_date]['y']

    # LogisticRegression.fit raises when the labels contain a single class
    # (e.g. none of the newly confirmed countries exist in the coordinates
    # table), so such a day is reported and skipped instead of aborting the run.
    if ytrain.nunique() < 2:
        print("Prediction date: {}\nSkipped: training labels contain a single class\n".format(date))
        continue

    # Copy so that the helper columns added below live in an independent
    # frame rather than in a column slice of the stored table.
    Xval = all_data[next_date][features].copy()

    pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0, class_weight='balanced'))
    # Alternative models that were tried:
    #pipe = DecisionTreeClassifier(max_depth=None,class_weight='balanced', ccp_alpha=0.01)
    #pipe = ExtraTreesClassifier(n_estimators=100, n_jobs=6, max_depth=4, class_weight='balanced')

    pipe.fit(Xtrain, ytrain)
    p = pipe.predict_proba(Xval)[:, 1]

    Xval['p'] = p
    Xval['y'] = yval
    Xval['country'] = all_data[next_date]['Country/Region']

    baseline = Xval.sort_values("avg_dist_to_confirmed").head(TOP_N)['y'].sum()
    lr_top_20 = Xval.sort_values("p", ascending=False).head(TOP_N)['y'].sum()

    # Lower rank_avg means higher risk: 90% weight on the distance rank
    # (closest first) and 10% on the probability rank (most likely first).
    Xval['rank_avg'] = 0.9*Xval['avg_dist_to_confirmed'].rank() + 0.1*Xval['p'].rank(ascending=False)
    avg_in_top_20 = Xval.sort_values("rank_avg", ascending=True).head(TOP_N)['y'].sum()

    correct['Total'] += yval.sum()
    correct['Baseline'] += baseline
    correct['LR'] += lr_top_20
    correct['Rank Avg'] += avg_in_top_20

    str_result = "Prediction date: {}\nPositive in train: {}\nConfirmed next date: {}\nBaseline: {}\nLR: {}\nRank avg top {}: {}\n".format(
        date, ytrain.sum(), yval.sum(), baseline, lr_top_20, TOP_N, avg_in_top_20)
    print(str_result)

Prediction date: 2020-01-22 00:00:00
Positive in train: 3
Confirmed next date: 1
Baseline: 0
LR: 0
Rank avg top 20: 0

Prediction date: 2020-01-23 00:00:00
Positive in train: 1
Confirmed next date: 3
Baseline: 2
LR: 0
Rank avg top 20: 0

Prediction date: 2020-01-24 00:00:00
Positive in train: 3
Confirmed next date: 1
Baseline: 0
LR: 0
Rank avg top 20: 0

Prediction date: 2020-01-25 00:00:00
Positive in train: 1
Confirmed next date: 3
Baseline: 2
LR: 1
Rank avg top 20: 2

Prediction date: 2020-01-26 00:00:00
Positive in train: 3
Confirmed next date: 1
Baseline: 0
LR: 0
Rank avg top 20: 0

Prediction date: 2020-01-27 00:00:00
Positive in train: 1
Confirmed next date: 2
Baseline: 0
LR: 0
Rank avg top 20: 1

Prediction date: 2020-01-28 00:00:00
Positive in train: 2
Confirmed next date: 2
Baseline: 2
LR: 1
Rank avg top 20: 2

Prediction date: 2020-01-29 00:00:00
Positive in train: 2
Confirmed next date: 4
Baseline: 1
LR: 0
Rank avg top 20: 1

Prediction date: 2020-01-30 00:00:00
Positive in

In [57]:
correct
#{'Baseline': 8, 'LR': 4, 'Rank Avg': 6} = 50/50

{'Total': 23, 'Baseline': 8, 'LR': 4, 'Rank Avg': 7}

In [63]:
Xval.sort_values("rank_avg", ascending=True).head(10)

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,p,y,country,rank_avg
102,4934.174551,0,1,5,0.943959,0,Kyrgyzstan,1.1
191,4958.310974,0,2,5,0.888048,0,Tajikistan,3.2
208,5026.162088,0,1,4,0.927562,0,Uzbekistan,3.9
22,4951.29032,1,4,10,0.379829,0,Bhutan,4.3
0,5029.039001,0,4,4,0.596962,0,Afghanistan,5.7
98,5038.895405,0,0,5,0.967322,0,Kazakhstan,6.4
147,5058.112703,0,4,5,0.619927,0,Pakistan,8.2
16,5029.894839,1,5,11,0.234302,0,Bangladesh,9.4
127,5143.128327,0,2,7,0.888728,0,Mongolia,10.3
200,5099.356181,1,2,5,0.508006,0,Turkmenistan,10.7


In [64]:
Xval.sort_values("avg_dist_to_confirmed")

Unnamed: 0,avg_dist_to_confirmed,confirmed_1k,confirmed_2k,confirmed_3k,p,y,country,rank_avg
102,4934.174551,0,1,5,0.943959,0,Kyrgyzstan,1.1
22,4951.290320,1,4,10,0.379829,0,Bhutan,4.3
191,4958.310974,0,2,5,0.888048,0,Tajikistan,3.2
208,5026.162088,0,1,4,0.927562,0,Uzbekistan,3.9
0,5029.039001,0,4,4,0.596962,0,Afghanistan,5.7
...,...,...,...,...,...,...,...,...
207,14306.209950,0,0,0,0.000018,0,Uruguay,213.6
64,14398.060708,0,0,0,0.000016,0,Falkland Islands,214.8
154,14448.206840,0,0,0,0.000015,0,Pitcairn Islands,215.8
9,14644.243293,0,0,0,0.000011,0,Argentina,216.8


# Como melhorar?
- Prever/avaliar quais países estão em risco para a próxima semana
    - Mais dados de treino
    - Menos ruído
- Treinar com todos os países confirmados
- Por que é difícil?
    - Depende dos métodos de descoberta (Itália mede diferente de outros países da Europa, China parou de fazer apenas testes de laboratório)
- Países mais pobres
    - Em tese não terão a mesma capacidade de detecção dos países ricos
    - Mas também podem não ter tanta gente cruzando a fronteira
    
- Mais features:
    - Dados econômicos e demográficos, saúde pública
    - Dados de parcerias comerciais (países que recebem muitos viajantes de países com casos confirmados)
    - Quantos confirmados em países a 1k km, 2k km...

- Olhar mais países do top N
- Usar com dados da SARS (e outras doenças de propagação rápida):
  https://en.wikipedia.org/wiki/Severe_acute_respiratory_syndrome#Epidemiology
   