In [None]:
%matplotlib inline
import datetime
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# WHO - global influenza occurrences
The WHO reports influenza occurrences from all around the world on a weekly basis. Because of the spreading patterns of influenza waves throughout the world, it is expected that influenza waves in different countries can be used as an indicator for the prediction of influenza waves in Germany. The data for the **Influenza transmission zone** South West Europe is analysed below.

Note: The influenza transmission zones join geographically related countries or territories to larger areas that have similar influenza transmission patterns so that an overview can be given. www.who.int/csr/.../swineflu/Influenza_Transmission_Zones.pdf


In [None]:
# Load the WHO FluNet export: the first three rows of the file are skipped,
# SDATE/EDATE are parsed as dates and missing values are replaced by 0.
df_who_global = pd.read_csv(
    'FluNetInteractiveReport.csv',
    skiprows=3,
    parse_dates=['SDATE', 'EDATE'],
)
df_who_global = df_who_global.fillna(value=0)
df_who_global.head()


In [None]:
# Columns available in the WHO export.
print(list(df_who_global))

start_date_plot = '2009-08-01'
end_date_plot = '2010-03-03'
countries = ['United Kingdom of Great Britain and Northern Ireland', 'Germany', 'Spain']

# Weekly reports of the selected countries that start and end inside the plot window.
in_selection = (
    df_who_global['Country'].isin(countries)
    & (df_who_global['SDATE'] >= start_date_plot)
    & (df_who_global['EDATE'] <= end_date_plot)
)

# set_index returns a new frame, so the .loc slice is never modified in place
# (the in-place variant triggers pandas' SettingWithCopyWarning).
df_2years_G_F_I = df_who_global.loc[in_selection].set_index('SDATE')

fig = plt.figure()
# One line per country, drawn on the figure created above.
df_2years_G_F_I.groupby('Country')['ALL_INF'].plot(legend=True)

plt.xlabel('date')
plt.ylabel('# of Influenza cases reported to WHO \n by individual countries')


In [None]:
# Wide table of reported cases: one row per week end date, one column per country.
# The long official UK name is shortened for readability.
df_inf_per_state_full = (
    df_who_global
    .pivot(index='EDATE', columns='Country', values='ALL_INF')
    .rename(columns={'United Kingdom of Great Britain and Northern Ireland': 'UK'})
)

df_inf_per_state_full.index = pd.to_datetime(df_inf_per_state_full.index)

print(list(df_inf_per_state_full))
df_inf_per_state_full.tail()

## RKI - Influenza occurrences in Germany

In this section, the data from Robert Koch Institute about activity of Influenza virus in Germany is analysed (https://survstat.rki.de/Content/Query/Create.aspx). The data are published on weekly basis from year 2001. Every year, samples from patients with influenza-like illness are investigated, which are sent by the sentinel network of general practitioners and paediatricians. Only direct detection of influenza virus is notifiable to RKI.
The data are available for all German states (as shown in the figure below); an even finer granularity (county level) is available but won't be used in this analysis.

In [None]:
# Weekly RKI case counts per German state, indexed by the end date of each week.
df_RKI_states = (
    pd.read_csv('RKI_INV_0418_season_comma.csv', sep=';', skiprows=1)
    .fillna(value=0)
    .set_index('enddate')
)
df_RKI_states.index = pd.to_datetime(df_RKI_states.index)
df_RKI_states['Germany_sum'] = df_RKI_states['Germany_sum'].astype(float)
# The season/week label column is not needed once the frame is indexed by date.
df_RKI_states = df_RKI_states.drop(['Season year and week (40)'], axis=1)
df_RKI_states

In [None]:
# Weekly detected cases of four states between December 2013 and May 2016.
start_date_plot = datetime.datetime(year=2013, month=12, day=1)
end_date_plot = datetime.datetime(year=2016, month=5, day=4)

states_to_plot = ['Bavaria', 'Baden-Wuerttemberg', 'Berlin', 'Hessen']
plot_window = df_RKI_states.loc[start_date_plot:end_date_plot]
plot_window.plot(y=states_to_plot)
plt.xlabel('date')
plt.ylabel('# detected Influenza cases')
plt.legend()
plt.show()

The number of detected influenza cases reported by RKI and the number of cases reported to WHO from Germany (both on a weekly basis) are compared in the following. A significant discrepancy between these two quantities can be observed (the number of cases reported to WHO being significantly lower than the detected cases reported on the RKI website). Additionally, a clear trend can be observed in both time series: while the number of cases reported to WHO is decreasing in time, the number of influenza-positive samples reported by RKI is increasing.

The discrepancy between these sources of data was not investigated further; we decided to use the RKI data for the further analysis. However, the increasing trend in the RKI data needs to be explained: Does it reflect an increasing magnitude of flu waves in recent years or just a higher detection of influenza viruses (through more samples investigated and recorded in the RKI database)?

In [None]:
# Germany's weekly case counts from both sources, side by side.
start_date_plot = datetime.datetime(year=1995, month=12, day=1)
end_date_plot = datetime.datetime(year=2018, month=3, day=23)

WHO_RKI = pd.DataFrame()
WHO_RKI['Germany WHO'] = df_inf_per_state_full.loc[start_date_plot:end_date_plot, 'Germany']
WHO_RKI['Germany RKI'] = df_RKI_states.loc[start_date_plot:end_date_plot, 'Germany_sum']

# Two y-axes sharing the time axis: RKI on the left (red), WHO on the right (blue).
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.set_xlabel('year')

axis_specs = [
    (ax1, 'Germany RKI', '# detected Influenza cases by RKI', 'tab:red'),
    (ax2, 'Germany WHO', '# Influenza cases reported to WHO', 'tab:blue'),
]
for axis, series_name, y_label, color in axis_specs:
    axis.plot(WHO_RKI.index, WHO_RKI[series_name], color=color)
    axis.set_ylabel(y_label, color=color)
    axis.tick_params(axis='y', labelcolor=color)

plt.show()

# Pearson correlation between the two weekly series.
who_rki_corr = WHO_RKI['Germany RKI'].corr(WHO_RKI['Germany WHO'])
print('corr.coef: ' + str(who_rki_corr))


The increasing trend in the RKI data (number of detected Influenza cases) needs to be explained: Does it reflect an increasing magnitude of flu waves in the recent years or just a higher detection of influenza viruses (through more samples investigated and recorded in the RKI database)? 

To clarify this, the RKI seasonal estimates of doctor visits associated with flu waves were investigated. These are considered to be the best available quantities for representation of the real magnitude of flu waves in Germany and, ultimately, for predicting the number of incoming flu-associated claims. 

RKI provides estimates of excess-doctor visits (Exzess-Konsultationen), excess hospitalisations (Exzess-Hospitalisierungen) and other summary statistics that can be associated with the influenza wave for each season within the Influenza season reports (https://influenza.rki.de/Saisonbericht.aspx). These data are only provided once per year after the end of the season and cannot thus be used for the real-time prediction. Note that the following data were extracted by hand from the reports and can thus be subject to human error. 

In [None]:
# Hand-extracted per-season summary figures, indexed by the 'end_y' column.
RKI_season_summary = (
    pd.read_excel('RKI-Seasonal_reports_summary.xlsx', skiprows=1)
    .set_index('end_y')
)
RKI_season_summary.tail()

In the following, the correlations between doctor consultations ('Exzess-Konsultationen') and selected quantities are calculated and plotted. A very strong correlation with inability to work (Exzess-Arbeitsunfaehigkeiten) and with hospitalisations can be observed. The duration of the flu wave is also positively correlated with the number of doctor consultations.

In [None]:
# Pearson correlation of the excess doctor consultations with each selected
# season-summary quantity, collected in a single-row table.
df_corr_summary = pd.DataFrame(index=['corr_coef'])
check = ['duration', 'end_w','Exzess-Arbeitsunfaehigkeiten bzw. Pflegebeduerftigkeit bei Kindern und nicht Berufstaetigen', 'Exzess-Hospitalisierungen']

consultations = RKI_season_summary.loc[:, 'Exzess-Konsultationen']
# Iterate over the column names directly instead of keeping a manual counter.
for quantity in check:
    df_corr_summary[quantity] = consultations.corr(RKI_season_summary.loc[:, quantity])

df_corr_summary

In [None]:
# Consultations and hospitalisations per season, labelled by the season's peak date.
season_columns = ['peak_date', 'Exzess-Konsultationen', 'Exzess-Hospitalisierungen']
RKI_season_plot = RKI_season_summary.loc[:, season_columns].set_index('peak_date')

# Hospitalisations get their own (secondary) y-axis.
RKI_season_plot.plot.bar(secondary_y=['Exzess-Hospitalisierungen'])

season_corr = RKI_season_plot['Exzess-Konsultationen'].corr(RKI_season_plot['Exzess-Hospitalisierungen'])
print('corr.coef: ' + str(season_corr))

Finally, the estimated number of consultations due to influenza ('Exzess-Konsultationen') and the number of detected influenza cases (on samples investigated in the laboratories) reported to RKI are compared. The increasing number of detected cases over time (plotted in blue bars) is not due to actual worsening of the flu waves (no trend can be observed in the red bars), but rather due to a higher detection rate (more samples investigated and reported to RKI). 

In [None]:
# A season runs from 1 December of the previous year to 30 April of the end year.
start_date_season = [12, 1]
end_date_season = [4, 30]

years_end = [str(e) for e in range(2003, 2018, 1)]

col_consult = 'RKI-Exzess-Konsultationen'
col_detected = 'RKI-detected influenza cases (from samples)'
col_rate = 'detection rate'

df_RKI_seasonsum = pd.DataFrame(index=years_end, columns=[col_consult, col_detected, col_rate]).astype(float)
for y in years_end:
    start_date = datetime.datetime(int(y) - 1, start_date_season[0], start_date_season[1])
    end_date = datetime.datetime(int(y), end_date_season[0], end_date_season[1])
    # seasonal estimate taken from the report summary
    df_RKI_seasonsum.loc[y, col_consult] = RKI_season_summary.loc[int(y), 'Exzess-Konsultationen']
    # weekly detections summed over the season window
    df_RKI_seasonsum.loc[y, col_detected] = sum(df_RKI_states.loc[start_date:end_date, 'Germany_sum'])
    df_RKI_seasonsum.loc[y, col_rate] = df_RKI_seasonsum.loc[y, col_detected] / df_RKI_seasonsum.loc[y, col_consult]

# Grouped bars on two y-axes: consultations (red, left) and detected cases (blue, right).
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ind = df_RKI_seasonsum.index.astype(int)
width = 0.35

color = 'tab:red'
ax1.bar(ind, df_RKI_seasonsum[col_consult], width, color=color)
ax1.set_xlabel('year')
ax1.set_ylabel('# RKI-Exzess-Konsultationen \n (annual total)', color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', rotation=90)

color = 'tab:blue'
# shifted by one bar width so the two series sit next to each other
ax2.bar(ind + width, df_RKI_seasonsum[col_detected], width, color=color)
ax2.set_ylabel('# RKI - detected Influenza cases \n (annual total)', color=color)
ax2.tick_params(axis='y', labelcolor=color)

plt.show()

seasonal_corr = df_RKI_seasonsum.loc[:, col_consult].corr(df_RKI_seasonsum.loc[:, col_detected])
print('corr.coef: ' + str(seasonal_corr))
df_RKI_seasonsum

The detection ratio (= RKI-detected influenza cases / RKI-Exzess-Konsultationen) increases more or less steadily from 0.002 in 2003 to 0.02 in 2017.

In [None]:
# The frame is already indexed by season end year, so only the column to draw is
# named; passing the Index object itself as `x` is rejected by current pandas
# ("x must be a label or position").
df_RKI_seasonsum.plot.bar(y=['detection rate'])
plt.ylabel('detection rate \n = detected cases / Exzess-Konsultationen')
plt.xlabel('year')

It can be concluded, that the number of detected influenza cases, as reported by RKI on weekly basis, cannot be directly used as predictor for the number of doctor visits (and, ultimately, for prediction of number of claims) because the detection rate increases in time. Without taking into account the increasing detection rate, the magnitude of the future flu waves would likely be overestimated.

One could think about using the "praxis index" or "ARE-Konsultationen per 100000 Einwohner" for prediction, but they don't seem to be available in the required frequency and format by RKI. However, this might be further investigated as they obviously also collect this data on weekly basis... (query to RKI?)
https://influenza.rki.de/Diagrams.aspx?agiRegion=0

## Weather data

Weather data from Deutscher Wetterdienst (DWD) are available at (ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/daily/kl/historical/). Data from the station München-Stadt were used for the analysis. They are available from year 1954 until the end of 2016 (the station is still operating, but the downloaded data end at the end of 2016).

The daily mean temperature in °C (TMK) and daily mean of relative humidity in % (UPM) were included in this analysis. These variables were selected because studies showed that it is mainly the cold and dry air that helps spreading influenza viruses (Lowen, A.C. and Steel, J. 2014. Roles of Humidity and Temperature in Shaping Influenza Seasonality)(Davis, R.E, Rossier, C.E., Enfield, K.B. 2012. The Impact of Weather on Influenza and Pneumonia Mortality in New York City, 1975–2002: A Retrospective Study). 

In [None]:
# Daily DWD observations, indexed by measurement date (MESS_DATUM, format YYYYMMDD).
df_weather_row = pd.read_csv('produkt_klima_tag_19540601_20161231_03379.txt', sep=';')

df_weather_row = df_weather_row.set_index(['MESS_DATUM'])
df_weather_row.index = pd.to_datetime(df_weather_row.index, format='%Y%m%d')
# -999 is replaced by NaN so that it does not enter the means computed later.
# `np` comes from the notebook's top import cell; it must be imported before this cell runs.
df_weather_row = df_weather_row.replace(-999, np.nan)

print(list(df_weather_row))
df_weather_row


Because the influenza activity is reported on a weekly basis, the weather data were also transformed to weekly means for each calendar week (Kalenderwoche), denoted by its end date (Sunday).

In [None]:

# NOTE(review): in the visible notebook `df_weather` is first assigned in the
# following cell, so on a fresh kernel this preview only works after that cell has run.
df_weather.head()


In [None]:
# 7-day grid of week end dates. `pd.datetime` was removed from pandas, so the
# standard-library datetime module (imported at the top of the notebook) is used.
end_of_week = pd.date_range(
    start=datetime.datetime(1954, 6, 6),
    end=datetime.datetime(2017, 1, 1),
    freq='7D',
).tolist()
df_weather = pd.DataFrame(index=end_of_week)

# resample('W') yields one mean per week; assignment aligns the result on the date index.
df_weather['temperature in °C (weekly mean)'] = df_weather_row.loc[:, ' TMK'].resample('W').mean()
df_weather['rel.humidity in % (weekly mean)'] = df_weather_row.loc[:, ' UPM'].resample('W').mean()

df_weather.plot()

Further, the deviations from long-term monthly mean temperature/humidity were calculated for each week. 

In [None]:
# Long-term mean of the daily temperature and humidity for each calendar month (1-12).
daily_temp_humidity = df_weather_row.loc[:, [' TMK', ' UPM']]
df_monthly_mean = daily_temp_humidity.groupby(by=[df_weather_row.index.month]).mean()
df_monthly_mean = df_monthly_mean.rename(
    columns={
        ' TMK': 'long-term monthly mean temperature',
        ' UPM': 'long-term monthly mean rel.humidity',
    }
)
df_monthly_mean

In [None]:
# Weekly deviation from the long-term mean of the calendar month the week ends in.
# The row mask is built from df_weather itself; the previously referenced
# `df_weather_devFromMean` is not defined anywhere in the notebook.
for i in range(1, 13):
    in_month = df_weather.index.month == i
    df_weather.loc[in_month, 'temperature deviation from long-term mean'] = (
        df_weather.loc[in_month, 'temperature in °C (weekly mean)']
        - df_monthly_mean.loc[i, 'long-term monthly mean temperature']
    )
    df_weather.loc[in_month, 'rel.humidity deviation from long-term mean'] = (
        df_weather.loc[in_month, 'rel.humidity in % (weekly mean)']
        - df_monthly_mean.loc[i, 'long-term monthly mean rel.humidity']
    )
df_weather.loc[datetime.datetime(2000,1,1):datetime.datetime(2016,1,1)].plot(subplots=True)

In [None]:
def crosscorr(datax, datay, lag=0):
    """Return the Pearson correlation of ``datax`` with ``datay`` shifted by ``lag`` rows.

    Both arguments are pandas Series; ``lag`` is passed unchanged to ``Series.shift``.
    """
    lagged = datay.shift(lag)
    return datax.corr(lagged)

# Cross-correlation between Bavarian case counts and each weather variable for lags
# of -5..4 weeks. The summary keeps, per variable, the largest absolute correlation
# and the lag (in weeks) at which it occurs.
timelag = range(-5, 5, 1)
df_crosscorr_weather = pd.DataFrame(index=timelag)
df_crosscorr_weather_summary = pd.DataFrame(index=['max', 'shift [w]'])

for column in df_weather:
    xcov_weekly = [crosscorr(df_RKI_states['Bavaria'], df_weather[column], lag=i) for i in timelag]
    df_crosscorr_weather[column] = xcov_weekly
    # abs() does not accept a list, so the magnitudes are taken element by element
    abs_xcov = [abs(value) for value in xcov_weekly]
    strongest = abs_xcov.index(max(abs_xcov))
    df_crosscorr_weather_summary[column] = [abs_xcov[strongest], timelag[strongest]]

df_crosscorr_weather_summary

In [None]:
# Correlation versus time lag, one line per weather variable.
weather_columns = [
    'temperature in °C (weekly mean)',
    'rel.humidity in % (weekly mean)',
    'temperature deviation from long-term mean',
    'rel.humidity deviation from long-term mean',
]
crosscorr_by_lag = df_crosscorr_weather.reset_index()
crosscorr_by_lag.plot(x='index', y=weather_columns)
plt.xlabel('time lag [weeks]')
plt.ylabel('crosscorrelation')
plt.legend()
plt.show()

## Cross-correlation of influenza cases reported from European countries


In the following, the cross-correlation of the influenza cases reported in Germany and in other European countries is investigated for different time-lags, that may indicate the delay of the flu wave between the countries. 



In [None]:
# Settings for the country-level analysis.

# Time lags (in weeks) at which the cross-correlation is evaluated.
timelag = range(-5, 10)

# Only reports from the cutoff date onwards are analysed.
cutoff = datetime.datetime(year=2008, month=1, day=1)
df_inf_per_state = df_inf_per_state_full.loc[cutoff:]
df_inf_per_state.head()

In [None]:
def crosscorr(datax, datay, lag=0):
    """Pearson correlation between ``datax`` and a copy of ``datay`` shifted by ``lag``.

    ``datax`` and ``datay`` are pandas Series; the shift is done with ``Series.shift``.
    """
    shifted_y = datay.shift(lag)
    correlation = datax.corr(shifted_y)
    return correlation

# Cross-correlation of Germany's weekly cases with every country, per time lag.
df_crosscorr = pd.DataFrame(index=timelag)
# Per country: the highest correlation and the lag (in weeks) where it is reached.
df_crosscorr_summary = pd.DataFrame(index=['max', 'shift [w]'])

for column in df_inf_per_state:
    xcov_weekly = [
        crosscorr(df_inf_per_state['Germany'], df_inf_per_state[column], lag=i)
        for i in timelag
    ]
    df_crosscorr[column] = xcov_weekly
    peak = max(xcov_weekly)
    peak_position = xcov_weekly.index(peak)
    df_crosscorr_summary[column] = [peak, timelag[peak_position]]

neighbours = ['Italy', 'Austria', 'UK', 'Switzerland', 'France', 'Spain', 'Poland', 'Netherlands', 'Sweden']
df_crosscorr_summary.loc[:, neighbours]


In [None]:

#df_crosscorr.reset_index().plot(x='index', y=['France', 'UK', 'Poland'])
#plt.xlabel('time lag [weeks]')
#plt.ylabel('crosscorrelation')
#plt.legend()
#plt.show()


In [None]:
from datetime import timedelta

countries = ['France', 'Italy', 'UK', 'Poland']
lag = 0

# One scatter panel per country on a 2x2 grid: Germany's weekly cases against the
# other country's cases shifted by `lag` rows (weeks). Using shift() pairs the
# values exactly as crosscorr() does and also works for negative lags.
for panel, country in enumerate(countries, start=1):
    plt.subplot(2, 2, panel)
    plt.scatter(df_inf_per_state['Germany'], df_inf_per_state[country].shift(lag))
    plt.title(country)


In [None]:
import numpy as np
from scipy import stats

# Raw weekly series for Germany as plain arrays. `.values` replaces
# `Series.as_matrix()`, which was removed from pandas.
x = df_inf_per_state.index.values
y = df_inf_per_state['Germany'].values
not_nan_ind = ~np.isnan(y)
plt.plot(x, y)

df_inf_per_state['Germany']

In [None]:
#detrending the data

from scipy import signal

# Remove a least-squares linear trend from every country's weekly series and
# collect the detrended series in df_inf_per_state_detr.
df_inf_per_state_detr = pd.DataFrame(index=df_inf_per_state.index)

# linregress needs a numeric time axis: days elapsed since the first reporting week.
x = np.asarray((df_inf_per_state.index - df_inf_per_state.index[0]).days, dtype=float)

for column in df_inf_per_state:
    y = df_inf_per_state[column]
    not_nan_ind = ~np.isnan(y.values.astype(float))
    if not_nan_ind.sum() < 2:
        # a line cannot be fitted through fewer than two observations
        continue
    m, b, r_val, p_val, std_err = stats.linregress(x[not_nan_ind], y.values[not_nan_ind])
    detrend_y = y - (m * x + b)
    df_inf_per_state_detr[column] = detrend_y

df_inf_per_state_detr.head()

## Cross-correlation of influenza cases reported from German states


In the following, the cross-correlation of the influenza cases reported in Bavaria and in other states in Germany is investigated for different time lags, which may indicate the delay of the flu wave between the states.


In [None]:
# Settings for the state-level analysis.

# Time lags (in weeks) at which the cross-correlation is evaluated.
timelag = range(-5, 10)

# Only reports from the cutoff date onwards are analysed.
cutoff = datetime.datetime(year=2000, month=1, day=1)
df_RKI_states = df_RKI_states.loc[cutoff:]
df_RKI_states.head()


In [None]:
# NOTE(review): displays whatever the loop variable `column` was last bound to by an
# earlier `for column in ...` loop; this looks like leftover debugging output.
column

In [None]:
def crosscorr(datax, datay, lag=0):
    """Correlate ``datax`` with ``datay`` after shifting ``datay`` by ``lag`` rows.

    Uses ``Series.shift`` for the lag and ``Series.corr`` (Pearson) for the coefficient.
    """
    y_at_lag = datay.shift(lag)
    return datax.corr(y_at_lag)


# Cross-correlation of Bavaria's weekly cases with every column of the RKI table, per lag.
df_crosscorrDE = pd.DataFrame(index=timelag)
# Per column: the highest correlation and the lag (in weeks) where it is reached.
df_crosscorrDE_summary = pd.DataFrame(index=['max', 'shift [w]'])

for column in df_RKI_states:
    xcov_weekly = [
        crosscorr(df_RKI_states['Bavaria'], df_RKI_states[column], lag=i)
        for i in timelag
    ]
    df_crosscorrDE[column] = xcov_weekly
    peak = max(xcov_weekly)
    peak_position = xcov_weekly.index(peak)
    df_crosscorrDE_summary[column] = [peak, timelag[peak_position]]

df_crosscorrDE_summary

In [None]:
# Correlation with Bavaria versus time lag for three selected states.
states_to_compare = ['Baden-Wuerttemberg', 'Berlin', 'Hessen']
crosscorr_by_lag = df_crosscorrDE.reset_index()
crosscorr_by_lag.plot(x='index', y=states_to_compare)
plt.xlabel('time lag [weeks]')
plt.ylabel('crosscorrelation')
plt.legend()
plt.show()

In [None]:
from datetime import timedelta

countries = ['Baden-Wuerttemberg', 'Berlin', 'Hessen', 'Bremen']
lag = -1

# One scatter panel per state on a 2x2 grid: Bavaria's weekly cases against the
# other state's cases shifted by `lag` rows (weeks). The earlier date-offset slicing
# selected the complete frame on both sides for a negative lag, so the lag had no
# effect; shift() pairs the values exactly as crosscorr() does.
for panel, state in enumerate(countries, start=1):
    plt.subplot(2, 2, panel)
    plt.scatter(df_RKI_states['Bavaria'], df_RKI_states[state].shift(lag))
    plt.title(state)
 

# Google trend data

Google trend data is expected to give a good estimator of the current influenza situation in a country. Its quality for the medium and long term is questionable.

## Comparison Germany Bavaria (Saarland)

The Google trend time series (keyword Influenza) for Germany and Bavaria (and Saarland) are superimposed to check whether different characteristics regarding initiation time, magnitude etc. can be observed.

In [None]:
# get data through the unofficial Google treds API (pytrends)
import pandas as pd
from pytrends.request import TrendReq
pytrends = TrendReq(hl = 'en-US')

kw_list = ["Influenza"]
timeframe_idx = 'today 5-y' # weekly stats for the last 5 years ('all' gives everything starting from 2004, but at monthly resolution)
cat_idx_Gesundheit = 45
geo_idx = 'DE'
geo_idxBY = 'DE-BY'
geo_idxSL = 'DE-SL'

# Germany, Bavaria and Saarland: the keyword column keeps its name 'Influenza'
# (later cells read pf_googleTrends_Germany['Influenza']). The former rename()
# calls discarded their result and therefore had no effect.
pytrends.build_payload(kw_list,cat=cat_idx_Gesundheit,timeframe = timeframe_idx,geo = geo_idx)
pf_googleTrends_Germany = pytrends.interest_over_time()

pytrends.build_payload(kw_list,cat=cat_idx_Gesundheit,timeframe = timeframe_idx,geo = geo_idxBY)
pf_googleTrends_Bavaria = pytrends.interest_over_time()

pytrends.build_payload(kw_list,cat=cat_idx_Gesundheit,timeframe = timeframe_idx,geo = geo_idxSL)
pf_googleTrends_Saarland = pytrends.interest_over_time()

# The EU-wide table starts from the German frame, with its keyword column named 'Germany'.
pf_googleTrends_EU = pf_googleTrends_Germany
pf_googleTrends_EU = pf_googleTrends_EU.rename(columns = {'Influenza':'Germany'})

# Country name -> geo code passed to pytrends.
EUcountries = {'Belgium':'BE','Bulgaria':'BG','Czech Republic':'CZ','Denmark':'DK','Estonia':'EE','Ireland':'IE','Greece':'GR','Spain':'ES','France':'FR','Croatia':'HR','Italy':'IT','Cyprus':'CY','Latvia':'LV','Lithuania':'LT','Luxembourg':'LU','Hungary':'HU','Netherlands':'NL','Austria':'AT','Poland':'PL','Portugal':'PT','Romania':'RO','Slovenia':'SI','Slovakia':'SK','Finland':'FI','Sweden':'SE','UK':'GB'}#,'Malta':'MT'}
# Country name -> keyword list. Every value must be a list: a bare string would be
# treated as a sequence of single characters (kw_list[0] would be its first letter).
Influenza_dictionary = {'Belgium':['Influenza'],'Bulgaria':['грип'],'Czech Republic':['chřipka'],'Denmark':['Influenza'],'Estonia':['gripp'],'Ireland':['influenza'],'Greece':['γρίπη'],'Spain':['influenza '],'France':['influenza '],'Croatia':['influenca'],'Italy':['influenza'],'Cyprus':['γρίπη'],'Latvia':['gripa'],'Lithuania':['gripą'],'Luxembourg':['influenza'],'Hungary':['influenza'],'Netherlands':['influenza'],'Austria':['Influenza'],'Poland':['grypa'],'Portugal':['influenza'],'Romania':['gripă'],'Slovenia':['gripe'],'Slovakia':['chrípka'],'Finland':['influenssa'],'Sweden':['influensa'],'UK':['influenza']}#,'Malta':['influwenza']}

# One request per country; each result is joined as a column named after the country.
for country_key in EUcountries.keys():
    print(country_key)
    kw_list = Influenza_dictionary[country_key]
    pytrends.build_payload(kw_list,cat=cat_idx_Gesundheit,timeframe = timeframe_idx,geo = EUcountries[country_key])
    temp_df = pytrends.interest_over_time().drop('isPartial',axis =1)
    temp_df = temp_df.rename(columns = {kw_list[0]:country_key})
    pf_googleTrends_EU = pf_googleTrends_EU.join(temp_df,rsuffix = EUcountries[country_key])


In [None]:
# Show the last 22 rows of the joined Google trends table.
pf_googleTrends_EU.tail(22)

In [None]:
# Google index for Germany, Bavaria and Saarland on one shared axis. Only the keyword
# column is drawn and every line carries its own label, so the legend cannot get out
# of step with the plotted lines (the frames also contain an 'isPartial' column).
fig, ax = plt.subplots()
pf_googleTrends_Germany['Influenza'].plot(ax=ax, label='Germany')
pf_googleTrends_Bavaria['Influenza'].plot(ax=ax, label='Bavaria')
pf_googleTrends_Saarland['Influenza'].plot(ax=ax, label='Saarland')

plt.title('Google index of influenza')
plt.xlabel('date')
plt.ylabel('google index')
ax.legend()





## Autocorrelation Germany

In [None]:
# autocorrelation_plot lives in pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas.
from pandas.plotting import autocorrelation_plot
fig = plt.figure()
autocorrelation_plot(pf_googleTrends_Germany['Influenza'])
plt.xlabel('Lag [weeks]')
plt.ylabel('autocorrelation')
# restrict the view to lags of up to 60 weeks
plt.xlim([0,60])

## Cross-correlation of influenza cases reported from European countries with Google trend data

In [None]:
# get worldwide google Influenza trends 
# get data through the unofficial Google treds API (pytrends)


In [None]:
def crosscorr(datax, datay, lag=0):
    """Return ``datax.corr`` of ``datay`` shifted by ``lag`` rows (Pearson coefficient).

    ``datax`` and ``datay`` are pandas Series; ``lag`` goes straight to ``Series.shift``.
    """
    delayed = datay.shift(lag)
    return datax.corr(delayed)


# Cross-correlation of Germany's Google index with every downloaded country, per lag.
timelag = range(-5, 10, 1)

df_crosscorr_google = pd.DataFrame(index=timelag)
# Per country: the highest correlation and the lag (in weeks) where it is reached.
df_crosscorr_google_summary = pd.DataFrame(index=['max', 'shift [w]'])

for column in EUcountries.keys():
    xcov_weekly = [crosscorr(pf_googleTrends_EU['Germany'], pf_googleTrends_EU[column], lag=i) for i in timelag]
    df_crosscorr_google[column] = xcov_weekly
    df_crosscorr_google_summary[column] = [max(xcov_weekly), timelag[xcov_weekly.index(max(xcov_weekly))]]

# Show only the countries of interest that were actually downloaded: a name missing
# from EUcountries (e.g. 'Switzerland') has no column and would raise a KeyError.
countries_of_interest = ['Italy', 'Austria', 'UK', 'Switzerland', 'France', 'Spain', 'Poland', 'Netherlands', 'Sweden']
available = [name for name in countries_of_interest if name in df_crosscorr_google_summary.columns]
df_crosscorr_google_summary.loc[:, available]


In [None]:
# Correlation with Germany's Google index versus time lag for three countries.
countries_to_compare = ['France', 'UK', 'Poland']
crosscorr_by_lag = df_crosscorr_google.reset_index()
crosscorr_by_lag.plot(x='index', y=countries_to_compare)
plt.xlabel('time lag [weeks]')
plt.ylabel('crosscorrelation')
plt.legend()
plt.show()


## Comparison of magnitude between different countries
To use the data from other countries for prediction of the magnitude of the flu wave in Germany there needs to be a clear relationship between the magnitudes of the flu waves (here in terms of google search index) in the different countries.

In [None]:
# Google index of the selected countries on one axis created in this cell; nothing
# is drawn on an axis object left over from an earlier cell.
fig, ax_google = plt.subplots()

countries = ['Germany','Italy','UK']#,'France','Poland']

for country in countries:
    pf_googleTrends_EU[country].plot(ax=ax_google)

plt.title('Google index of influenza')
plt.xlabel('date')
plt.ylabel('google index')
ax_google.legend(countries)
# show only the period from the start of 2016 until today
plt.xlim([datetime.date(year=2016,month=1,day=1),datetime.date.today()])





# Heatmaps

Heatmap created following: http://www.alexschultz.co.uk/weblog/2010/07/creating-country-level-heatmaps-in-python.html

In [None]:
# Show the current working directory.
# NOTE(review): relies on `os` having been imported before this cell runs.
os.getcwd()


In [None]:
from lxml import etree
import os

def _flu_color_class(value):
    """Map a Google index value to a position (0-3) in the heat-map palette."""
    if value > 50:
        return 3
    if value > 10:
        return 2
    if value > 1:
        return 1
    # values of 1 or less, and NaN (every comparison above is False), fall through
    return 0


def show_flu_inEU_for_week_lxml(date):
    """Write an SVG map of Europe coloured by the Google influenza index of one week.

    Parses 'Blank_map_of_Europe_cropped.svg' from the working directory, colours
    every country shape whose element id is listed in the id-to-country table
    according to the row of the global ``pf_googleTrends_EU`` frame labelled
    ``date``, and writes the result to 'EU_flu_<YYYY_MM_DD>.svg'.

    Parameters
    ----------
    date : datetime.datetime
        Index label of the week to draw; it is looked up with
        ``pf_googleTrends_EU.loc[date]`` and also used for the output file name.
    """
    svg_ns = '{http://www.w3.org/2000/svg}'
    svg = etree.parse('Blank_map_of_Europe_cropped.svg')
    # element id in the SVG -> column name in pf_googleTrends_EU
    map_countryCode = {'lt-3': 'Lithuania', 'gb-gbn-5': 'UK', 'lv-1': 'Latvia', 'fi-5': 'Finland', 'sk-2': 'Slovakia', 'lu-7': 'Luxembourg', 'si-9': 'Slovenia', 'at-3': 'Austria', 'ie-5': 'Ireland', 'dk-9': 'Denmark', 'be-2': 'Belgium', 'gr-7': 'Greece', 'ee-8': 'Estonia', 'es-4': 'Spain', 'cz-1': 'Czech Republic', 'ro-1': 'Romania', 'hu-3': 'Hungary', 'nl-3': 'Netherlands', 'hr-2': 'Croatia', 'it-4': 'Italy', 'cy-6': 'Cyprus', 'fr-7': 'France', 'pt-0': 'Portugal', 'se-7': 'Sweden', 'de-2': 'Germany', 'bg-3': 'Bulgaria', 'pl-1': 'Poland'}#, 'mt-0': 'Malta'}

    path_style = "stroke:#ffffff;stroke-width:0.40082097;stroke-miterlimit:4;stroke-dasharray:none;fill:"
    colormap = ["#c0c0c0","#ffeda0","#feb24c","#f03b20"]

    # .loc replaces the removed .ix indexer
    pf_of_date = pf_googleTrends_EU.loc[date]
    root = svg.getroot()
    child_element = root.find(svg_ns + 'g')

    # countries drawn as a single <path>
    for p in child_element.findall(svg_ns + 'path'):
        country = map_countryCode.get(p.get('id'))
        if country is None:
            continue
        p.set('style', path_style + colormap[_flu_color_class(pf_of_date[country])])

    # countries drawn as a <g> group of several polygons
    for g in child_element.findall(svg_ns + 'g'):
        country = map_countryCode.get(g.get('id'))
        if country is None:
            continue
        style = path_style + colormap[_flu_color_class(pf_of_date[country])]

        for t in g.findall(svg_ns + 'path'):
            t.set('style', style)
        for g_3rdlevel in g.findall(svg_ns + 'g'):
            for t2 in g_3rdlevel.findall(svg_ns + 'path'):
                # style the nested path itself, not the last direct child of the group
                t2.set('style', style)

    # write everything to a file
    svg.write('EU_flu_'+ date.strftime('%Y_%m_%d')+'.svg',pretty_print = True)
                        
        

In [None]:
# Render one map every 14 days, 12 maps in total, starting on 2017-07-30.
startdate = datetime.datetime.strptime('2017-07-30','%Y-%m-%d')
time_steps = 12
time_delta = 14 #days

# range() instead of the Python-2-only xrange(); datetime.timedelta is reached through
# the module imported at the top, so no import from an earlier cell is required.
for i in range(0, time_steps):
    date = startdate + datetime.timedelta(days=i*time_delta)
    show_flu_inEU_for_week_lxml(date)


new cell