In [None]:
# Figure out what to do about Russia error message

In [6]:
# import os
# import re
import json
import time
import requests
import xlsxwriter
from math import sqrt
from datetime import datetime as dt
from pandas import DataFrame, read_csv, MultiIndex, ExcelWriter

In [1]:
# This block is for building and sending the API query
# (missing values are filled with 0 afterwards, by fill_missing)
def build_query(iso):  # Add adm1 as an optional parameter?
    """Return the SQL text requesting weekly alert sums for one country.

    The statement selects iso, adm1, alert__year, alert__week and the summed
    alert__count, restricted to rows whose confidence__cat is 'h' and whose
    iso equals ``iso``, grouped by iso, adm1, alert__year and alert__week.

    Parameters
    ----------
    iso : str
        Country code interpolated verbatim into the WHERE clause.

    Returns
    -------
    str
        The assembled single-line SQL statement.
    """
    clauses = [
        "SELECT iso, adm1, alert__year, alert__week, SUM(alert__count) as alert__count, confidence__cat",
        "FROM mytable",
        f"WHERE iso='{iso}' AND confidence__cat='h'",
        "GROUP BY iso, adm1, alert__year, alert__week",
    ]
    # Clauses are separated by exactly one space.
    return " ".join(clauses)


def query_params(q):
    """Wrap the SQL string ``q`` in the parameter dict sent with the request.

    Returns a new dict with the single key ``'sql'`` mapped to ``q``.
    """
    return {'sql': q}


def send_request(base_url, data_id, iso, timeout=60):
    """Query the API for one country and return the rows it sends back.

    Parameters
    ----------
    base_url : str
        Query endpoint; ``data_id`` is appended after a '/'.
    data_id : str
        Identifier of the dataset to query.
    iso : str
        Country code passed to ``build_query``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout a single stalled request would block the whole
        batch indefinitely.

    Returns
    -------
    The value under the ``'data'`` key of the JSON response when the status
    code is 200; ``None`` otherwise (after printing a diagnostic).
    """
    q = build_query(iso)
    p = query_params(q)

    try:
        r = requests.get(base_url + '/' + data_id, params=p, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Network failure or timeout: report it and let the caller move on
        # to the next country instead of aborting the whole run.
        print('Something went wrong...')
        print('Error message: ', exc)
        return None

    if r.status_code == 200:
        return r.json()['data']

    print('Something went wrong...')
    print('Status code: ', r.status_code)
    # The error body is not guaranteed to be JSON shaped like
    # {"errors": [{"detail": ...}]}; fall back to the raw text so the
    # diagnostic itself cannot raise.
    try:
        detail = r.json()['errors'][0]['detail']
    except (ValueError, KeyError, IndexError, TypeError):
        detail = r.text
    print('Error message: ', detail)
    return None

In [2]:
# This block is for processing data after query
# Update this to make year range dynamic (based on today)
def create_multiindex(df, names, start_year=2012, end_year=None):
    """Build the full (country, region, year, week) index for ``df``.

    The index is the cartesian product of the unique ``iso`` values, the
    unique ``adm1`` values, every year from ``start_year`` to ``end_year``
    inclusive, and weeks 1 through 52.

    Parameters
    ----------
    df : DataFrame
        Must expose ``iso`` and ``adm1`` columns.
    names : list of str
        Level names for the resulting MultiIndex (four entries).
    start_year : int, optional
        First year to include (default 2012).
    end_year : int or None, optional
        Last year to include. ``None`` (default) means the current year, so
        that rows for the present year are never dropped when reindexing.

    Returns
    -------
    MultiIndex
    """
    if end_year is None:
        today = dt.today()
        # Around New Year the ISO year and the calendar year can differ;
        # cover whichever is later.
        end_year = max(today.year, today.isocalendar()[0])

    countries = df.iso.unique()
    regions = df.adm1.unique()
    years = range(start_year, end_year + 1)
    # NOTE: week 53 (present in some ISO years) is intentionally not part of
    # the index, matching the 52-week handling elsewhere in this notebook.
    weeks = range(1, 53)

    mi = MultiIndex.from_product(
        iterables=[countries, regions, years, weeks],
        names=names
    )

    return mi


def fill_missing(df):
    """Expand ``df`` to every index combination and zero-fill the gaps.

    The frame is indexed by iso, adm1, alert__year and alert__week,
    reindexed against the index produced by ``create_multiindex``, and then
    flattened back to columns. Every missing value in the result is replaced
    with 0.
    """
    key_columns = ['iso', 'adm1', 'alert__year', 'alert__week']
    full_index = create_multiindex(df, key_columns)

    keyed = df.set_index(key_columns)
    expanded = keyed.reindex(full_index)
    flattened = expanded.reset_index()

    return flattened.fillna(0)

In [3]:
# This block is for calculating significance score
# Clean these up and compartmentalize better
def get_cw_cy(today=None):
    """Return the (week, year) of the ISO week preceding ``today``.

    Parameters
    ----------
    today : date or datetime, optional
        Reference date; defaults to the current local date. Exposed so the
        year-boundary behaviour can be exercised deterministically.

    Returns
    -------
    tuple of (int, int)
        ``(week, year)``. When the reference date falls in ISO week 1 the
        result is week 52 of the previous ISO year; otherwise it is the
        previous week of the same ISO year.
    """
    if today is None:
        today = dt.today()
    iso_year, iso_week, _ = today.isocalendar()

    if iso_week == 1:
        # Previously written with `==`, which compared instead of assigning
        # and left both names unbound during the first week of a year.
        cw = 52
        cy = iso_year - 1
    else:
        cw = iso_week - 1
        cy = iso_year

    return cw, cy


def make_lookup(df, adm1):
    """Pivot one region's alert counts into a year-by-week table.

    Rows of ``df`` whose ``adm1`` equals the given value are pivoted so that
    ``alert__year`` forms the index, ``alert__week`` the columns and
    ``alert__count`` the cell values. Empty cells are filled with 0.
    """
    region_rows = df[df['adm1'] == adm1]
    table = region_rows.pivot_table(
        index='alert__year',
        columns='alert__week',
        values='alert__count'
    )
    return table.fillna(0)


def vizzuality_sd(lookup, week, mean):
    """Return the standard deviation of ``lookup[week]`` around ``mean``.

    The squared deviations are summed and divided by the number of entries
    in the column (not by n - 1) before taking the square root.
    """
    column = lookup[week]
    squared_deviations = (column - mean) ** 2
    variance = sum(squared_deviations) / len(column)
    return sqrt(variance)


def significance_variables(lookup):
    """Return the inputs of the significance score for the latest week.

    Using the week and year reported by ``get_cw_cy``, yields a tuple of:
    the value of ``lookup`` for that week and year, the mean of that week's
    column across all rows, and that column's deviation as computed by
    ``vizzuality_sd``.
    """
    current_week, current_year = get_cw_cy()
    week_column = lookup[current_week]

    observed = week_column[current_year]
    average = week_column.mean()
    deviation = vizzuality_sd(lookup, current_week, average)

    return observed, average, deviation


def calc_sigscore(df, adm1):  # Add number of weeks here
    """Compute the significance score of one region.

    Builds the region's lookup table, then scores the latest value as its
    distance from the weekly mean divided by the deviation. A value exactly
    equal to the mean scores 0 without performing the division.

    Returns
    -------
    tuple
        ``(significance, latest_value)``.
    """
    region_lookup = make_lookup(df, adm1)
    latest, average, deviation = significance_variables(region_lookup)

    if latest - average == 0:
        return 0, latest
    return (latest - average) / deviation, latest


def sigscore_tuple(df, country, region):
    """Return ``(country, region, significance, current_alerts)`` for a region.

    The last two entries are the pair produced by ``calc_sigscore``.
    """
    return (country, region, *calc_sigscore(df, region))

In [4]:
# This block is to calculate significance score for each adm1 in all countries
def all_adm1_significance(iso_list):
    """Score every adm1 region of every country in ``iso_list``.

    For each country code the function prints the code, requests its data,
    pauses two seconds, and - when data came back - fills the missing weeks
    and computes one score row per region.

    Returns
    -------
    DataFrame
        Columns ``country``, ``adm1``, ``significance`` and
        ``current_alerts``, ordered by descending ``significance``.
    """
    BASE_URL = 'https://api.resourcewatch.org/v1/query'
    DATA_ID = '54bb00e8-9888-494a-bcd8-9fd3760fe384'
    column_names = ['country', 'adm1', 'significance', 'current_alerts']

    rows = []
    for iso in iso_list:
        print(iso)
        payload = send_request(BASE_URL, DATA_ID, iso)
        # Pause after every request, whether or not it succeeded.
        time.sleep(2)
        if not payload:
            continue

        filled = fill_missing(DataFrame(payload))
        rows.extend(
            [sigscore_tuple(filled, iso, adm1) for adm1 in filled.adm1.unique()]
        )

    ranking = DataFrame(rows, columns=column_names)
    return ranking.sort_values(by=['significance'], ascending=False)

In [41]:
# BASE_URL = 'https://api.resourcewatch.org/v1/query'
# DATA_ID = '54bb00e8-9888-494a-bcd8-9fd3760fe384'
# q = build_query('UGA')
# p = query_params(q)
# r = requests.get(BASE_URL + '/' + DATA_ID, params=p)

In [42]:
# r.json()

In [7]:
# Smoke test: run the whole pipeline for a single country code ('IND') and
# display the resulting ranking of its adm1 regions.
all_adm1_significance(['IND'])

IND


Unnamed: 0,country,adm1,significance,current_alerts
22,IND,13,3.0,14.0
20,IND,29,2.961538,18.0
15,IND,23,2.869867,379.0
1,IND,19,2.828947,167.0
16,IND,15,2.814632,24.0
12,IND,35,2.601589,19.0
3,IND,26,2.056257,215.0
5,IND,7,1.92638,91.0
14,IND,11,1.890613,33.0
23,IND,36,1.870967,29.0


In [43]:
def gadm_countries(file='gadm_adm1.csv'):
    """Return the distinct ``GID_0`` values found in a GADM adm1 CSV.

    Only the GID_0, NAME_0, GID_1 and NAME_1 columns are read from ``file``;
    the unique ``GID_0`` values are returned in order of first appearance.
    """
    wanted_columns = ['GID_0', 'NAME_0', 'GID_1', 'NAME_1']
    return read_csv(file, usecols=wanted_columns)['GID_0'].unique()

In [44]:
# Load every country code present in the GADM adm1 CSV.
iso_list = gadm_countries()
# NOTE: the next line replaces the full list with a four-country subset;
# comment it out to process every country returned above.
iso_list = ['COD', 'CAF', 'BEN', 'UGA']

In [47]:
# Run the significance ranking for every code in iso_list and report the
# wall-clock time the run took.
# Known issue: the request for Russia (RUS) currently fails with status 400
# because the "group by" query produces too many results.
# The number of rows needs to be reduced: consider a more restrictive
# "where" clause or fewer "group by" criteria.
start = dt.now()
print(start)
top_adm1 = all_adm1_significance(iso_list)
end = dt.now()
print(f'Total time: {end - start}')

2021-03-09 12:34:00.766077
AFG
AGO
ALA
ALB
AND
ARE
ARG
ARM
Something went wrong...
Status code:  400
Error message:  null
ASM
ATF
ATG
AUS
AUT
AZE
BDI
BEL
BEN
BES
BFA
BGD
BGR
BHR
BHS
BIH
BLR
BLZ
BMU
BOL
BRA
BRB
BRN
BTN
BWA
CAF
CAN
CHE
CHL
CHN
CIV
CMR
COD
COG
COL
COM
CPV
CRI
CUB
CYM
CYP
CZE
DEU
DJI
DMA
DNK
DOM
DZA
ECU
EGY
ERI
ESH
ESP
EST
ETH
FIN
FJI
FRA
FRO
FSM
GAB
GBR
GEO
GGY
GHA
GIN
GLP
GMB
GNB
GNQ
GRC
GRD
GRL
GTM
GUF
GUM
GUY
HKG
HND
HRV
HTI
HUN
IDN
IMN
IND
IRL
IRN
IRQ
ISL
ISR
ITA
JAM
JEY
JOR
JPN
KAZ
KEN
KGZ
KHM
KNA
KOR
KWT
LAO
LBN
LBR
LBY
LCA
LIE
LKA
LSO
LTU
LUX
LVA
MAC
MAR
MDA
MDG
MEX
MKD
MLI
MLT
MMR
MNE
MNG
MNP
MOZ
MRT
MSR
MTQ
MUS
MWI
MYS
MYT
NAM
NCL
NER
NGA
NIC
NLD
NOR
NPL
NRU
NZL
OMN
PAK
PAN
PER
PHL
PLW
PNG
POL
PRI
PRK
PRT
PRY
PSE
PYF
QAT
REU
ROU
RUS
Something went wrong...
Status code:  400
Error message:  Your are using a "group by" query that produces too many results. Please reduce the number of rows your "group by" query produces (ie. more restrictive "where" clause or use le

In [49]:
# Merge this and the next block into function calls
# Add a combined score (significance weighted by the current alert count).
top_adm1 = top_adm1.assign(
    score=top_adm1['significance'] * top_adm1['current_alerts']
)
# Twenty regions with the highest combined score.
top_score = top_adm1.sort_values(by='score', ascending=False).iloc[:20]
# First twenty regions with more than 50 current alerts, in the order the
# rows already have in top_adm1.
top_sig = top_adm1.loc[lambda frame: frame['current_alerts'] > 50].iloc[:20]
print(top_score)
print(top_sig)
# top_adm1.head(5)

     country  adm1  significance  current_alerts        score
420      COD     5      2.885729          1248.0  3601.390017
417      COD     1      2.804475          1030.0  2888.608851
1540     MMR    13      1.751586           861.0  1508.115704
431      COD    20      2.754413           515.0  1418.522538
2210     SSD    10      2.660426           459.0  1221.135713
975      IND    23      2.869867           379.0  1087.679705
311      CAF     9      2.731986           352.0   961.659151
315      CAF     5      2.261719           419.0   947.660437
317      CAF    13      2.939026           314.0   922.854259
1548     MMR     7      2.278280           347.0   790.562994
428      COD    25      2.437059           306.0   745.740054
2207     SSD     4      2.429851           278.0   675.498636
2211     SSD     8      2.347963           261.0   612.818446
323      CAF     6      2.613221           226.0   590.587870
1543     MMR     3      2.419791           206.0   498.476859
961     

In [52]:
# Export both rankings to a dated workbook, one worksheet per ranking.
today = dt.today()
filename = 'top_20_output_' + today.strftime('%Y%m%d') + '.xlsx'
# The context manager writes and closes the workbook on exit, even if one of
# the to_excel calls raises. It replaces the explicit writer.save() call,
# which newer pandas releases no longer provide.
with ExcelWriter(filename, engine='xlsxwriter') as writer:
    # Write each dataframe to a different worksheet
    top_sig.to_excel(writer, sheet_name='top_significance', index=False)
    top_score.to_excel(writer, sheet_name='top_score', index=False)