In [105]:
import pandas as pd
from pathlib import Path

In [106]:
# Location of the input CSV (by its file name, the processed long-format spending matrix).
# NOTE(review): absolute, machine-specific Windows path -- the notebook will not run on
# another machine without editing this line; consider a path relative to a configurable data dir.
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\denmark_spending\data\spending_matrix_full_long_processed.csv")

In [107]:
# Read the full CSV into memory. low_memory=False disables pandas' chunked parsing so each
# column's dtype is inferred from the whole file (the region-code columns mix integers with
# the string 'foreign', as the rendered rows further down show).
df = pd.read_csv(inputfilepath, low_memory=False)
# Display (n_rows, n_columns) as a quick sanity check of the load.
df.shape

(3841992, 7)

In [108]:
# Preview the first rows to confirm the column layout used by the checks below.
df.head()

Unnamed: 0,spender_region,spender_region_name,spender_sector,recipient_region,recipient_region_name,recipient_sector,spend
0,101,København,1,101,København,1,50520576.0
1,101,København,1,101,København,2,8743208.0
2,101,København,1,101,København,3,64380656.0
3,101,København,1,101,København,4,24944372.0
4,101,København,1,101,København,5,15440209.0


In [109]:
# Region names whose rows are kept in the output of get_spend_share; shares are still
# computed against the total over all regions, this list only limits what is shown.
list_regionstocheck = ['København', 'Aalborg', 'Aarhus','Foreign']

In [110]:
def get_spend_share(df, groupby_col_name, list_regionstocheck, selected_region=None, spender_sectors=None, recipient_sectors=None):
    """Sum spend per group and report each listed group's percentage share.

    The rows of ``df`` are optionally filtered (by counterpart region and by
    spender/recipient sector), summed over ``groupby_col_name``, and every
    group's share of the total filtered spend is computed in percent, rounded
    to two decimals. Only groups named in ``list_regionstocheck`` are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'spend'`` column, ``groupby_col_name``, and any of
        ``'spender_region_name'``, ``'recipient_region_name'``,
        ``'spender_sector'``, ``'recipient_sector'`` that the filters use.
    groupby_col_name : str
        Column to aggregate by. ``'recipient_region_name'`` and
        ``'spender_region_name'`` are the two values for which a region
        filter is defined.
    list_regionstocheck : list-like
        Values of ``groupby_col_name`` to keep in the returned frame.
    selected_region : optional
        If given, keep only rows whose *counterpart* region column (spender
        when grouping by recipient, and vice versa) equals this value.
    spender_sectors, recipient_sectors : list-like, optional
        If given, keep only rows whose sector is in the collection. Any
        list-like accepted by ``Series.isin`` works, including pandas Series
        and NumPy arrays.

    Returns
    -------
    pandas.DataFrame
        Columns ``groupby_col_name``, ``'spend'`` and ``'spend_share'``,
        restricted to the groups in ``list_regionstocheck``.

    Raises
    ------
    ValueError
        If ``selected_region`` is given but ``groupby_col_name`` is not one
        of the two region-name columns, so no counterpart column exists.
    """
    # The region filter always applies to the opposite side of the flow
    # from the one being grouped on.
    if groupby_col_name == 'recipient_region_name':
        regionfilter_col_name = 'spender_region_name'
    elif groupby_col_name == 'spender_region_name':
        regionfilter_col_name = 'recipient_region_name'
    else:
        regionfilter_col_name = None

    df_check = df

    # Use identity checks against None: `!= None` on a Series/ndarray is
    # evaluated element-wise and cannot be used as an `if` condition.
    if selected_region is not None:
        if regionfilter_col_name is None:
            raise ValueError(
                "selected_region requires groupby_col_name to be "
                "'recipient_region_name' or 'spender_region_name', got "
                f"{groupby_col_name!r}"
            )
        df_check = df_check[df_check[regionfilter_col_name] == selected_region]
    if spender_sectors is not None:
        df_check = df_check[df_check['spender_sector'].isin(spender_sectors)]
    if recipient_sectors is not None:
        df_check = df_check[df_check['recipient_sector'].isin(recipient_sectors)]

    df_check = df_check.groupby(groupby_col_name)['spend'].agg('sum').reset_index()
    # Shares are relative to the total over ALL groups, computed before the
    # result is narrowed to the requested regions.
    df_check['spend_share'] = (100 * df_check['spend'] / df_check['spend'].sum()).round(2)
    df_check = df_check[df_check[groupby_col_name].isin(list_regionstocheck)]
    return df_check

In [111]:
# Checks
# 'Spending Destinations' groups by 'recipient_region_name': the spender region is chosen and
# we ask how much that spender region spends in each recipient region.
# 'Revenue Sources' groups by 'spender_region_name': the recipient region is chosen and
# we ask how much that recipient region receives from each spender region.

def _print_check(header, result):
    """Print a check's header, its result frame, and a blank-line separator."""
    print(header)
    print(result)
    print('\n')


# Tests for 'Spending Destinations' sheet
print("Tests for 'Spending Destinations' sheet \n")

# No filters at all.
df_check_1 = get_spend_share(df, 'recipient_region_name', list_regionstocheck)
_print_check('df_check_1:\n', df_check_1)

# Spender region only.
df_check_2 = get_spend_share(df, 'recipient_region_name', list_regionstocheck,
                             selected_region='Aalborg')
_print_check('df_check_2:\n', df_check_2)

# Spender region + one spender sector.
df_check_3 = get_spend_share(df, 'recipient_region_name', list_regionstocheck,
                             selected_region='Aalborg', spender_sectors=[25])
_print_check('df_check_3:\n', df_check_3)

# Spender region + one recipient sector.
df_check_4 = get_spend_share(df, 'recipient_region_name', list_regionstocheck,
                             selected_region='Aalborg', recipient_sectors=[3])
_print_check('df_check_4:\n', df_check_4)

# Spender region + several spender and recipient sectors.
df_check_5 = get_spend_share(df, 'recipient_region_name', list_regionstocheck,
                             selected_region='Aalborg',
                             spender_sectors=[25, 26, 27, 28],
                             recipient_sectors=[2, 3])
_print_check('df_check_5:\n', df_check_5)

# Tests for 'Revenue Sources' sheet
print("Tests for 'Revenue Sources' sheet \n")

# No filters at all.
df_check_6 = get_spend_share(df, 'spender_region_name', list_regionstocheck)
_print_check('df_check_6:\n', df_check_6)

# Recipient region only.
df_check_7 = get_spend_share(df, 'spender_region_name', list_regionstocheck,
                             selected_region='Aalborg')
_print_check('df_check_7:\n', df_check_7)

# Recipient region + one spender sector.
df_check_8 = get_spend_share(df, 'spender_region_name', list_regionstocheck,
                             selected_region='Aalborg', spender_sectors=[25])
_print_check('df_check_8:\n', df_check_8)

# Recipient region + one recipient sector.
df_check_9 = get_spend_share(df, 'spender_region_name', list_regionstocheck,
                             selected_region='Aalborg', recipient_sectors=[3])
_print_check('df_check_9:\n', df_check_9)

# Recipient region + several spender and recipient sectors.
df_check_10 = get_spend_share(df, 'spender_region_name', list_regionstocheck,
                              selected_region='Aalborg',
                              spender_sectors=[25, 26, 27, 28],
                              recipient_sectors=[2, 3])
_print_check('df_check_10:\n', df_check_10)

Tests for 'Spending Destinations' sheet 

df_check_1:

   recipient_region_name         spend  spend_share
1                Aalborg  2.809067e+09         2.04
2                 Aarhus  9.093291e+09         6.60
18               Foreign  1.600943e+10        11.63
51             København  2.784716e+10        20.22


df_check_2:

   recipient_region_name         spend  spend_share
1                Aalborg  1.813951e+09        55.76
2                 Aarhus  1.026537e+08         3.16
18               Foreign  3.395729e+08        10.44
51             København  3.487041e+08        10.72


df_check_3:

   recipient_region_name         spend  spend_share
1                Aalborg  4.143405e+07        53.05
2                 Aarhus  3.256970e+06         4.17
18               Foreign  1.332680e+07        17.06
51             København  7.154202e+06         9.16


df_check_4:

   recipient_region_name         spend  spend_share
1                Aalborg  6.385287e+08        86.59
2               

## Tests

In [None]:
# Check 1: Spender Region = (All), Spender Sector = (All), Recipient Sector = (All)

In [40]:
# Manual, step-by-step version of check 1 (no region or sector filters):
# start by grouping every row by its recipient region. This yields a lazy
# groupby object; nothing is aggregated yet.
df_check1 = df.groupby('recipient_region_name')

In [76]:
# NOTE(review): this cell's execution count (76) is higher than the cells below it, and its
# stored output already has the 'spend' and 'spend_share' columns -- i.e. it was run after the
# aggregation cells further down. On a fresh top-to-bottom run df_check1 is still the groupby
# object at this point, so the displayed output will differ.
df_check1

Unnamed: 0,recipient_region_name,spend,spend_share
0,Aabenraa,4.835869e+08,0.35
1,Aalborg,2.809067e+09,2.04
2,Aarhus,9.093291e+09,6.60
3,Albertslund,9.690734e+08,0.70
4,Allerød,3.868411e+08,0.28
...,...,...,...
94,Vejle,1.997018e+09,1.45
95,Vesthimmerlands,1.614975e+08,0.12
96,Viborg,7.898710e+08,0.57
97,Vordingborg,4.586157e+08,0.33


In [41]:
# Some possible handy commands for inspecting a groupby object.
# first() shows the first row of every group (one row per recipient region).
df_check1.first()

Unnamed: 0_level_0,spender_region,spender_region_name,spender_sector,recipient_region,recipient_sector,spend
recipient_region_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aabenraa,101,København,1,580,1,6630.750500
Aalborg,101,København,1,851,1,155766.720000
Aarhus,101,København,1,751,1,668476.880000
Albertslund,101,København,1,165,1,34154.387000
Allerød,101,København,1,201,1,13223.701000
...,...,...,...,...,...,...
Vejle,101,København,1,630,1,42382.754000
Vesthimmerlands,101,København,1,820,1,68.142403
Viborg,101,København,1,791,1,25592.176000
Vordingborg,101,København,1,390,1,33915.664000


In [42]:
# get_group() returns all original rows belonging to a single group, here every
# row whose recipient region is 'København'.
df_check1.get_group('København')

Unnamed: 0,spender_region,spender_region_name,spender_sector,recipient_region,recipient_region_name,recipient_sector,spend
0,101,København,1,101,København,1,50520576.0
1,101,København,1,101,København,2,8743208.0
2,101,København,1,101,København,3,64380656.0
3,101,København,1,101,København,4,24944372.0
4,101,København,1,101,København,5,15440209.0
...,...,...,...,...,...,...,...
3840615,foreign,Foreign,28,101,København,10,
3840616,foreign,Foreign,28,101,København,11,
3840617,foreign,Foreign,28,101,København,12,
3840618,foreign,Foreign,28,101,København,17,


In [43]:
# Total spend per recipient region; reset_index() turns the group key back into a column.
# NOTE(review): this rebinds df_check1 from a groupby object to a DataFrame, so the cell is
# not re-runnable on its own -- rerun the groupby cell first.
df_check1 = df_check1['spend'].agg('sum').reset_index()
df_check1

Unnamed: 0,recipient_region_name,spend
0,Aabenraa,4.835869e+08
1,Aalborg,2.809067e+09
2,Aarhus,9.093291e+09
3,Albertslund,9.690734e+08
4,Allerød,3.868411e+08
...,...,...
94,Vejle,1.997018e+09
95,Vesthimmerlands,1.614975e+08
96,Viborg,7.898710e+08
97,Vordingborg,4.586157e+08


In [44]:
# Each region's spend as a percentage of the total over all regions, rounded to 2 decimals.
df_check1['spend_share'] = (100 * df_check1['spend'] / df_check1['spend'].sum()).round(2)

In [46]:
# Keep only the regions under review. Reuse the shared list_regionstocheck (defined near the
# top of the notebook with the same four names) instead of repeating the literal, so this
# manual check cannot drift from the get_spend_share-based checks.
df_check1[df_check1['recipient_region_name'].isin(list_regionstocheck)]

Unnamed: 0,recipient_region_name,spend,spend_share
1,Aalborg,2809067000.0,2.04
2,Aarhus,9093291000.0,6.6
18,Foreign,16009430000.0,11.63
51,København,27847160000.0,20.22
