## Create dictionary for age/gender/destination based on age_gender_bkts.csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the age/gender population buckets.
ag = pd.read_csv('./input/age_gender_bkts.csv')
# Upper-case the gender labels (the user table printed below uses
# upper-case 'FEMALE' / 'MALE').
ag['gender'] = ag['gender'].map(str.upper)

In [3]:
# Load the user table; in this section only its gender column is inspected.
df_all = pd.read_csv('./input/train_users.csv')

In [4]:
# Frequency of each gender label in the user table.
df_all.gender.value_counts()

-unknown-    95692
FEMALE       63049
MALE         54443
OTHER          282
Name: gender, dtype: int64

In [5]:
# Distinct destination countries present in the bucket table, in
# value_counts() order.
country_list = ag['country_destination'].value_counts().index.tolist()

In [6]:
# (rows, columns) of the bucket table.
ag.shape

(420, 5)

In [7]:
# Lower bound of every age bucket, parsed to int right away. Buckets look
# like '25-29', with '100+' as the open-ended top bucket. Parsing here keeps
# the column a single integer type instead of a mix of strings and the int 100.
ag['min_age'] = [100 if a == '100+' else int(a.split("-")[0]) for a in ag.age_bucket]

# The raw bucket label and the year column are not used after this point.
ag = ag.drop(['age_bucket', 'year'], axis=1)
# Total population per (country_destination, gender); used later as the
# denominator of the relative population.
ag_agg = ag.groupby(['country_destination', 'gender'])['population_in_thousands'].sum()

In [8]:
# Show the total population per (country_destination, gender).
ag_agg

country_destination  gender
AU                   FEMALE     12024
                     MALE       11899
CA                   FEMALE     18066
                     MALE       17805
DE                   FEMALE     41997
                     MALE       40565
ES                   FEMALE     23870
                     MALE       23333
FR                   FEMALE     33493
                     MALE       31490
GB                   FEMALE     32345
                     MALE       31495
IT                   FEMALE     31405
                     MALE       29740
NL                   FEMALE      8486
                     MALE        8362
PT                   FEMALE      5468
                     MALE        5141
US                   FEMALE    165053
                     MALE      160079
Name: population_in_thousands, dtype: float64

In [9]:
# For genders without their own buckets ('-unknown-' and 'OTHER') use the
# combined FEMALE + MALE population of the country as the total.
for c in country_list:
    both_genders = ag_agg[c, 'FEMALE'] + ag_agg[c, 'MALE']
    ag_agg[c, '-unknown-'] = both_genders
    ag_agg[c, 'OTHER'] = both_genders

In [10]:
# Show the totals again, now with the '-unknown-' and 'OTHER' entries.
ag_agg

country_destination  gender   
AU                   FEMALE        12024
                     MALE          11899
CA                   FEMALE        18066
                     MALE          17805
DE                   FEMALE        41997
                     MALE          40565
ES                   FEMALE        23870
                     MALE          23333
FR                   FEMALE        33493
                     MALE          31490
GB                   FEMALE        32345
                     MALE          31495
IT                   FEMALE        31405
                     MALE          29740
NL                   FEMALE         8486
                     MALE           8362
PT                   FEMALE         5468
                     MALE           5141
US                   FEMALE       165053
                     MALE         160079
                     -unknown-    325132
                     OTHER        325132
PT                   -unknown-     10609
                     OTHER

In [11]:
# Population summed over genders for each (country_destination, min_age).
# The value column is selected explicitly so the string 'gender' column is
# never passed to sum(); the double brackets keep the result a DataFrame,
# which the in-place reset_index() in the next cell requires.
ag_t = ag.groupby(['country_destination', 'min_age'])[['population_in_thousands']].sum()

In [12]:
# Turn the (country_destination, min_age) group keys back into ordinary columns.
ag_t.reset_index(inplace = True)

In [13]:
# Number of rows in the per-gender bucket table.
ag.shape[0]

420

In [14]:
# The gender-summed table is reused twice: a copy labelled 'OTHER' and the
# original labelled '-unknown-'. A scalar is broadcast to every row.
ag_o = ag_t.copy()
ag_o['gender'] = 'OTHER'
ag_t['gender'] = '-unknown-'

In [15]:
# Stack the per-gender rows with the 'OTHER' and '-unknown-' rows.
# NOTE: row labels are kept as-is (no ignore_index), so labels repeat across
# the three pieces; select rows with boolean masks, not by label.
ag = pd.concat([ag, ag_o, ag_t], axis=0)

In [16]:
# Share of each bucket in the total population of its (country, gender).
# A plain list is assigned so values are placed by position rather than
# aligned on the row labels.
bucket_columns = zip(ag['population_in_thousands'],
                     ag['country_destination'],
                     ag['gender'])
ag['rel_pop'] = [pop / ag_agg[country][sex] for pop, country, sex in bucket_columns]
# Ensure the bucket lower bound is an integer column.
ag['min_age'] = ag['min_age'].astype(int)

In [17]:
# Spot-check: rows for destination 'PT' and gender 'FEMALE'.
ag[(ag.country_destination == 'PT') & (ag.gender == 'FEMALE')]

Unnamed: 0,country_destination,gender,min_age,population_in_thousands,rel_pop
336,PT,FEMALE,80,194,0.035479
337,PT,FEMALE,40,418,0.076445
338,PT,FEMALE,65,313,0.057242
339,PT,FEMALE,60,341,0.062363
341,PT,FEMALE,85,115,0.021031
344,PT,FEMALE,15,266,0.048647
346,PT,FEMALE,90,45,0.00823
347,PT,FEMALE,25,299,0.054682
348,PT,FEMALE,95,9,0.001646
349,PT,FEMALE,55,366,0.066935


In [18]:
def find_countries(age, gender, factor=750, table=None, countries=None):
    """Score every destination country for one (age, gender) pair.

    The age is mapped to the lower bound of its 5-year bucket, the matching
    rows are looked up in the bucket table, and each country's ``rel_pop``
    is multiplied by ``factor`` and rounded to an int.

    Args:
        age: Age in years.
        gender: Gender label matched against the table's ``gender`` column.
        factor: Multiplier applied to ``rel_pop`` before rounding.
        table: DataFrame with ``country_destination``, ``gender``,
            ``min_age`` and ``rel_pop`` columns. Defaults to the
            module-level ``ag``.
        countries: Iterable of country codes to score. Defaults to the
            module-level ``country_list``.

    Returns:
        ``{country: score}`` when ``age < 105``; otherwise the tuple
        ``('None', 'None')`` (kept unchanged for existing callers).

    Raises:
        ValueError: If a country does not have exactly one matching row.
    """
    if not age < 105:
        return ('None', 'None')

    if table is None:
        table = ag
    if countries is None:
        countries = country_list

    # Floor division: under Python 3 a plain "/" yields a float, so
    # (age / 5) * 5 would just give the age back and miss the bucket.
    ma = (age // 5) * 5     # minimum age of the bucket
    sdf = table[(table.gender == gender) & (table.min_age == ma)]

    dst_rank = {}
    for c in countries:
        rel_pop = sdf.loc[sdf.country_destination == c, 'rel_pop']
        # .item() extracts the single matching value explicitly instead of
        # relying on int() accepting a one-element Series.
        dst_rank[c] = int(np.round(rel_pop.item() * factor))
    return dst_rank

In [19]:
# Lookup table: (age, user gender) -> {country: score}.
# Each user gender is paired with the gender whose buckets are queried;
# note that '-unknown-' users are scored with the 'MALE' buckets.
gender_pairs = (
    ('MALE', 'MALE'),
    ('FEMALE', 'FEMALE'),
    ('-unknown-', 'MALE'),
    ('OTHER', 'OTHER'),
)

c_am = {}  # country with best age-match
for user_g, target_g in gender_pairs:
    for a in range(1, 105):
        c_am[a, user_g] = find_countries(a, target_g)


In [20]:
# Display the finished lookup: (age, gender) -> {country: score}.
c_am

{(1, '-unknown-'): {'AU': 52,
  'CA': 44,
  'DE': 33,
  'ES': 41,
  'FR': 48,
  'GB': 47,
  'IT': 37,
  'NL': 41,
  'PT': 35,
  'US': 51},
 (1, 'FEMALE'): {'AU': 49,
  'CA': 41,
  'DE': 31,
  'ES': 38,
  'FR': 43,
  'GB': 44,
  'IT': 33,
  'NL': 39,
  'PT': 31,
  'US': 47},
 (1, 'MALE'): {'AU': 52,
  'CA': 44,
  'DE': 33,
  'ES': 41,
  'FR': 48,
  'GB': 47,
  'IT': 37,
  'NL': 41,
  'PT': 35,
  'US': 51},
 (1, 'OTHER'): {'AU': 50,
  'CA': 43,
  'DE': 32,
  'ES': 39,
  'FR': 46,
  'GB': 45,
  'IT': 35,
  'NL': 40,
  'PT': 33,
  'US': 49},
 (2, '-unknown-'): {'AU': 52,
  'CA': 44,
  'DE': 33,
  'ES': 41,
  'FR': 48,
  'GB': 47,
  'IT': 37,
  'NL': 41,
  'PT': 35,
  'US': 51},
 (2, 'FEMALE'): {'AU': 49,
  'CA': 41,
  'DE': 31,
  'ES': 38,
  'FR': 43,
  'GB': 44,
  'IT': 33,
  'NL': 39,
  'PT': 31,
  'US': 47},
 (2, 'MALE'): {'AU': 52,
  'CA': 44,
  'DE': 33,
  'ES': 41,
  'FR': 48,
  'GB': 47,
  'IT': 37,
  'NL': 41,
  'PT': 35,
  'US': 51},
 (2, 'OTHER'): {'AU': 50,
  'CA': 43,
  'DE': 3

In [21]:
import pickle
from os import listdir, makedirs, getcwd, remove
from os.path import isfile, join, abspath, exists, isdir

# Make sure the output directory exists before writing into it.
if not exists('./intermediate'):
    makedirs('./intermediate')
# Write through a context manager so the file is flushed and closed as soon
# as the dump finishes, rather than whenever the handle is collected.
with open('./intermediate/dst_dict_f750.pkl', 'wb') as pkl_file:
    pickle.dump(c_am, pkl_file)

In [22]:
ls -l intermediate

total 2987360
-rw-r--r--  1 hitoshinagano  503   26156492 22 Mar 14:53 df_all
-rw-r--r--  1 hitoshinagano  503      43751 28 Mar 08:13 dst_dict_f750.pkl
-rw-r--r--  1 hitoshinagano  503  275412392 21 Mar 10:54 session_features.csv
-rw-r--r--  1 hitoshinagano  503  275412392 28 Mar 08:13 session_features_1000.csv
-rw-r--r--  1 hitoshinagano  503  410910441 22 Mar 09:48 session_features_1500.csv
-rw-r--r--  1 hitoshinagano  503  139859301 21 Mar 23:22 session_features_500.csv
-rw-r--r--  1 hitoshinagano  503  166961081 22 Mar 14:10 session_features_600.csv
-rw-r--r--  1 hitoshinagano  503  234761845 22 Mar 14:48 session_features_850.csv
