In [1]:
import feather
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [15]:
train = feather.read_dataframe('../data/interim/train.ftr')
test = feather.read_dataframe('../data/interim/test.ftr')

train['totals.transactionRevenue'] = train['totals.transactionRevenue'].astype(float)
train['totals.transactionRevenue_log'] = train['totals.transactionRevenue'].apply(lambda x: np.log(x) if x==x else 0)
train['date'] = pd.to_datetime(train['date'], format='%Y%m%d')
test['date'] = pd.to_datetime(test['date'], format='%Y%m%d')

train.shape, test.shape

((903653, 56), (804684, 53))

In [3]:
traffic_columns_train = [col for col in train.columns if "trafficSource" in col]
train[traffic_columns_train].dtypes

trafficSource.adContent                              object
trafficSource.adwordsClickInfo.adNetworkType         object
trafficSource.adwordsClickInfo.criteriaParameters    object
trafficSource.adwordsClickInfo.gclId                 object
trafficSource.adwordsClickInfo.isVideoAd             object
trafficSource.adwordsClickInfo.page                  object
trafficSource.adwordsClickInfo.slot                  object
trafficSource.campaign                               object
trafficSource.campaignCode                           object
trafficSource.isTrueDirect                           object
trafficSource.keyword                                object
trafficSource.medium                                 object
trafficSource.referralPath                           object
trafficSource.source                                 object
dtype: object

In [4]:
traffic_columns_train

['trafficSource.adContent',
 'trafficSource.adwordsClickInfo.adNetworkType',
 'trafficSource.adwordsClickInfo.criteriaParameters',
 'trafficSource.adwordsClickInfo.gclId',
 'trafficSource.adwordsClickInfo.isVideoAd',
 'trafficSource.adwordsClickInfo.page',
 'trafficSource.adwordsClickInfo.slot',
 'trafficSource.campaign',
 'trafficSource.campaignCode',
 'trafficSource.isTrueDirect',
 'trafficSource.keyword',
 'trafficSource.medium',
 'trafficSource.referralPath',
 'trafficSource.source']

In [5]:
traffic_columns_test = [col for col in test.columns if "trafficSource" in col]
test[traffic_columns_test].dtypes

trafficSource.adContent                              object
trafficSource.adwordsClickInfo.adNetworkType         object
trafficSource.adwordsClickInfo.criteriaParameters    object
trafficSource.adwordsClickInfo.gclId                 object
trafficSource.adwordsClickInfo.isVideoAd             object
trafficSource.adwordsClickInfo.page                  object
trafficSource.adwordsClickInfo.slot                  object
trafficSource.campaign                               object
trafficSource.isTrueDirect                           object
trafficSource.keyword                                object
trafficSource.medium                                 object
trafficSource.referralPath                           object
trafficSource.source                                 object
dtype: object

### 欠損値

In [6]:
def find_missing(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    df = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return df.loc[~(df['Total']==0)]

miss_train = find_missing(train[traffic_columns_train])
miss_test = find_missing(test[traffic_columns_test])
display(miss_train)
display(miss_test)

Unnamed: 0,Total,Percent
trafficSource.campaignCode,903652,99.999889
trafficSource.adContent,892707,98.788694
trafficSource.adwordsClickInfo.slot,882193,97.625195
trafficSource.adwordsClickInfo.page,882193,97.625195
trafficSource.adwordsClickInfo.isVideoAd,882193,97.625195
trafficSource.adwordsClickInfo.adNetworkType,882193,97.625195
trafficSource.adwordsClickInfo.gclId,882092,97.614018
trafficSource.isTrueDirect,629648,69.678073
trafficSource.referralPath,572712,63.377425
trafficSource.keyword,502929,55.655102


Unnamed: 0,Total,Percent
trafficSource.adContent,750893,93.315264
trafficSource.adwordsClickInfo.slot,750870,93.312406
trafficSource.adwordsClickInfo.page,750870,93.312406
trafficSource.adwordsClickInfo.isVideoAd,750870,93.312406
trafficSource.adwordsClickInfo.adNetworkType,750870,93.312406
trafficSource.adwordsClickInfo.gclId,750822,93.306441
trafficSource.referralPath,569361,70.75585
trafficSource.isTrueDirect,544171,67.625428
trafficSource.keyword,391032,48.594479


### 各カラムの中身を簡易集計

In [7]:
def aggregates_train_test(colname):
    summary_train = train.groupby(colname).count()['date'].sort_values(ascending=False)
    summary_test = test.groupby(colname).count()['date'].sort_values(ascending=False)
    summary_train.name = 'train'
    summary_test.name = 'test'
    result = pd.concat([summary_train, summary_test], axis=1, sort=True, join='outer').sort_values('train', ascending=False)
    return result

In [9]:
for col in traffic_columns_test:
    print(col)
    display(aggregates_train_test(col))

trafficSource.adContent


Unnamed: 0,train,test
Google Merchandise Collection,5122.0,1640.0
Google Online Store,1245.0,528.0
Display Ad created 3/11/14,967.0,1377.0
Full auto ad IMAGE ONLY,822.0,
Ad from 12/13/16,610.0,
Ad from 11/3/16,489.0,
Display Ad created 3/11/15,392.0,
{KeyWord:Google Brand Items},251.0,167.0
{KeyWord:Google Merchandise},155.0,119.0
Ad from 11/7/16,123.0,


trafficSource.adwordsClickInfo.adNetworkType


Unnamed: 0,train,test
Google Search,21453.0,11590
Search partners,7.0,1
Content,,42223


trafficSource.adwordsClickInfo.criteriaParameters


Unnamed: 0_level_0,train,test
trafficSource.adwordsClickInfo.criteriaParameters,Unnamed: 1_level_1,Unnamed: 2_level_1
not available in demo dataset,903653,804684


trafficSource.adwordsClickInfo.gclId


Unnamed: 0,train,test
Cj0KEQjwmIrJBRCRmJ_x7KDo-9oBEiQAuUPKMufMpuG3ZdwYO8GTsjiBFd5MPHStZa9y_9NCrI8X97oaAglc8P8HAQ,70.0,
Cj0KEQjw1ee_BRD3hK6x993YzeoBEiQA5RH_BEA562M9tvl_mtnAFvtDnDqOQRp1RvxMMgwjcX1LAfwaAj4o8P8HAQ,41.0,
CJH1vbf94M8CFUElgQodyakHgQ,29.0,
Cj0KEQiAw_DEBRChnYiQ_562gsEBEiQA4LcssmB_RWgvpPnltzlzj5rGwqx5lk87wC5CjfcqzneNZewaAiAp8P8HAQ,27.0,
CjwKEAiAj7TCBRCp2Z22ue-zrj4SJACG7SBEJui6ggr6ocA-eDC2-lX7W1m5IA1c_qNbzwZVTqUanxoCb5rw_wcB,24.0,
CN_u9PaVhdACFcNahgodTCQAjw,22.0,
CjwKEAiAxKrFBRDm25f60OegtwwSJABgEC-Z0_DLPcXHm1ZTqlR1YWeWXU875yaqwupt7pGMgFEZThoCeEzw_wcB,21.0,
CNHp7Nf2ytMCFVlWDQod_IoL5A,20.0,4.0
CjwKEAiAvs7CBRC24rao6bGCoiASJABaCt5DtalFxcoSsvr2E2adUhx6z6OE0KAuVtqKzl-BcVN1-hoCNlrw_wcB,20.0,
CjwKEAjw387JBRDPtJePvOej8kASJADkV9TLLYm88mDQieFBbZeS2hrQ7p-uHNZo45H2pUP3uPLDvRoCG2Hw_wcB,18.0,


trafficSource.adwordsClickInfo.isVideoAd


Unnamed: 0_level_0,train,test
trafficSource.adwordsClickInfo.isVideoAd,Unnamed: 1_level_1,Unnamed: 2_level_1
False,21460,53814


trafficSource.adwordsClickInfo.page


Unnamed: 0,train,test
1,21362.0,52551.0
2,73.0,984.0
3,10.0,162.0
5,7.0,23.0
7,3.0,3.0
4,2.0,78.0
9,2.0,1.0
14,1.0,
12,,1.0
6,,10.0


trafficSource.adwordsClickInfo.slot


Unnamed: 0,train,test
Top,20956.0,11491
RHS,504.0,42246
Google Display Network,,77


trafficSource.campaign


Unnamed: 0,train,test
(not set),865347.0,728927.0
Data Share Promo,16403.0,16511.0
AW - Dynamic Search Ads Whole Site,14244.0,8933.0
AW - Accessories,7070.0,2645.0
test-liyuhz,392.0,
AW - Electronics,96.0,1.0
Retail (DO NOT EDIT owners nophakun and tianyu),50.0,
AW - Apparel,46.0,12.0
All Products,4.0,
Data Share,1.0,


trafficSource.isTrueDirect


Unnamed: 0_level_0,train,test
trafficSource.isTrueDirect,Unnamed: 1_level_1,Unnamed: 2_level_1
True,274005,260513


trafficSource.keyword


Unnamed: 0,train,test
(not provided),366363.0,346391.0
6qEhsCssdK0z36ri,11503.0,6221.0
(Remarketing/Content targeting),2298.0,4940.0
1hZbAqLCbjwfgOH7,2264.0,1732.0
google merchandise store,2209.0,778.0
Google Merchandise,1648.0,480.0
google store,1277.0,303.0
youtube,568.0,202.0
(User vertical targeting),489.0,25761.0
1X4Me6ZKNV0zg-jV,467.0,980.0


trafficSource.medium


Unnamed: 0_level_0,train,test
trafficSource.medium,Unnamed: 1_level_1,Unnamed: 2_level_1
organic,381561,357402
referral,330955,235323
(none),143026,130108
cpc,25326,62524
affiliate,16403,16512
cpm,6262,2798
(not set),120,17


trafficSource.referralPath


Unnamed: 0,train,test
/,75523.0,62770.0
/yt/about/,71036.0,8127.0
/analytics/web/,14620.0,18492.0
/yt/about/tr/,14599.0,1.0
/yt/about/vi/,13753.0,
/yt/about/es-419/,12735.0,1.0
/yt/about/pt-BR/,12003.0,2.0
/yt/about/th/,11430.0,
/yt/about/ru/,11193.0,1.0
/yt/about/es/,7092.0,3.0


trafficSource.source


Unnamed: 0,train,test
google,400788.0,417104.0
youtube.com,212602.0,116848.0
(direct),143028.0,130124.0
mall.googleplex.com,66416.0,54125.0
Partners,16411.0,16520.0
analytics.google.com,16172.0,21264.0
dfa,5686.0,1018.0
google.com,4669.0,3672.0
m.facebook.com,3365.0,3372.0
baidu,3356.0,1615.0


### trafficSource.adContentについて

- トラフィック参照元の広告内容。utm_content URL パラメータで設定できます。

In [7]:
train['trafficSource.adContent'].nunique(), test['trafficSource.adContent'].nunique()

(44, 51)

- train, testそれぞれの期間のみに登場する広告が多い。

In [13]:
train[train['trafficSource.adContent']=="Full auto ad IMAGE ONLY"].date.max()

20170728

In [15]:
test[test['trafficSource.adContent']=="Full auto ad IMAGE ONLY"].date.max()

nan

#### kernelの名寄せを試してみる。

In [45]:
def adcontents_mapping(x):
    if  ('google' in x):
        return 'google'
    elif '(not set)' in x or 'nan' in x:
        return x
    elif 'ad' in x:
        return 'ad'
    else:
        return 'others'

In [46]:
train['adContent_map'] = train['trafficSource.adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
test['adContent_map'] = test['trafficSource.adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')

In [47]:
display(aggregates_train_test("adContent_map"))

Unnamed: 0_level_0,train,test
adContent_map,Unnamed: 1_level_1,Unnamed: 2_level_1
others,892858,756614
google,7223,43095
ad,3572,4975


In [48]:
train['adContent_map'].isnull().sum(), train['trafficSource.adContent'].isnull().sum()

(0, 892707)

### trafficSource.adwordsClickInfo.adNetworkTypeについて

- ネットワークの種類。値は、次のいずれかになります。{“Google Search", "Content", "Search partners", "Ad Exchange", "Yahoo Japan Search", "Yahoo Japan AFS", “unknown”}

In [36]:
train['trafficSource.adwordsClickInfo.adNetworkType'].unique(), test['trafficSource.adwordsClickInfo.adNetworkType'].unique()

(array([None, 'Google Search', 'Search partners'], dtype=object),
 array([None, 'Google Search', 'Content', 'Search partners'], dtype=object))

In [41]:
def adnetworktype_mapping(x):
    if  ('google search' in x):
        return 'google search'
    else:
        return 'others'

In [42]:
train['adnetworktype_map'] = train['trafficSource.adwordsClickInfo.adNetworkType'].map(lambda x:adnetworktype_mapping(str(x).lower())).astype('str')
test['adnetworktype_map'] = test['trafficSource.adwordsClickInfo.adNetworkType'].map(lambda x:adnetworktype_mapping(str(x).lower())).astype('str')

In [43]:
display(aggregates_train_test("adnetworktype_map"))

Unnamed: 0_level_0,train,test
adnetworktype_map,Unnamed: 1_level_1,Unnamed: 2_level_1
others,882200,793094
google search,21453,11590


In [44]:
train['adnetworktype_map'].isnull().sum(), train['trafficSource.adwordsClickInfo.adNetworkType'].isnull().sum()

(0, 882193)

### trafficSource.adwordsClickInfo.gclIdについて

- Google クリック ID。  
https://www.ja.advertisercommunity.com/t5/%E3%81%9D%E3%81%AE%E4%BB%96-Google-AdWords/gclid-%E4%BB%A5%E9%99%8D%E3%81%AE%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BFurl%E3%81%AB%E9%96%A2%E3%81%99%E3%82%8B%E8%B3%AA%E5%95%8F/m-p/8814/highlight/true

In [17]:
train['trafficSource.adwordsClickInfo.gclId'].nunique(), test['trafficSource.adwordsClickInfo.gclId'].nunique()

(17774, 41317)

#### それぞれ(train, test)の期間にしか出現しないものが多い。

名寄せを結構頑張らないといけない

In [51]:
agg = aggregates_train_test("trafficSource.adwordsClickInfo.gclId")

In [53]:
len(agg), agg["train"].isnull().sum(), agg["test"].isnull().sum()

(59008, 41234, 17691)

In [55]:
agg[agg["train"].isnull() & agg["test"].notnull()].shape

(41234, 2)

In [57]:
agg[agg["train"].notnull() & agg["test"].isnull()].shape

(17691, 2)

### trafficSource.adwordsClickInfo.isVideoAdについて

- TrueView 動画広告の場合、「true」に設定されます。

In [19]:
train['trafficSource.adwordsClickInfo.isVideoAd'].unique(), test['trafficSource.adwordsClickInfo.isVideoAd'].unique()

(array([None, False], dtype=object), array([None, False], dtype=object))

### trafficSource.adwordsClickInfo.pageについて

- 広告が表示された検索結果のページ番号。

In [20]:
train['trafficSource.adwordsClickInfo.page'].nunique(), test['trafficSource.adwordsClickInfo.page'].nunique()

(8, 10)

In [59]:
aggregates_train_test("trafficSource.adwordsClickInfo.page")

Unnamed: 0,train,test
1,21362.0,52551.0
2,73.0,984.0
3,10.0,162.0
5,7.0,23.0
7,3.0,3.0
4,2.0,78.0
9,2.0,1.0
14,1.0,
12,,1.0
6,,10.0


### trafficSource.adwordsClickInfo.slotについて

- 広告の位置。値は次のいずれかになります。{“RHS", "Top"}

In [21]:
train['trafficSource.adwordsClickInfo.slot'].unique(), test['trafficSource.adwordsClickInfo.slot'].unique()

(array([None, 'Top', 'RHS'], dtype=object),
 array([None, 'Top', 'RHS', 'Google Display Network'], dtype=object))

In [60]:
aggregates_train_test("trafficSource.adwordsClickInfo.slot")

Unnamed: 0,train,test
Top,20956.0,11491
RHS,504.0,42246
Google Display Network,,77


In [61]:
def slot_mapping(x):
    if  ('top' in x):
        return 'top'
    elif  ('rhs' in x):
        return 'rhs'
    else:
        return 'others'

In [62]:
train['slot_map'] = train['trafficSource.adwordsClickInfo.slot'].map(lambda x:slot_mapping(str(x).lower())).astype('str')
test['slot_map'] = test['trafficSource.adwordsClickInfo.slot'].map(lambda x:slot_mapping(str(x).lower())).astype('str')

In [63]:
aggregates_train_test("slot_map")

Unnamed: 0,train,test
others,882193,750947
top,20956,11491
rhs,504,42246


### trafficSource.campaignについて

- キャンペーンの値。通常は utm_campaign URL パラメータで設定されます。  
http://rubymarketing.jp/blog/url_parameter

In [24]:
train['trafficSource.campaign'].nunique(), test['trafficSource.campaign'].nunique()

(10, 31)

In [64]:
display(aggregates_train_test("trafficSource.campaign"))

Unnamed: 0,train,test
(not set),865347.0,728927.0
Data Share Promo,16403.0,16511.0
AW - Dynamic Search Ads Whole Site,14244.0,8933.0
AW - Accessories,7070.0,2645.0
test-liyuhz,392.0,
AW - Electronics,96.0,1.0
Retail (DO NOT EDIT owners nophakun and tianyu),50.0,
AW - Apparel,46.0,12.0
All Products,4.0,
Data Share,1.0,


#### 時間に関係する特徴。それぞれにしかないものが怖いので名寄せ

In [68]:
def campaign_mapping(x):
    campaign = ['(not set)', 'data share promo', 'aw - dynamic search ads whole site',
                'aw - accessories']
    if x in campaign:
        return x
    else:
        return 'others'

In [69]:
train['campaign_map'] = train['trafficSource.campaign'].map(lambda x:campaign_mapping(str(x).lower())).astype('str')
test['campaign_map'] = test['trafficSource.campaign'].map(lambda x:campaign_mapping(str(x).lower())).astype('str')

In [70]:
display(aggregates_train_test("campaign_map"))

Unnamed: 0,train,test
(not set),865347,728927
data share promo,16403,16511
aw - dynamic search ads whole site,14244,8933
aw - accessories,7070,2645
others,589,47668


In [75]:
revenue = train[train["campaign_map"]=="others"].groupby("trafficSource.campaign")["totals.transactionRevenue"].median()
size = train[train["campaign_map"]=="others"].groupby("trafficSource.campaign").size()
pd.concat([revenue, size], axis=1).sort_values(0, ascending=False)

Unnamed: 0_level_0,totals.transactionRevenue,0
trafficSource.campaign,Unnamed: 1_level_1,Unnamed: 2_level_1
test-liyuhz,42000000.0,392
AW - Electronics,,96
Retail (DO NOT EDIT owners nophakun and tianyu),22950000.0,50
AW - Apparel,77820000.0,46
All Products,,4
Data Share,,1


### trafficSource.isTrueDirectについて

- セッションの参照元がノーリファラーの場合（ユーザーがブラウザに URL を直接入力したか、ブックマークを利用してウェブサイトにアクセスした場合）、この値は「true」に設定されます。また、連続していても互いに独立する 2 つのセッションにまったく同じキャンペーンの詳細がある場合も、「true」に設定されます。そうでない場合は「null」に設定されます。

In [26]:
train['trafficSource.isTrueDirect'].unique(), test['trafficSource.isTrueDirect'].unique()

(array([None, True], dtype=object), array([True, None], dtype=object))

In [89]:
train["slot"] = train["trafficSource.adwordsClickInfo.slot"].astype(str)
train.groupby(["slot"]).size()

slot
None    882193
RHS        504
Top      20956
dtype: int64

### trafficSource.keywordについて

- 参照元のキーワード。通常は trafficSource.medium が「organic」または「cpc」の場合に設定されます。utm_term URL パラメータで設定できます。

In [27]:
train['trafficSource.keyword'].nunique(), test['trafficSource.keyword'].nunique()

(3659, 2415)

In [83]:
train['trafficSource.keyword'].astype(str).isnull().sum()

0

### trafficSource.mediumについて

- トラフィック参照元のメディア。「organic」、「cpc」、「referral」、または utm_medium URL パラメータの値となります。

In [29]:
train['trafficSource.medium'].unique(), test['trafficSource.medium'].unique()

(array(['organic', 'referral', 'cpc', 'affiliate', '(none)', 'cpm',
        '(not set)'], dtype=object),
 array(['organic', 'cpc', 'cpm', '(none)', 'referral', 'affiliate',
        '(not set)'], dtype=object))

In [30]:
train['trafficSource.medium'].nunique(), test['trafficSource.medium'].nunique()

(7, 7)

### trafficSource.referralPathについて

- trafficSource.medium が「referral」の場合、この値は参照経路に設定されます（参照のホスト名は trafficSource.source に示されます）。

In [31]:
train['trafficSource.referralPath'].nunique(), test['trafficSource.referralPath'].nunique()

(1475, 2197)

### trafficSource.sourceについて

- トラフィックの参照元。検索エンジンの名前、参照ホスト名、または utm_source URL パラメータの値となります。

In [23]:
train['trafficSource.source'].nunique(), test['trafficSource.source'].nunique()

(380, 324)

#### kernelの名寄せを試してみる

In [24]:
def source_mapping(x):
    if  ('google' in x):
        return 'google'
    elif  ('youtube' in x):
        return 'youtube'
    elif '(not set)' in x or 'nan' in x:
        return x
    elif 'yahoo' in x:
        return 'yahoo'
    elif 'facebook' in x:
        return 'facebook'
    elif 'reddit' in x:
        return 'reddit'
    elif 'bing' in x:
        return 'bing'
    elif 'quora' in x:
        return 'quora'
    elif 'outlook' in x:
        return 'outlook'
    elif 'linkedin' in x:
        return 'linkedin'
    elif 'pinterest' in x:
        return 'pinterest'
    elif 'ask' in x:
        return 'ask'
    elif 'siliconvalley' in x:
        return 'siliconvalley'
    elif 'lunametrics' in x:
        return 'lunametrics'
    elif 'amazon' in x:
        return 'amazon'
    elif 'mysearch' in x:
        return 'mysearch'
    elif 'qiita' in x:
        return 'qiita'
    elif 'messenger' in x:
        return 'messenger'
    elif 'twitter' in x:
        return 'twitter'
    elif 't.co' in x:
        return 't.co'
    elif 'vk.com' in x:
        return 'vk.com'
    elif 'search' in x:
        return 'search'
    elif 'edu' in x:
        return 'edu'
    elif 'mail' in x:
        return 'mail'
    elif 'ad' in x:
        return 'ad'
    elif 'golang' in x:
        return 'golang'
    elif 'direct' in x:
        return 'direct'
    elif 'dealspotr' in x:
        return 'dealspotr'
    elif 'sashihara' in x:
        return 'sashihara'
    elif 'phandroid' in x:
        return 'phandroid'
    elif 'baidu' in x:
        return 'baidu'
    elif 'mdn' in x:
        return 'mdn'
    elif 'duckduckgo' in x:
        return 'duckduckgo'
    elif 'seroundtable' in x:
        return 'seroundtable'
    elif 'metrics' in x:
        return 'metrics'
    elif 'sogou' in x:
        return 'sogou'
    elif 'businessinsider' in x:
        return 'businessinsider'
    elif 'github' in x:
        return 'github'
    elif 'gophergala' in x:
        return 'gophergala'
    elif 'yandex' in x:
        return 'yandex'
    elif 'msn' in x:
        return 'msn'
    elif 'dfa' in x:
        return 'dfa'
    elif '(not set)' in x:
        return '(not set)'
    elif 'feedly' in x:
        return 'feedly'
    elif 'arstechnica' in x:
        return 'arstechnica'
    elif 'squishable' in x:
        return 'squishable'
    elif 'flipboard' in x:
        return 'flipboard'
    elif 't-online.de' in x:
        return 't-online.de'
    elif 'sm.cn' in x:
        return 'sm.cn'
    elif 'wow' in x:
        return 'wow'
    elif 'baidu' in x:
        return 'baidu'
    elif 'partners' in x:
        return 'partners'
    else:
        return 'others'

In [25]:
train['source_map'] = train['trafficSource.source'].map(lambda x:source_mapping(str(x).lower())).astype('str')
test['source_map'] = test['trafficSource.source'].map(lambda x:source_mapping(str(x).lower())).astype('str')

In [26]:
display(aggregates_train_test("source_map"))

Unnamed: 0,train,test
google,499205,520977.0
youtube,212697,116875.0
direct,143028,130124.0
partners,16411,16520.0
facebook,6523,5523.0
dfa,5686,1018.0
baidu,3503,1689.0
reddit,2106,2081.0
siliconvalley,2097,
qiita,1818,1581.0
