In [1]:
import pandas as pd
pd.options.display.max_rows=6
import numpy

import datetime
import glob
import json
import math
import sys

In [2]:
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy scalars and arrays.

    pandas aggregations such as ``value_counts`` and ``sum`` hand back NumPy
    values, which the stock encoder rejects with ``TypeError``.
    """

    def default(self, obj):
        """Convert NumPy values to native Python; defer everything else to the base class.

        Raises
        ------
        TypeError
            From the base class, for any object that is still not serialisable.
        """
        if isinstance(obj, numpy.integer):
            return int(obj)
        if isinstance(obj, numpy.floating):
            # float64 is already a Python float subclass, but float32/float16 are not.
            return float(obj)
        if isinstance(obj, numpy.bool_):
            return bool(obj)
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return super(MyEncoder, self).default(obj)
        
def extract_placement_cat(p):
    """Classify a placement name into a single placement category.

    Parameters
    ----------
    p : str or missing value
        Placement name. Matching is a case-sensitive substring test.

    Returns
    -------
    str
        ``'direct'``, ``'prospecting'``, ``'retargeting'`` or ``'geo-fence'``
        when exactly one of those keywords occurs in ``p``; ``'others'`` when
        none or several occur, or when ``p`` is not a string (e.g. the NaN a
        left merge leaves for an unmatched Placement ID).
    """
    if not isinstance(p, str):
        # A NaN/None placement used to raise TypeError on the `in` test.
        return 'others'
    matched = [cat for cat in ('direct', 'prospecting', 'retargeting', 'geo-fence') if cat in p]
    return matched[0] if len(matched) == 1 else 'others'
    
def extract_data(df, ica='i'):
    """Attach time features and match-table labels to a log-level DataFrame.

    ``date``, ``hour`` and ``weekday`` are written onto the passed-in ``df``
    itself, derived from ``Event Time`` (microseconds since the epoch, converted
    with ``datetime.fromtimestamp``, i.e. in the machine's local timezone).
    The notebook-level match tables are then left-joined one after another and
    the joined frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Log rows carrying ``Event Time`` and the ID columns joined on below.
    ica : str, default 'i'
        Pass ``'a'`` for activity files, which have no city or DMA columns, to
        skip those joins; any other value performs them.

    Returns
    -------
    pandas.DataFrame
        New frame with the label columns and ``placement_cat`` appended.
    """
    day_names = 'monday tuesday wednesday thursday friday saturday sunday'.split()
    stamps = df['Event Time'].apply(lambda micros: datetime.datetime.fromtimestamp(micros/1000000))
    df['date'] = stamps.apply(lambda ts: ts.strftime('%Y-%m-%d'))
    df['hour'] = stamps.apply(lambda ts: ts.strftime('%H'))
    df['weekday'] = stamps.apply(lambda ts: day_names[ts.weekday()])

    os_devices = operating_systems[['Operating System ID', 'device']]
    df = df.merge(os_devices, on='Operating System ID', how='left')

    # Activity files do not have city and DMA columns.
    if ica != 'a':
        df = df.merge(cities, on='City ID', how='left')
        df['state_city'] = df['State/Region'] + '_' + df['City']
        df = df.merge(dmas, on='Designated Market Area (DMA) ID', how='left')

    # The creatives join must precede the URL join: it supplies 'Creative ID'.
    label_joins = (
        (dbmsites[['DBM Site', 'DBM Site ID']], 'DBM Site ID'),
        (dbmlineitems[['DBM Line Item', 'DBM Line Item ID']], 'DBM Line Item ID'),
        (creatives[['Rendering ID', 'Creative ID', 'Creative',
                    'Creative Type', 'Creative Pixel Size']], 'Rendering ID'),
        (urls[['Creative ID', 'Ad Click URL']], 'Creative ID'),
        (placements[['Placement ID', 'Placement']], 'Placement ID'),
    )
    for right, key in label_joins:
        df = df.merge(right, on=key, how='left')

    df['placement_cat'] = df['Placement'].apply(extract_placement_cat)

    df = df.merge(browsers, on='Browser/Platform ID', how='left')
    df = df.merge(ads[['Ad ID', 'Ad']], on='Ad ID', how='left')
    return df

# match tables

In [3]:
# Directory holding the DCM match-table exports; edit this one constant to relocate the data.
MATCH_TABLE_DIR = '/home/jubauser1/jzou/dcm_account7252/match_tables/'


def _read_match_table(filename):
    """Load one match-table CSV (compression inferred from the .gz suffix) from MATCH_TABLE_DIR."""
    return pd.read_csv(MATCH_TABLE_DIR + filename)


operating_systems = _read_match_table('dcm_account7252_match_table_operating_systems_20170924_20170925_032758_614633451.csv.gz')
sites = _read_match_table('dcm_account7252_match_table_sites_20170924_20170925_032904_614624942.csv.gz')
placements = _read_match_table('dcm_account7252_match_table_placements_20170924_20170925_030250_614632857.csv.gz')
creatives = _read_match_table('dcm_account7252_match_table_creatives_20170924_20170925_030249_614632856.csv.gz')
browsers = _read_match_table('dcm_account7252_match_table_browsers_20170924_20170925_030246_614632858.csv.gz')
cities = _read_match_table('dcm_account7252_match_table_cities_20170924_20170925_030246_614624939.csv.gz')
activity_cats = _read_match_table('dcm_account7252_match_table_activity_cats_20170924_20170925_031531_614632854.csv.gz')
dmas = _read_match_table('dcm_account7252_match_table_designated_market_areas_20170924_20170925_032802_614624944.csv.gz')
ads = _read_match_table('dcm_account7252_match_table_ads_20170924_20170925_030249_614633450.csv.gz')
urls = _read_match_table('dcm_account7252_match_table_creative_ad_assignments_20170924_20170925_033006_614624943.csv.gz')

# Register alias rows: for each exponent i, add an entry with ID i that carries
# the name of the existing match-table row whose ID is 2**i.
# NOTE(review): presumably some logs report the exponent instead of the bit
# value for these systems - confirm. If an ID listed here already exists in the
# table, or this cell is run twice, the key is duplicated and later left joins
# on 'Operating System ID' will multiply rows.
additional_os = []
for i in (1, 3, 5, 7, 10, 13, 21, 22):
    names = operating_systems.loc[
        operating_systems['Operating System ID'] == 2 ** i, 'Operating System']
    # .iloc[0] raises IndexError when the table has no row with ID 2**i.
    additional_os.append({'Operating System ID': i, 'Operating System': names.iloc[0]})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
operating_systems = pd.concat(
    [operating_systems, pd.DataFrame(additional_os)], ignore_index=True)

# Operating-system names grouped by the device class they are reported under.
_MOBILE_OS_NAMES = [
    'Android', 'Apple iOS', 'BlackBerry', 'DoCoMo',
    'Nokia Maemo', 'Nokia Series 60', 'NokiaSeries60',
    'Palm OS', 'Palm webOS', 'PalmWebOS', 'Samsung Bada',
    'Windows Mobile', 'Windows Phone', 'Windows Phone 7+', 'WindowsPhone',
    'iOS',
]
_DESKTOP_OS_NAMES = [
    'Linux', 'Macintosh',
    'Microsoft Windows 10', 'Microsoft Windows 2000', 'Microsoft Windows 7',
    'Microsoft Windows 8', 'Microsoft Windows 95', 'Microsoft Windows 98',
    'Microsoft Windows NT', 'Microsoft Windows Unknown Version',
    'Microsoft Windows Vista', 'Microsoft Windows XP',
    'PS4', 'PSP', 'Playstation3', 'Playstation4', 'Roku',
    'Unix', 'Unix, Linux', 'Unix, unknown',
    'Wii', 'Wii U', 'WiiU',
    'Windows 2000', 'Windows 7', 'Windows 98', 'Windows NT',
    'Windows Vista', 'Windows XP', 'Windows, version unknown',
    'Windows10', 'Windows8',
    'Xbox 360', 'Xbox One', 'Xbox360', 'XboxOne',
]

# Map every operating-system name to its device class; the three catch-all
# names keep their own labels.
os_to_device = {
    '(not set)': '(not set)',
    'Other': 'other',
    'Other/Unknown': 'Other/Unknown',
}
os_to_device.update((name, 'mobile') for name in _MOBILE_OS_NAMES)
os_to_device.update((name, 'desktop') for name in _DESKTOP_OS_NAMES)
# Report every operating system missing from os_to_device at once (still as a
# KeyError) instead of failing on the first one encountered.
unmapped_os = [name for name in operating_systems['Operating System'].unique()
               if name not in os_to_device]
if unmapped_os:
    raise KeyError('operating systems missing from os_to_device: %r' % (unmapped_os,))
operating_systems['device'] = operating_systems['Operating System'].map(os_to_device)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


### dcm match tables

In [9]:
# ID -> name tables from the DCM reports. IDs are cast to str and the
# (name, ID) pairs de-duplicated.
dbmsites = (
    pd.read_csv(
        '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/dbmSite_20170602_to_2017-09-24.csv')
    .assign(**{'DBM Site ID': lambda report: report['DBM Site ID'].astype(str)})
    [['DBM Site', 'DBM Site ID']]
    .drop_duplicates()
    .copy()
)

dbmlineitems = (
    pd.read_csv(
        '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/dbmLineItem_20170602_to_2017-09-24.csv')
    .assign(**{'DBM Line Item ID': lambda report: report['DBM Line Item ID'].astype(str)})
    [['DBM Line Item', 'DBM Line Item ID']]
    .drop_duplicates()
    .copy()
)

  interactivity=interactivity, compiler=compiler, result=result)


# impression

In [29]:
# Column names of the impression log. The list is not referenced by the read
# below. NOTE(review): presumably meant for `usecols` - confirm or remove.
columns_impr = ['Campaign ID', 'Ad ID', 'Browser/Platform ID', 'City ID', 
                'State/Region', 'Designated Market Area (DMA) ID', 'Event Time',
                'Operating System ID', 'Placement ID', 'Rendering ID', 
                'DBM Site ID', 'DBM Line Item ID', 'DBM Billable Cost (USD)',
                'DBM Media Cost (USD)', 'DBM Total Media Cost (USD)', 'DBM Revenue (USD)']

# Load the impression log. `df` is rebound later in the notebook to the click
# and activity frames, so cell order matters.
df = pd.read_csv(
    '/home/jubauser1/jzou/dcm_account7252/winery_weekly/impr/impression_winery_reduced_2017090411_to_2017092509.csv')

In [30]:
def _left_lookup(frame, table, key, columns):
    """Fetch `columns` from `table` for every row of `frame`, matched on `key`.

    The match table is de-duplicated on `key` (first occurrence wins), so the
    left join returns exactly one row per row of `frame`, in the same order.
    The result is then given `frame`'s index, so assigning its columns back
    cannot misalign. Previously a duplicated key in a match table lengthened
    the merged frame and the index-aligned assignment silently attached values
    to the wrong rows.
    """
    right = table[[key] + columns].drop_duplicates(subset=key)
    matched = frame[[key]].merge(right, on=key, how='left')
    matched.index = frame.index
    return matched[columns]


# Time features from the microsecond 'Event Time' (local timezone via fromtimestamp).
event_time = df['Event Time'].apply(lambda x: datetime.datetime.fromtimestamp(x/1000000))
df['date'] = event_time.apply(lambda x: x.strftime('%Y-%m-%d'))
df['hour'] = event_time.apply(lambda x: x.strftime('%H'))
weekday = 'monday tuesday wednesday thursday friday saturday sunday'.split()
df['weekday'] = event_time.apply(lambda x: weekday[x.weekday()])
del event_time

# Labels are looked up one key at a time and only the needed columns kept, so
# no full-width copy of the impression frame is made.
df['device'] = _left_lookup(df, operating_systems, 'Operating System ID', ['device'])['device']

df['City'] = _left_lookup(df, cities, 'City ID', ['City'])['City']
df['state_city'] = df['State/Region'] + '_' + df['City']

df['Designated Market Area (DMA)'] = _left_lookup(
    df, dmas, 'Designated Market Area (DMA) ID',
    ['Designated Market Area (DMA)'])['Designated Market Area (DMA)']

# NOTE(review): dbmsites/dbmlineitems hold their IDs as str; confirm the
# impression file's ID columns have a matching dtype.
df['DBM Site'] = _left_lookup(df, dbmsites, 'DBM Site ID', ['DBM Site'])['DBM Site']
df['DBM Line Item'] = _left_lookup(
    df, dbmlineitems, 'DBM Line Item ID', ['DBM Line Item'])['DBM Line Item']

# 'Creative ID' must exist before the click-URL lookup, which is keyed on it.
creative_columns = ['Creative ID', 'Creative', 'Creative Pixel Size']
creative_info = _left_lookup(df, creatives, 'Rendering ID', creative_columns)
for column in creative_columns:
    df[column] = creative_info[column]
del creative_info

df['Ad Click URL'] = _left_lookup(df, urls, 'Creative ID', ['Ad Click URL'])['Ad Click URL']

df['Placement'] = _left_lookup(df, placements, 'Placement ID', ['Placement'])['Placement']
df['placement_cat'] = df['Placement'].apply(extract_placement_cat)

df['Browser/Platform'] = _left_lookup(
    df, browsers, 'Browser/Platform ID', ['Browser/Platform'])['Browser/Platform']

df['Ad'] = _left_lookup(df, ads, 'Ad ID', ['Ad'])['Ad']

In [31]:
df.columns

Index(['Event Time', 'Campaign ID', 'Ad ID', 'Rendering ID', 'Placement ID',
       'State/Region', 'Browser/Platform ID', 'Operating System ID',
       'Designated Market Area (DMA) ID', 'City ID', 'DBM Line Item ID',
       'DBM Site ID', 'DBM Media Cost (USD)', 'DBM Revenue (USD)',
       'DBM Total Media Cost (USD)', 'DBM Billable Cost (USD)', 'date', 'hour',
       'weekday', 'device', 'City', 'state_city',
       'Designated Market Area (DMA)', 'DBM Site', 'DBM Line Item',
       'Creative ID', 'Creative', 'Creative Pixel Size', 'Ad Click URL',
       'Placement', 'placement_cat', 'Browser/Platform', 'Ad'],
      dtype='object')

In [32]:
# Dimensions ("views") the impressions are broken down by.
views = [
    'date', 'hour', 'weekday', 'state_city',
    'Designated Market Area (DMA)', 'device',
    'Browser/Platform', 'Placement',
    'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size',
    'DBM Line Item', 'DBM Site'
]

# placement category -> view -> {'counts', 'cost', 'revenue'} -> {view value: total}
count_by_view_impr = {}
for placement_cat, placement_group in df.groupby('placement_cat'):
    per_view = count_by_view_impr[placement_cat] = {}
    for view in views:
        # Impressions per view value.
        impressions = placement_group[view].value_counts()
        by_value = placement_group.groupby(view)
        per_view[view] = {
            'counts': dict(impressions),
            'cost': dict(by_value['DBM Media Cost (USD)'].sum()),
            'revenue': dict(by_value['DBM Revenue (USD)'].sum()),
        }
        del by_value

In [33]:
count_by_view_impr['geo-fence'].keys()

dict_keys(['date', 'hour', 'weekday', 'state_city', 'Designated Market Area (DMA)', 'device', 'Browser/Platform', 'Placement', 'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size', 'DBM Line Item', 'DBM Site'])

In [34]:
count_by_view_impr['geo-fence']['hour']['counts'].keys()

dict_keys(['21', '22', '19', '18', '09', '15', '00', '20', '23', '12', '10', '17', '14', '13', '16', '11', '08', '07', '01', '06', '02', '05', '03', '04'])

In [None]:
# Normalise the hour keys to two-digit strings ('9' or 9 -> '09').
# The keys produced by strftime('%H') are already strings, so they go through
# int() first: '%02d' % '09' raises TypeError.
for placement_cat in ['direct', 'geo-fence', 'prospecting', 'retargeting']:
    if placement_cat not in count_by_view_impr:
        # A category with no impressions in this file has no entry to rewrite.
        continue
    for col in ['counts', 'cost', 'revenue']:
        by_hour = count_by_view_impr[placement_cat]['hour'][col]
        # Snapshot the keys: the dict is modified while being walked.
        for k in list(by_hour.keys()):
            by_hour['%02d' % int(k)] = by_hour.pop(k)

In [35]:
# Write the weekly impression aggregates. The `with` block closes the file, so
# the JSON is fully flushed to disk before later cells read it back.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/impr_json/counts_by_placement_by_view_winery_impr_2017090411_to_2017092509.json', 'w') as impr_json_file:
    json.dump(count_by_view_impr, impr_json_file, cls=MyEncoder)

# click

In [20]:
# Load the click log; the 'hour' column is read as str rather than letting
# pandas infer a dtype for it.
click = pd.read_csv('/home/jubauser1/jzou/dcm_account7252/winery_weekly/click_winery_1_campaign.csv', 
                    dtype={'hour': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:
def _id_to_str(value):
    """Render an ID as a digit string (123.0 -> '123'); leave missing values untouched.

    pd.isnull is used instead of numpy.isnan because the latter raises
    TypeError when a value has been read as a string.
    """
    if pd.isnull(value):
        return value
    return str(int(float(value)))


# The DBM name tables key on string IDs, so convert before joining.
for id_column in ['DBM Line Item ID', 'DBM Site ID']:
    click[id_column] = click[id_column].apply(_id_to_str)

In [22]:
# Enrich the click log (default ica='i' keeps the city/DMA joins).
# extract_data also writes date/hour/weekday onto `click` itself.
df = extract_data(click)

In [23]:
# Dimensions ("views") the clicks are broken down by.
views = [
    'date', 'hour', 'weekday', 'state_city',
    'Designated Market Area (DMA)', 'device',
    'Browser/Platform', 'Placement',
    'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size',
    'DBM Line Item', 'DBM Site',
]

# placement category -> view -> {'counts', 'cost', 'revenue'} -> {view value: total}
count_by_view_click = {}
for placement_cat, placement_group in df.groupby('placement_cat'):
    per_view = count_by_view_click[placement_cat] = {}
    for view in views:
        # Clicks per view value.
        clicks = placement_group[view].value_counts()
        view_group = placement_group.groupby(view)
        per_view[view] = {
            'counts': dict(clicks),
            'cost': dict(view_group['DBM Media Cost (USD)'].sum()),
            'revenue': dict(view_group['DBM Revenue (USD)'].sum()),
        }

In [25]:
count_by_view_click['geo-fence'].keys()

dict_keys(['date', 'hour', 'weekday', 'state_city', 'Designated Market Area (DMA)', 'device', 'Browser/Platform', 'Placement', 'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size', 'DBM Line Item', 'DBM Site'])

In [26]:
# Write the click aggregates; the `with` block closes (and flushes) the file.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_click.json', 'w') as click_json_file:
    json.dump(count_by_view_click, click_json_file, cls=MyEncoder)

# activity

In [36]:
# Load the activity (conversion) log.
activity = pd.read_csv('/home/jubauser1/jzou/dcm_account7252/winery_weekly/activity_winery_1_campaign.csv')

In [37]:
def _id_to_str(value):
    """Render an ID as a digit string (123.0 -> '123'); leave missing values untouched.

    pd.isnull is used instead of numpy.isnan because the latter raises
    TypeError when a value has been read as a string.
    """
    if pd.isnull(value):
        return value
    return str(int(float(value)))


# The DBM name tables key on string IDs, so convert before joining.
for id_column in ['DBM Line Item ID', 'DBM Site ID']:
    activity[id_column] = activity[id_column].apply(_id_to_str)

In [38]:
# Attach the activity name to each row; the left join keeps rows whose
# Activity ID has no match.
activity = activity.merge(activity_cats[['Activity ID', 'Activity']], on='Activity ID', how='left')

In [39]:
# Enrich the activity log; ica='a' skips the city/DMA joins.
# extract_data also writes date/hour/weekday onto `activity` itself.
df = extract_data(activity, ica='a')

In [40]:
# Dimensions the activity log is broken down by. No city/DMA views here:
# extract_data(..., ica='a') does not add those columns.
views_act = [
    'date', 'hour', 'weekday', 'device',
    'Browser/Platform', 'Placement', 
    'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size',
    # ID-keyed views replaced by the name-keyed ones below.
    #'DBM Line Item ID', 'DBM Site ID',
    'DBM Line Item', 'DBM Site',
]

# Activity names that each get their own 'hva_<name>' breakdown in the
# aggregation below.
hvas = [
    'All Products',
    'Checkout',
    'Diamond - Cabernet Sauvignon page',
    'Diamond - Chardonnay page',
    'Diamond - Claret page',
    'Diamond - Diamond Red Blend page',
    'Diamond - Malbec page',
    'Diamond - Merlot page',
    'Diamond - Pavilion page',
    'Diamond - Pinot Grigio page',
    'Diamond - Pinot Noir page',
    'Diamond - Sauvignon Blanc page',
    'Diamond - Syrah Shiraz page',
    'Diamond - Zinfandel page',
    'Diamond Adventure landing page',
    'Diamond Quiz',
    'FFC Home Page',
    'Membership',
    'Membership-Join now',
    'Our Wines Diamond Collection',
    'Shop - Diamond Collection Wines',
    'Shop Now',
    'Shop online - Wine',
    'Shopping Cart',
    'Store Locator',
    'Visit Location']

# placement category -> view -> {'HVA', 'Transaction', 'hva_<name>'} -> {view value: conversions}
count_by_view_act = dict()
for placement_cat, placement_group in df.groupby('placement_cat'):
    # 'Thank You Page' rows count as transactions; everything else is an HVA.
    is_transaction = placement_group['Activity'] == 'Thank You Page'
    # The row subsets depend only on the placement group, so they are built
    # once here rather than once per view.
    subsets = [
        ('HVA', placement_group[~is_transaction]),
        ('Transaction', placement_group[is_transaction]),
    ]
    subsets += [
        ('hva_%s' % hva, placement_group[placement_group['Activity'] == hva])
        for hva in hvas
    ]
    count_by_view_act[placement_cat] = dict()
    for view in views_act:
        count_by_view_act[placement_cat][view] = {
            label: dict(rows.groupby(view)['Total Conversions'].sum())
            for label, rows in subsets
        }
    del subsets

In [41]:
# Write the activity aggregates; the `with` block closes (and flushes) the file.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_activity.json', 'w') as activity_json_file:
    json.dump(count_by_view_act, activity_json_file, cls=MyEncoder)

# generate reports

In [42]:
import json
from collections import Counter
import pandas as pd
pd.options.display.max_rows=6

### json feed

In [43]:
def _load_json(path):
    """Read one JSON file and close it again."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Cumulative aggregates written by earlier runs of this notebook.
counts_by_placement_by_view_impr = _load_json(
    '/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_impr_est_revenue_dbm_site_line_item.json')

counts_by_placement_by_view_click = _load_json(
    '/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_click.json')

counts_by_placement_by_view_act = _load_json(
    '/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_activity.json')

In [44]:
# Aggregates of the latest weekly impression file; the `with` block closes the handle.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/impr_json/counts_by_placement_by_view_winery_impr_2017090411_to_2017092509.json', 'r') as impr_weekly_file:
    impr_weekly = json.load(impr_weekly_file)

In [38]:
impr_weekly.keys(), impr_weekly['prospecting'].keys(), impr_weekly['prospecting']['weekday'].keys()

(dict_keys(['direct', 'geo-fence', 'prospecting', 'retargeting']),
 dict_keys(['date', 'hour', 'weekday', 'state_city', 'Designated Market Area (DMA)', 'device', 'Browser/Platform', 'Placement', 'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size', 'DBM Line Item ID', 'DBM Site ID']),
 dict_keys(['counts', 'cost', 'revenue']))

In [45]:
views = [
    'date', 'hour', 'weekday', 'device', 'state_city', 'Designated Market Area (DMA)',
    'Browser/Platform', 'Placement', 'DBM Line Item', 'DBM Site',
    'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size', 
]


def _add_totals(base, extra):
    """Return the key-wise sum of two {key: number} mappings.

    Unlike ``Counter(base) + Counter(extra)``, keys whose total is zero or
    negative are kept; Counter addition silently discards them.
    """
    merged = dict(base)
    for key, value in extra.items():
        merged[key] = merged.get(key, 0) + value
    return merged


# Fold the weekly impression aggregates into the cumulative ones.
# Not idempotent: running this cell twice adds the week twice.
for placement_cat in ['direct', 'geo-fence', 'prospecting', 'retargeting']:
    if placement_cat not in impr_weekly:
        continue
    weekly = impr_weekly[placement_cat]
    if placement_cat not in counts_by_placement_by_view_impr:
        counts_by_placement_by_view_impr[placement_cat] = weekly
        continue
    cumulative = counts_by_placement_by_view_impr[placement_cat]
    for view in views:
        if view not in weekly:
            continue
        if view not in cumulative:
            cumulative[view] = weekly[view]
            continue
        for col in ['counts', 'cost', 'revenue']:
            cumulative[view][col] = _add_totals(cumulative[view][col], weekly[view][col])

In [46]:
# Persist the merged impression aggregates; the `with` block closes (and flushes) the file.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_impr.json', 'w') as impr_total_file:
    json.dump(counts_by_placement_by_view_impr, impr_total_file)

### dcm feed

In [47]:
# DCM reports keyed by the view they supply.
dcm_reports = {
    'Designated Market Area (DMA)': pd.read_csv(
        '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/dma_20170602_to_2017-09-24.csv'),
    'state_city': pd.read_csv(
        '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/state_city_20170602_to_2017-09-24.csv'),
}
# Build the same '<state>_<city>' key the log-level frames use.
city_report = dcm_reports['state_city']
city_report['state_city'] = city_report['State/Region']+'_'+ city_report['City']

# Activity names that each get their own 'hva_<name>' breakdown in the report
# aggregation below. Same entries as the list in the activity section, plus
# '(not set)'.
hvas = [
    'All Products',
    'Checkout',
    'Diamond - Cabernet Sauvignon page',
    'Diamond - Chardonnay page',
    'Diamond - Claret page',
    'Diamond - Diamond Red Blend page',
    'Diamond - Malbec page',
    'Diamond - Merlot page',
    'Diamond - Pavilion page',
    'Diamond - Pinot Grigio page',
    'Diamond - Pinot Noir page',
    'Diamond - Sauvignon Blanc page',
    'Diamond - Syrah Shiraz page',
    'Diamond - Zinfandel page',
    'Diamond Adventure landing page',
    'Diamond Quiz',
    'FFC Home Page',
    'Membership',
    'Membership-Join now',
    'Our Wines Diamond Collection',
    'Shop - Diamond Collection Wines',
    'Shop Now',
    'Shop online - Wine',
    'Shopping Cart',
    'Store Locator',
    'Visit Location',
    '(not set)']

# Replace the DMA and state_city activity breakdowns with figures from the DCM reports.
for view, report in dcm_reports.items():
    report['placement_cat'] = report['Placement'].apply(extract_placement_cat)
    for placement_cat, placement_group in report.groupby('placement_cat'):
        totals = counts_by_placement_by_view_act[placement_cat][view] = dict()
        thank_you = placement_group['Activity'] == 'Thank You Page'
        # Every activity other than 'Thank You Page' counts towards HVA ...
        non_transactions = placement_group[placement_group['Activity'] != 'Thank You Page']
        totals['HVA'] = dict(non_transactions.groupby(view)['Total Conversions'].sum())
        # ... and 'Thank You Page' rows are the transactions.
        transactions = placement_group[thank_you]
        totals['Transaction'] = dict(transactions.groupby(view)['Total Conversions'].sum())
        for hva in hvas:
            hva_rows = placement_group[placement_group['Activity'] == hva]
            totals['hva_%s'%hva] = dict(hva_rows.groupby(view)['Total Conversions'].sum())

In [48]:
views = [
    'date', 'hour', 'weekday', 'device', 'state_city', 'Designated Market Area (DMA)',
    'Browser/Platform', 'Placement', 
    'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size', 
    'DBM Line Item', 'DBM Site',
]

# Columns written to every sheet, in order.
report_columns = [
    'cost', 'Impression', 'revenue', 'Click', 'HVA', 'Transaction',
    'ROI', 'CPRevenue', 'CPTransaction', 'CPHVA',
    'CPC', 'CTR', 'CPStore_locator_Visit_location',
    'hva_All Products', 'hva_Checkout',
    'hva_Diamond - Cabernet Sauvignon page',
    'hva_Diamond - Chardonnay page', 'hva_Diamond - Claret page',
    'hva_Diamond - Diamond Red Blend page', 'hva_Diamond - Malbec page',
    'hva_Diamond - Merlot page', 'hva_Diamond - Pavilion page',
    'hva_Diamond - Pinot Grigio page', 'hva_Diamond - Pinot Noir page',
    'hva_Diamond - Sauvignon Blanc page', 'hva_Diamond - Syrah Shiraz page',
    'hva_Diamond - Zinfandel page', 'hva_Diamond Adventure landing page',
    'hva_Diamond Quiz', 'hva_FFC Home Page', 'hva_Membership',
    'hva_Membership-Join now', 'hva_Our Wines Diamond Collection',
    'hva_Shop - Diamond Collection Wines', 'hva_Shop Now',
    'hva_Shop online - Wine', 'hva_Shopping Cart', 'hva_Store Locator',
    'hva_Visit Location',
]

# One workbook per placement category, one sheet per view.
for placement_cat in ['direct', 'geo-fence', 'prospecting', 'retargeting']:
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.0.
    with pd.ExcelWriter('/home/jubauser1/jzou/winery/%s_20170924.xlsx'%placement_cat) as writer:
        for view in views:
            impr_view = pd.DataFrame(counts_by_placement_by_view_impr[placement_cat][view])
            impr_view = impr_view.rename(columns={'counts': 'Impression'})

            click_view = pd.DataFrame(counts_by_placement_by_view_click[placement_cat][view])['counts']
            click_view.name = 'Click'

            activity_view = pd.DataFrame(counts_by_placement_by_view_act[placement_cat][view])

            merged_view = impr_view.join(click_view, how='outer').join(activity_view, how='outer')

            # Cost and revenue are stored scaled by 1e9; bring them back to currency units.
            merged_view['cost'] = merged_view['cost'] / 1000000000
            merged_view['revenue'] = merged_view['revenue'] / 1000000000
            merged_view['ROI'] = merged_view['revenue'] / merged_view['cost']
            merged_view['CPRevenue'] = merged_view['cost'] / merged_view['revenue']
            merged_view['CPTransaction'] = merged_view['cost'] / merged_view['Transaction']
            merged_view['CPHVA'] = merged_view['cost'] / merged_view['HVA']
            merged_view['CPC'] = merged_view['cost'] / merged_view['Click']
            merged_view['CTR'] = merged_view['Click'] / merged_view['Impression']

            # Assign the filled column back: fillna(inplace=True) on a selected
            # column is not guaranteed to update the parent frame.
            merged_view['hva_Store Locator'] = merged_view['hva_Store Locator'].fillna(0)
            merged_view['hva_Visit Location'] = merged_view['hva_Visit Location'].fillna(0)
            merged_view['CPStore_locator_Visit_location'] = merged_view['cost'] / (
                merged_view['hva_Store Locator'] + merged_view['hva_Visit Location'])

            # The date sheet keeps its index order; the other sheets are ranked.
            if view != 'date':
                if placement_cat == 'geo-fence':
                    merged_view = merged_view.sort_values(
                        by=['CPStore_locator_Visit_location'], ascending=[True])
                else:
                    merged_view = merged_view.sort_values(
                        by=['ROI', 'CPHVA'], ascending=[False, True])

            # '/' is not allowed in sheet names (e.g. 'Browser/Platform').
            merged_view[report_columns].to_excel(writer, sheet_name=view.replace('/', '_'))

# impr site name

In [58]:
# Re-key the ID-based DBM breakdowns by name and store them as additional views.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_impr_est_revenue.json', 'r') as source_file:
    temp = json.load(source_file)

# (view keyed by ID, ID -> name table, name column / new view key)
name_lookups = (
    ('DBM Site ID', dbmsites, 'DBM Site'),
    ('DBM Line Item ID', dbmlineitems, 'DBM Line Item'),
)
for placement_cat in ['direct', 'geo-fence', 'prospecting', 'retargeting']:
    for id_view, names, name_column in name_lookups:
        temp_df = pd.DataFrame(temp[placement_cat][id_view])
        temp_df = temp_df.merge(names, left_index=True, right_on=id_view, how='left')
        temp_df.index = temp_df[name_column]
        # NOTE(review): to_dict() keeps only one row per index label, so IDs
        # that share a name (or have none) are collapsed rather than summed.
        temp[placement_cat][name_column] = temp_df[['cost', 'counts', 'revenue']].to_dict()

with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_impr_est_revenue_dbm_site_line_item.json', 'w') as target_file:
    json.dump(temp, target_file, cls=MyEncoder)

# est revenue

In [7]:
# Rows dated before 2017-07-27 take the estimated cost as their media cost.
# Vectorised: DataFrame.set_value no longer exists in pandas 1.0, and a
# row-by-row loop is slow on log-sized frames.
early_rows = df['date'] < '2017-07-27'
df.loc[early_rows, 'DBM Media Cost (USD)'] = df.loc[early_rows, 'estimated_cost']

In [74]:
# Daily DCM report, used to add report revenue to the impression aggregates.
date = pd.read_csv('/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/date_20170602_to_2017-07-25.csv')

date['placement_cat'] = date['Placement'].apply(extract_placement_cat)

# Scale revenue by 1e9 to match the log-level figures. Round before int():
# plain truncation turns a float product like 1099999999.9999998 into
# 1099999999 instead of 1100000000.
date['Total Revenue'] = date['Total Revenue'].apply(lambda x: int(round(x * 1000000000)))

In [98]:
date.columns

Index(['Date', 'Placement', 'Activity', 'Activity ID', 'Impressions', 'Clicks',
       'DBM Cost (Account Currency)', 'Media Cost', 'Total Conversions',
       'Total Revenue', 'placement_cat'],
      dtype='object')

In [95]:
# Add the report revenue per date to the impression aggregates.
# Summed explicitly: Counter addition drops every key whose total is zero or
# negative. Not idempotent - running this cell twice adds the revenue twice.
for placement_cat, placement_group in date.groupby('placement_cat'):
    d = dict(placement_group.groupby('Date')['Total Revenue'].sum())
    revenue_by_date = dict(counts_by_placement_by_view_impr[placement_cat]['date']['revenue'])
    for key, value in d.items():
        revenue_by_date[key] = revenue_by_date.get(key, 0) + value
    counts_by_placement_by_view_impr[placement_cat]['date']['revenue'] = revenue_by_date

In [99]:
# Hourly DCM report. `names=` is given without `header=`, so the file's first
# line is read as a data row (row 0). Every column therefore comes in as
# strings; later cells drop row 0 and cast 'Total Revenue' to float.
hour = pd.read_csv('/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/hour_20170602_to_2017-07-25.csv',
                   names = ['hour', 'Placement', 'Activity', 'Activity ID', 'Impressions', 'Clicks',
                            'DBM Cost (Account Currency)', 'Media Cost', 'Total Conversions', 'Total Revenue'])
hour

Unnamed: 0,hour,Placement,Activity,Activity ID,Impressions,Clicks,DBM Cost (Account Currency),Media Cost,Total Conversions,Total Revenue
0,(not set),ffc winery_q2-q3 diamond digital_programmatic_...,(not set).1,(not set).2,0,0.1,584.65,0.0,0.0,0.000000000.1
1,(not set),ffc winery_q2-q3 diamond digital_programmatic_...,(not set),(not set),0,0.0,704.20,0.0,0.0,0.0
2,(not set),ffc winery_q2-q3 diamond digital_programmatic_...,(not set),(not set),0,0.0,99.83,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
11549,23,ffc winery_q2-q3 diamond digital_programmatic_...,Shop - Diamond Collection Wines,6063963,0,0.0,0.00,0.0,1.0,0.0
11550,23,ffc winery_q2-q3 diamond digital_programmatic_...,Shop online - Wine,4075982,0,0.0,0.00,0.0,5.0,0.0
11551,23,ffc winery_q2-q3 diamond digital_programmatic_...,Visit Location,4075383,0,0.0,0.00,0.0,12.0,0.0


In [105]:
# Remove row 0, which holds the file's header line (the read_csv call supplied
# `names=` without `header=`). Run once only: a second run raises KeyError
# because label 0 is gone.
hour.drop(0, inplace=True)

In [107]:
# Classify each report row by the keyword found in its placement name.
hour['placement_cat'] = hour['Placement'].apply(extract_placement_cat)

In [108]:
# Scale revenue by 1e9 to match the log-level figures. Round before int():
# plain truncation turns a float product like 1099999999.9999998 into
# 1099999999 instead of 1100000000.
hour['Total Revenue'] = hour['Total Revenue'].astype(float).apply(lambda x: int(round(x * 1000000000)))

In [109]:
dict(hour[hour['placement_cat'] == 'prospecting'].groupby('hour')['Total Revenue'].sum())

{'(not set)': 0,
 '00': 0,
 '01': 0,
 '02': 0,
 '03': 0,
 '04': 0,
 '05': 0,
 '06': 0,
 '07': 0,
 '08': 0,
 '09': 0,
 '10': 0,
 '11': 0,
 '12': 0,
 '13': 395540000000,
 '14': 0,
 '15': 124250000000,
 '16': 0,
 '17': 0,
 '18': 0,
 '19': 0,
 '20': 0,
 '21': 349250000000,
 '22': 0,
 '23': 0}

In [110]:
# Add the report revenue per hour to the impression aggregates.
# Summed explicitly: Counter addition drops every key whose total is zero or
# negative, which would remove all zero-revenue hours.
# Not idempotent - running this cell twice adds the revenue twice.
for placement_cat, placement_group in hour.groupby('placement_cat'):
    d = dict(placement_group.groupby('hour')['Total Revenue'].sum())
    revenue_by_hour = dict(counts_by_placement_by_view_impr[placement_cat]['hour']['revenue'])
    for key, value in d.items():
        revenue_by_hour[key] = revenue_by_hour.get(key, 0) + value
    counts_by_placement_by_view_impr[placement_cat]['hour']['revenue'] = revenue_by_hour

In [115]:
# Weekday names indexed by datetime.weekday() (0 = monday).
weekday = 'monday tuesday wednesday thursday friday saturday sunday'.split()


def _weekday_name(iso_date):
    """Return the lowercase weekday name for a 'YYYY-MM-DD' string."""
    parsed = datetime.datetime.strptime(iso_date, '%Y-%m-%d')
    return weekday[parsed.weekday()]


date['weekday'] = date['Date'].apply(_weekday_name)

In [117]:
# Add the report revenue per weekday to the impression aggregates.
# Summed explicitly: Counter addition drops every key whose total is zero or
# negative. Not idempotent - running this cell twice adds the revenue twice.
for placement_cat, placement_group in date.groupby('placement_cat'):
    d = dict(placement_group.groupby('weekday')['Total Revenue'].sum())
    revenue_by_weekday = dict(counts_by_placement_by_view_impr[placement_cat]['weekday']['revenue'])
    for key, value in d.items():
        revenue_by_weekday[key] = revenue_by_weekday.get(key, 0) + value
    counts_by_placement_by_view_impr[placement_cat]['weekday']['revenue'] = revenue_by_weekday

In [118]:
views

['date',
 'hour',
 'weekday',
 'device',
 'state_city',
 'Designated Market Area (DMA)',
 'Browser/Platform',
 'Placement',
 'DBM Line Item ID',
 'DBM Site ID',
 'Ad',
 'Ad Click URL',
 'Creative',
 'Creative Pixel Size']

In [123]:
# Operating-system DCM report. `names=` is given without `header=`, so the
# file's first line is read as a data row (row 0); a later cell drops it.
operating_system = pd.read_csv(
    '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/operating_system_20170602_to_2017-07-25.csv',
    names= ['operating_system', 'Placement', 'Activity', 'Activity ID', 'Impressions', 'Clicks',
       'DBM Cost (Account Currency)', 'Media Cost', 'Total Conversions',
       'Total Revenue'])

In [125]:
# Remove row 0, which holds the file's header line (the read_csv call supplied
# `names=` without `header=`). Run once only: a second run raises KeyError
# because label 0 is gone.
operating_system.drop(0, inplace=True)

In [127]:
# Operating-system names grouped by the device class they are reported under.
_MOBILE_OS_NAMES = [
    'Android', 'Apple iOS', 'BlackBerry', 'DoCoMo',
    'Nokia Maemo', 'Nokia Series 60', 'NokiaSeries60',
    'Palm OS', 'Palm webOS', 'PalmWebOS', 'Samsung Bada',
    'Windows Mobile', 'Windows Phone', 'Windows Phone 7+', 'WindowsPhone',
    'iOS',
]
_DESKTOP_OS_NAMES = [
    'Linux', 'Macintosh',
    'Microsoft Windows 10', 'Microsoft Windows 2000', 'Microsoft Windows 7',
    'Microsoft Windows 8', 'Microsoft Windows 95', 'Microsoft Windows 98',
    'Microsoft Windows NT', 'Microsoft Windows Unknown Version',
    'Microsoft Windows Vista', 'Microsoft Windows XP',
    'PS4', 'PSP', 'Playstation3', 'Playstation4', 'Roku',
    'Unix', 'Unix, Linux', 'Unix, unknown',
    'Wii', 'Wii U', 'WiiU',
    'Windows 2000', 'Windows 7', 'Windows 98', 'Windows NT',
    'Windows Vista', 'Windows XP', 'Windows, version unknown',
    'Windows10', 'Windows8',
    'Xbox 360', 'Xbox One', 'Xbox360', 'XboxOne',
]

# Map every operating-system name to its device class; the three catch-all
# names keep their own labels.
os_to_device = {
    '(not set)': '(not set)',
    'Other': 'other',
    'Other/Unknown': 'Other/Unknown',
}
os_to_device.update((name, 'mobile') for name in _MOBILE_OS_NAMES)
os_to_device.update((name, 'desktop') for name in _DESKTOP_OS_NAMES)
operating_system['device'] = operating_system['operating_system'].apply(lambda x: os_to_device[x])

In [129]:
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
operating_system['Total Revenue'] = (
    (operating_system['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
operating_system['placement_cat'] = operating_system['Placement'].apply(extract_placement_cat)

In [132]:
# Fold per-device revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in operating_system.groupby('placement_cat'):
    # Sum 'Total Revenue' per device bucket within this category.
    d = dict(placement_group.groupby('device')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['device']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [137]:
# Load the per-browser DCM report with explicit column names.
# Because `names` is given without `header`, the file's first line is read as
# a data row (presumably the original header), so row 0 is dropped afterwards.
browser_csv = '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/browser_20170602_to_2017-07-25.csv'
browser_columns = [
    'browser', 'Placement', 'Activity', 'Activity ID',
    'Impressions', 'Clicks', 'DBM Cost (Account Currency)', 'Media Cost',
    'Total Conversions', 'Total Revenue',
]
browser = pd.read_csv(browser_csv, names=browser_columns)
browser.drop(index=0, inplace=True)

In [140]:
# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
browser['placement_cat'] = browser['Placement'].apply(extract_placement_cat)
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
browser['Total Revenue'] = (
    (browser['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

In [143]:
# Fold per-browser revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in browser.groupby('placement_cat'):
    # Sum 'Total Revenue' per browser within this category.
    d = dict(placement_group.groupby('browser')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['Browser/Platform']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [145]:
# Load the per-placement DCM report (header row taken from the file) and
# display it.
placement_csv = '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/placement_20170602_to_2017-07-25.csv'
placement = pd.read_csv(placement_csv)
placement

Unnamed: 0,Placement,Activity,Activity ID,Impressions,Clicks,DBM Cost (Account Currency),Media Cost,Total Conversions,Total Revenue
0,ffc winery_q2-q3 diamond campaign_direct_tme o...,(not set),(not set),7924,9,0.0,0.0,0.0,0.0
1,ffc winery_q2-q3 diamond campaign_direct_tme o...,Diamond Adventure landing page,4918486,0,0,0.0,0.0,10.0,0.0
2,ffc winery_q2-q3 diamond campaign_direct_tme o...,FFC Home Page,6011597,0,0,0.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...
1429,ffc winery_q2-q3 diamond digital_programmatic_...,Shopping Cart,4075980,0,0,0.0,0.0,16.0,0.0
1430,ffc winery_q2-q3 diamond digital_programmatic_...,Thank You Page,4092611,0,0,0.0,0.0,3.0,361.0
1431,ffc winery_q2-q3 diamond digital_programmatic_...,Visit Location,4075383,0,0,0.0,0.0,237.0,0.0


In [146]:
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
placement['Total Revenue'] = (
    (placement['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
placement['placement_cat'] = placement['Placement'].apply(extract_placement_cat)

In [148]:
# Fold per-placement revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in placement.groupby('placement_cat'):
    # Sum 'Total Revenue' per placement name within this category.
    d = dict(placement_group.groupby('Placement')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['Placement']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [158]:
# Load the per-ad DCM report (header row taken from the file).
ad_csv = '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/ad_20170602_to_2017-07-25.csv'
ad = pd.read_csv(ad_csv)

In [160]:
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
ad['Total Revenue'] = (
    (ad['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
ad['placement_cat'] = ad['Placement'].apply(extract_placement_cat)

In [161]:
# Fold per-ad revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in ad.groupby('placement_cat'):
    # Sum 'Total Revenue' per ad within this category.
    d = dict(placement_group.groupby('Ad')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['Ad']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [151]:
# Load the per-click-through-URL DCM report (header row taken from the file).
url_csv = '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/clickThroughUrl_20170602_to_2017-07-25.csv'
url = pd.read_csv(url_csv)

In [162]:
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
url['Total Revenue'] = (
    (url['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
url['placement_cat'] = url['Placement'].apply(extract_placement_cat)

In [163]:
# Fold per-URL revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in url.groupby('placement_cat'):
    # The report column is 'Click-through URL'; the target view is 'Ad Click URL'.
    d = dict(placement_group.groupby('Click-through URL')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['Ad Click URL']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [165]:
# Load the per-creative DCM report (header row taken from the file).
creative_csv = '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/creative_20170602_to_2017-07-25.csv'
creative = pd.read_csv(creative_csv)

In [167]:
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
creative['Total Revenue'] = (
    (creative['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
creative['placement_cat'] = creative['Placement'].apply(extract_placement_cat)

In [168]:
# Fold per-creative revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in creative.groupby('placement_cat'):
    # Sum 'Total Revenue' per creative within this category.
    d = dict(placement_group.groupby('Creative')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['Creative']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [169]:
# Load the per-creative-size DCM report (header row taken from the file).
creative_size_csv = '/home/jubauser1/jzou/dcm_account7252/winery_weekly/dcm_reports/creative_size_20170602_to_2017-07-25.csv'
creative_size = pd.read_csv(creative_size_csv)

In [171]:
# Scale 'Total Revenue' by 1e9 and store it as an integer.
# Round before casting: truncating the float product can land one unit low,
# e.g. int(1.005 * 1000000000) == 1004999999.
# NOTE: not idempotent - re-running this cell scales the column by 1e9 again.
creative_size['Total Revenue'] = (
    (creative_size['Total Revenue'].astype(float) * 1000000000).round().astype('int64')
)

# Bucket each placement name into direct / prospecting / retargeting /
# geo-fence / others.
creative_size['placement_cat'] = creative_size['Placement'].apply(extract_placement_cat)

In [172]:
# Fold per-creative-size revenue totals into the impression counts, one placement category at a time.
for placement_cat, placement_group in creative_size.groupby('placement_cat'):
    # Sum 'Total Revenue' per creative pixel size within this category.
    d = dict(placement_group.groupby('Creative Pixel Size')['Total Revenue'].sum())
    view_counts = counts_by_placement_by_view_impr[placement_cat]['Creative Pixel Size']
    # Counter addition merges the two mappings key by key; it keeps only
    # keys whose combined total is strictly positive.
    combined = Counter(view_counts['revenue']) + Counter(d)
    view_counts['revenue'] = dict(combined)

In [195]:
# Write the impression counts (with the revenue figures merged in above) to JSON.
# The context manager closes and flushes the file even if serialisation
# fails part-way; the original left the handle open.
# MyEncoder converts numpy integers to plain int for the JSON encoder.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_impr_est_revenue.json', 'w') as out_file:
    json.dump(counts_by_placement_by_view_impr, out_file, cls=MyEncoder)

In [183]:
# Scratch check: a float-formatted numeric string is normalised to a plain
# integer string by going float -> int -> str (see the recorded output below).
str(int(float('13069488.0')))

'13069488'

In [205]:
# Rewrite the 'DBM Site ID' keys of the activity counts from float-formatted
# strings (e.g. '13069488.0') to plain integer strings (e.g. '13069488').
placement_cats = ['direct', 'geo-fence', 'prospecting', 'retargeting']
activity_cols = ['HVA', 'Transaction', 'hva_All Products', 'hva_Checkout', 'hva_Diamond - Cabernet Sauvignon page', 'hva_Diamond - Chardonnay page', 'hva_Diamond - Claret page', 'hva_Diamond - Diamond Red Blend page', 'hva_Diamond - Malbec page', 'hva_Diamond - Merlot page', 'hva_Diamond - Pavilion page', 'hva_Diamond - Pinot Grigio page', 'hva_Diamond - Pinot Noir page', 'hva_Diamond - Sauvignon Blanc page', 'hva_Diamond - Syrah Shiraz page', 'hva_Diamond - Zinfandel page', 'hva_Diamond Adventure landing page', 'hva_Diamond Quiz', 'hva_FFC Home Page', 'hva_Membership', 'hva_Membership-Join now', 'hva_Our Wines Diamond Collection', 'hva_Shop - Diamond Collection Wines', 'hva_Shop Now', 'hva_Shop online - Wine', 'hva_Shopping Cart', 'hva_Store Locator', 'hva_Visit Location']
for placement_cat in placement_cats:
    for col in activity_cols:
        site_counts = counts_by_placement_by_view_act[placement_cat]['DBM Site ID'][col]
        # Snapshot the keys first: the dict is modified while being walked.
        ks = list(site_counts)
        for k in ks:
            # Move the value to the normalised key; two old keys that
            # normalise to the same string would overwrite each other.
            site_counts[str(int(float(k)))] = site_counts.pop(k)

In [198]:
# Write the click counts to JSON.
# The context manager closes and flushes the file even if serialisation
# fails part-way; the original left the handle open.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_click.json', 'w') as out_file:
    json.dump(counts_by_placement_by_view_click, out_file)

In [206]:
# Write the activity counts to JSON.
# The context manager closes and flushes the file even if serialisation
# fails part-way; the original left the handle open.
with open('/home/jubauser1/jzou/dcm_account7252/winery_weekly/json/counts_by_placement_by_view_winery_activity.json', 'w') as out_file:
    json.dump(counts_by_placement_by_view_act, out_file)

In [191]:
# Inspect which view keys exist for one placement category.
# NOTE(review): `placement_cat` is not assigned in this cell; it is whatever
# value the most recently run loop left behind, so the category inspected
# depends on execution order.
counts_by_placement_by_view_impr[placement_cat].keys()

dict_keys(['date', 'hour', 'weekday', 'state_city', 'Designated Market Area (DMA)', 'device', 'Browser/Platform', 'Placement', 'Ad', 'Ad Click URL', 'Creative', 'Creative Pixel Size', 'DBM Line Item ID', 'DBM Site ID'])