In [1]:
import requests
import re
from string import ascii_lowercase
import random
from collections import defaultdict
from csv import writer

### Extract brand preference from click data

In this notebook, I'm extracting the brand preference per client from click data.


##### Input
Click data, for example from email clicks or visit data.

##### Output
CSV with ranked brand preference per client. 

##### Example output
rank,email,brand,score,description\
1,k.dreus@gmail.com,Nike,5,primary\
2,k.dreus@gmail.com,Adidas,3,secondary\
3,k.dreus@gmail.com,Under Armour,2,secondary\
1,koen@gmail.com,Adidas,3,primary\
2,koen@gmail.com,Nike,1,secondary\
etc.

In [2]:
# Number of favorite brands to keep per customer. The CSV export at the end of the
# notebook writes the top-ranked brand as 'primary' and the following ranks as
# 'secondary', stopping once this many rows have been written for a customer.
most_favorite_brands = 3

In [3]:
# Download the nelson.nl homepage and extract site-relative links with the regex library.
# Only anchors of the exact form <a href="/.../"> are collected, i.e. paths that start
# and end with a slash.
response = requests.get('https://www.nelson.nl/', timeout=10)  # never hang forever on a stalled connection
response.raise_for_status()  # fail loudly instead of silently parsing an error page

# Match against the decoded HTML (response.text) rather than the repr of the raw bytes.
# The pattern is a raw string (no invalid '\/' escapes) and [^"]* keeps the match
# inside a single href value, so it can never run on into a later attribute or tag.
links = re.findall(r'<a href="(/[^"]*/)">', response.text)

In [4]:
# Create a dataset with URLs and check the validity of each URL (response code is 200).
valid_url_list = []

# A session reuses the underlying connection for the many requests to the same host.
with requests.Session() as session:
    for url in links:
        try:
            # Every extracted link already starts with '/', so it is appended to the
            # bare host; adding another '/' would request 'https://www.nelson.nl//...'.
            url_response = session.get(f'https://www.nelson.nl{url}', timeout=10)
        except requests.RequestException:
            # An unreachable or timed-out URL is simply not valid; keep checking the rest.
            continue
        if url_response.status_code == 200:
            valid_url_list.append(url)

print(valid_url_list[:5])

['/cookies/', '/schoenen/boots/', '/schoenen/laarzen/', '/schoenen/veterschoenen/', '/dames/schoenen/sneakers/']


In [5]:
# Create fake customer email addresses: 10 random lowercase letters followed by a
# randomly chosen domain, 150 addresses in total.
domains = ['@hotmail.com', '@gmail.com', '@outlook.com', '@yahoo.com']

# Dedicated, seeded generator: re-running this cell (or the whole notebook on a fresh
# kernel) yields the same addresses, and the global `random` state is left untouched.
email_rng = random.Random(42)

customer_list = [
    ''.join(email_rng.choices(ascii_lowercase, k=10)) + email_rng.choice(domains)
    for _ in range(150)
]

In [6]:
# Create a fake dataset of the URL journey per customer: every customer gets between
# 1 and 50 pageviews, each on a randomly chosen valid URL. Rows are [email, url].

# Dedicated, seeded generator so that, for the same customer_list and valid_url_list,
# re-running this cell reproduces the same journeys.
journey_rng = random.Random(42)

url_path_per_customer = [
    [email, journey_rng.choice(valid_url_list)]
    for email in customer_list
    for _ in range(journey_rng.randint(1, 50))
]

In [7]:
# Hand-maintained lookup from the brand slug as it appears in a URL to the brand's
# display name. Normally this would be retrieved through the API.
brand_slug_dict = {
    'dr-martens': 'Dr. Martens',
    'skechers': 'Skechers',
    'ara': 'Ara',
    'maruti': 'Maruti',
    'converse': 'Converse',
    'mexx': 'Mexx',
    'ecco': 'Ecco',
    'cruyff': 'Cruyff',
    'van-lier': 'Van Lier',
    'la-strada': 'La Strada',
    'hugo-boss': 'Hugo Boss',
    'guess': 'Guess',
    'gabor': 'Gabor',
    'kipling': 'Kipling',
    'timberland': 'Timberland',
    'birkenstock': 'Birkenstock',
}

In [8]:
# Extract the score per customer. One pageview on a branded URL
# (e.g. /schoenen/merk:skechers) scores one point for that brand.
#
# A slug only counts when it is not embedded in a longer run of letters/digits.
# A plain substring test would credit short slugs by accident, e.g. 'ara' for a URL
# such as '/garantie/'. Hyphens, slashes and colons still act as separators, so
# '/merk:skechers' and '/skechers-sneakers/' both score for Skechers.
slug_matchers = [
    (re.compile(rf'(?<![A-Za-z0-9]){re.escape(brand_slug)}(?![A-Za-z0-9])'), brand)
    for brand_slug, brand in brand_slug_dict.items()
]

# ddict maps email -> {brand display name: number of branded pageviews}.
ddict = defaultdict(dict)

for email, url in url_path_per_customer:
    for slug_pattern, brand in slug_matchers:
        if slug_pattern.search(url):
            # Customers only get an entry once they have at least one branded pageview.
            ddict[email][brand] = ddict[email].get(brand, 0) + 1

In [10]:
# ...and write the ranking to CSV: one row per (customer, brand) pair with the columns
# rank, email, brand, score and description.
with open('dump_favo_brand.csv', 'w', newline='') as csv_file:
    out = writer(csv_file)
    out.writerow(['rank', 'email', 'brand', 'score', 'description'])

    for customer_email, brand_scores in ddict.items():
        # Highest score first. Equal scores fall back to reverse-alphabetical brand
        # order, because the (score, brand) tuples are sorted in descending order.
        ranked = sorted(
            ((score, brand) for brand, score in brand_scores.items()),
            reverse=True,
        )

        # The top-ranked brand is always written; further rows are written up to
        # most_favorite_brands rows per customer.
        kept = ranked[:max(most_favorite_brands, 1)]

        for rank, (score, brand) in enumerate(kept, start=1):
            label = 'primary' if rank == 1 else 'secondary'
            out.writerow([rank, customer_email, brand, score, label])