## Load Data

In [1]:
# Mount Google Drive so the data files stored there can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# The path below should point to the directory containing this notebook and the associated utility files
# Change it if necessary
os.chdir('drive/MyDrive/Colab Notebooks/FourthBrain/data/')
!ls

all_contacts.csv		   last-registered-event-field-history.csv
email_field_hist.csv		   pageview_hist.csv
event_hist.csv			   preprocess_data_clustering.ipynb
hub_spot_export_clean01212021.pkl  Reports
hubspot_export.csv		   students_conversion_clean01212021.pkl


In [3]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.parse import urlparse

In [4]:
# Load the raw CSV exports. Missing values are replaced by 0 because the
# filters further down detect "no value" with `== 0`.
# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
pageview_hist = pd.read_csv('pageview_hist.csv', encoding='utf-8', low_memory=False).fillna(0)
event_hist = pd.read_csv('event_hist.csv', encoding='utf-8', low_memory=False).fillna(0)
hubspot_export = pd.read_csv('hubspot_export.csv', encoding='utf-8', low_memory=False).fillna(0)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


## Categorization
Transform the collected event log into actual clickstream data. 

===============
* A : homepage | B : blog | C : events 
* D : specialization | E : machine learning yearning
* F : forums | G : the batch | H : careers
* I : about | J : course-signup | K : tensorflow
* L : climate | M : ai notes | N : dl notes
* O : newsletter archive | P : event registration
* Q : other | R : contact | S : ai ambassadorship

### Event 1

In [5]:
# Keep only the rows that have a value in the column of interest (missing
# values were replaced by 0 when the CSVs were loaded).
# A boolean mask selects rows by label, so the result stays correct even if the
# frame's index is not the default 0..n-1 range.
pageviews = pageview_hist[pageview_hist['Last Page Seen Current Value'] != 0]

events = event_hist[event_hist['Last Registered Event Date Current Value'] != 0]

print('pageviews shape:', pageviews.shape)
print('events shape:', events.shape)

pageviews shape: (90533, 9)
events shape: (56081, 92)


In [6]:
# row labels of the filtered page views
idx = pageviews.index.to_list()

# short column name -> column of the page-view export it is copied from
# NOTE(review): '...' is used verbatim as the source column of the fourth
# event; confirm it matches the real header in pageview_hist.csv.
sequence_columns = {
    'ID': 'Contact ID',
    'event1': 'Last Page Seen Current Value',
    't1': 'Last Page Seen Change Date',
    'event2': 'Last Page Seen Previous Value (1)',
    't2': 'Last Page Seen Change Date (1)',
    'event3': 'Last Page Seen Previous Value (2)',
    't3': 'Last Page Seen Change Date (2)',
    'event4': '...',
}

# URL sequence and change dates per row, indexed like `pageviews`
url_sequences = pd.DataFrame(
    {short: pageviews[source] for short, source in sequence_columns.items()}
)

# display the first row of url_sequences
print("URL SEQUENCES DF")
url_sequences.head(1)

URL SEQUENCES DF


Unnamed: 0,ID,event1,t1,event2,t2,event3,t3,event4
5,333,https://www.deeplearning.ai/become-a-deeplearn...,2020-05-21 04:22,https://www.deeplearning.ai/events/?utm_campai...,2020-05-21 04:22,0,0,0


In [7]:
# unique event1 URLs, one row each
m = pd.DataFrame({'url': url_sequences['event1'].unique()})

# list of indices for sample tracking
idx = m.index.to_list()

# list of column names, in the field order of urllib's ParseResult
df_columns = ['scheme', 'netloc', 'path',
              'params', 'query', 'fragment']

# Parse every URL exactly once (it used to be parsed six times, once per
# column) and unpack the six-field result directly into the columns.
url_segments = pd.DataFrame([tuple(urlparse(url)) for url in m['url']],
                            columns=df_columns, index=idx)

# display the first parsed URL
print('PARSED URLS DF')
url_segments.head(1)

PARSED URLS DF


Unnamed: 0,scheme,netloc,path,params,query,fragment
0,https,www.deeplearning.ai,/become-a-deeplearning-ai-events-ambassador/,,,


In [8]:
# Category letter for each first path segment of the event-1 URLs.
# ORDER MATTERS: the encoding cell pairs `dic.values()` with the unique path
# segments by position, so entries must stay in the order in which the segments
# first appear in the data.
# The root segment is keyed as '' (it was ' ', which can never equal the empty
# segment of a '/' path and disagreed with dic2/dic3/dic4).
# NOTE(review): 'ai-internship' is 'C' here but 'H' in dic3/dic4, and 'notes'
# is 'I' here but 'M' in dic3 -- confirm which is intended.
dic = {
    'become-a-deeplearning-ai-events-ambassador': 'S',
    'blog': 'B',
    'thebatch': 'G',
    'generative-adversarial-networks-specialization': 'D',
    'machine-learning-yearning': 'E',
    'natural-language-processing-specialization': 'D',
    '': 'Q',
    'careers': 'H',
    'events': 'C',
    'forums': 'F',
    '1w-3kMnSXR9ynOeSC5vJWFQ3hul4': 'Q',
    'deep-learning-specialization': 'D',
    'ai-for-medicine': 'D',
    'course-signup': 'J',
    'contact-us': 'R',
    'climate': 'L',
    'about-us': 'I',
    'ai-for-everyone': 'M',
    'tensorflow-in-practice': 'K',
    'tensorflow-data-and-deployment': 'K',
    '18b-0Xg-iTwW-qVdOXFow3w3hul4': 'Q',
    '1qOs8ajvWQGq4F4ClL_RbRA3hul4': 'Q',
    'ai-career-program-for-experienced-engineers': 'H',
    'hs': 'Q',
    '-temporary-slug-45e6d038-071c-4c82-9eae-c664fa342a92': 'Q',
    'press': 'I',
    '-temporary-slug-2c0727dd-b118-46ff-a8ea-696379a06821': 'Q',
    'bootcamp': 'C',
    'tensorflow-from-basics-to-mastery': 'K',
    'ai-career-program-for-university-graduates': 'H',
    'privacy': 'I',
    'page': 'A',
    '1uPx8FCrfQ3qWp_tv8WmCuA3hul4': 'Q',
    '-temporary-slug-4e3b41cb-fdac-4680-870c-4e50df654998': 'Q',
    'THEBATCH': 'G',
    'ai-nots': 'M',
    'dl-notes': 'N',
    'ai-notes': 'M',
    '1sw5nCSRdRPWcbhr3a_03jg3hul4': 'Q',
    'ai-internship': 'C',
    'machine-learning-yearning-cn': 'E',
    'mail': 'I',
    'translate_c': 'Q',
    'notes': 'I',
    'TheBatch': 'G',
    'search': 'Q',
    'terms-of-use': 'I',
    'newsletter-archive-test': 'O',
}

In [9]:
# Work on a copy so the raw paths in url_segments stay intact and this cell
# gives the same result when it is run again.
paths = url_segments['path'].copy()

# first path segment of every URL, e.g. '/blog/some-post/' -> 'blog', '/' -> ''
first_segment = paths.str.split('/').str[1]

# unique first path segments, in order of first appearance
unique_paths = pd.DataFrame([i.split('/') for i in \
                            url_segments['path'].unique()])[1].unique()

# convert to pandas dataframe
_paths = pd.DataFrame(unique_paths, columns=["unique_paths"])

# append path encodings; the dictionary values are paired with the unique
# segments by position, so `dic` must list them in the same order
_paths['encoded_paths'] = pd.Series(list(dic.values()))

# Apply the encodings to the data set.
# Each path is matched on its exact first segment. The earlier substring/regex
# match was order dependent: a short segment such as 'events' or 'notes' also
# captured every not-yet-encoded path that merely contained that text.
for segment, code in zip(_paths['unique_paths'], _paths['encoded_paths']):
  # '' is the site root ('/'), which is labelled from the host name in the next
  # cell; non-string entries come from paths that have no first segment
  if not isinstance(segment, str) or segment == '':
    continue
  paths[first_segment == segment] = code

In [10]:
# category letter for a visit to a site's root page, keyed by host name
netloc_dic = {
    'www.deeplearning.ai': 'A',
    'blog.deeplearning.ai': 'B',
    'share.hsforms.com': 'P',
    'info.deeplearning.ai': 'I',
    'www.mlyearning.org': 'E',
    'deeplearning.ai': 'A',
    'translate.googleusercontent.com': 'A',
}

# URLs that point at a site root still carry the raw path '/'; label those rows
# from their host instead. Hosts missing from netloc_dic become NaN.
is_root = paths == '/'
locs = url_segments.loc[is_root, 'netloc'].map(netloc_dic)
paths.loc[is_root] = locs

In [11]:
# Start the clickstream table with the encoded event-1 categories.
# `paths` holds one entry per unique event1 URL, so X is indexed the same way.
X = pd.DataFrame({'event1': paths})
X.head()

Unnamed: 0,event1
0,S
1,B
2,G
3,D
4,G


In [12]:
# Break the 't1' timestamp strings (e.g. '2020-05-21 04:22') into text parts.
d = url_sequences['t1'].str.split(expand=True)   # column 0: date, column 1: clock time
date = d[0].str.split('-', expand=True)
time = d[1].str.split(':', expand=True)

years, months, days = date.iloc[:, 0], date.iloc[:, 1], date.iloc[:, 2]
hours, seconds = time.iloc[:, 0], time.iloc[:, 1]

# NOTE(review): 'second' holds the field after the ':'; in the sample value
# shown above ('04:22') that looks like minutes rather than seconds. The column
# name is kept unchanged here -- confirm before relying on it.
df = pd.DataFrame({
    'year': years,
    'month': months,
    'day': days,
    'hour': hours,
    'second': seconds,
})

In [13]:
# two-digit month string -> season
seasons = {
    '12': 'winter', '01': 'winter', '02': 'winter',
    '03': 'spring', '04': 'spring', '05': 'spring',
    '06': 'summer', '07': 'summer', '08': 'summer',
    '09': 'fall', '10': 'fall', '11': 'fall',
}

# two-digit hour string -> part of the day (six four-hour buckets)
# '15' used to map to 'Evening', which split hours 12-15 into two different
# labels ('evening' and 'Evening'); it now shares the 'evening' label.
parts = {
    '00': 'dawn', '01': 'dawn', '02': 'dawn', '03': 'dawn',
    '04': 'morning', '05': 'morning', '06': 'morning', '07': 'morning',
    '08': 'daytime', '09': 'daytime', '10': 'daytime', '11': 'daytime',
    '12': 'evening', '13': 'evening', '14': 'evening', '15': 'evening',
    '16': 'Dusk', '17': 'Dusk', '18': 'Dusk', '19': 'Dusk',
    '20': 'Night', '21': 'Night', '22': 'Night', '23': 'Night',
}

# Replace the month number with its season and the hour with its part of day.
# Both columns are overwritten in place, so running this cell a second time
# looks up the already-translated labels and yields NaN.
for column, lookup in (('month', seasons), ('hour', parts)):
  df[column] = df[column].map(lookup)

### Event 2

In [14]:
# Keep only the rows that have a second page view (missing values were replaced
# by 0 when the CSVs were loaded).
# A boolean mask selects rows by label, so the result stays correct even if the
# frame's index is not the default 0..n-1 range.
pageviews = pageview_hist[pageview_hist['Last Page Seen Previous Value (1)'] != 0]

# same filter on the event history as in the Event 1 section
events = event_hist[event_hist['Last Registered Event Date Current Value'] != 0]

print('pageviews shape:', pageviews.shape)
print('events shape:', events.shape)

pageviews shape: (37642, 9)
events shape: (56081, 92)


In [15]:
# row labels of the filtered page views
idx = pageviews.index.to_list()

# short column name -> column of the page-view export it is copied from
# NOTE(review): '...' is used verbatim as the source column of the fourth
# event; confirm it matches the real header in pageview_hist.csv.
sequence_columns = {
    'ID': 'Contact ID',
    'event1': 'Last Page Seen Current Value',
    't1': 'Last Page Seen Change Date',
    'event2': 'Last Page Seen Previous Value (1)',
    't2': 'Last Page Seen Change Date (1)',
    'event3': 'Last Page Seen Previous Value (2)',
    't3': 'Last Page Seen Change Date (2)',
    'event4': '...',
}

# URL sequence and change dates per row, indexed like `pageviews`
url_sequences = pd.DataFrame(
    {short: pageviews[source] for short, source in sequence_columns.items()}
)

# display the first row of url_sequences
print("URL SEQUENCES DF")
url_sequences.head(1)

URL SEQUENCES DF


Unnamed: 0,ID,event1,t1,event2,t2,event3,t3,event4
5,333,https://www.deeplearning.ai/become-a-deeplearn...,2020-05-21 04:22,https://www.deeplearning.ai/events/?utm_campai...,2020-05-21 04:22,0,0,0


In [16]:
# unique URLs to parse, one row each
# NOTE(review): this is the Event 2 section, yet the URLs come from 'event1'.
# Switching to 'event2' would also require re-ordering dic3, whose values are
# matched to the unique paths by position -- confirm the intent.
m = pd.DataFrame({'url': url_sequences['event1'].unique()})

# list of indices for sample tracking
idx = m.index.to_list()

# list of column names, in the field order of urllib's ParseResult
df_columns = ['scheme', 'netloc', 'path',
              'params', 'query', 'fragment']

# Parse every URL exactly once (it used to be parsed six times, once per
# column) and unpack the six-field result directly into the columns.
url_segments = pd.DataFrame([tuple(urlparse(url)) for url in m['url']],
                            columns=df_columns, index=idx)

# display the first parsed URL
print('PARSED URLS DF')
url_segments.head(1)

PARSED URLS DF


Unnamed: 0,scheme,netloc,path,params,query,fragment
0,https,www.deeplearning.ai,/become-a-deeplearning-ai-events-ambassador/,,,


In [17]:
# Category letter for each first path segment in the Event 2 section.
# ORDER MATTERS: the encoding cell pairs `dic3.values()` with the unique path
# segments by position, so entries must stay in the order in which the segments
# first appear in the data.
# 'contact-us' is 'R' (legend: "R : contact", and the value used by `dic`); it
# was 'I' here.
# NOTE(review): 'ai-internship' is 'H' here but 'C' in dic/dic2, and 'notes'
# is 'M' here but 'I' in dic -- confirm which is intended.
dic3 = {
    'become-a-deeplearning-ai-events-ambassador': 'S',
    'blog': 'B',
    'generative-adversarial-networks-specialization': 'D',
    'natural-language-processing-specialization': 'D',
    '': 'Q',
    'thebatch': 'G',
    'machine-learning-yearning': 'E',
    'careers': 'H',
    'events': 'C',
    'forums': 'F',
    'deep-learning-specialization': 'D',
    'ai-for-medicine': 'D',
    'course-signup': 'J',
    'about-us': 'I',
    'ai-for-everyone': 'M',
    'tensorflow-in-practice': 'K',
    'tensorflow-data-and-deployment': 'K',
    '1w-3kMnSXR9ynOeSC5vJWFQ3hul4': 'Q',
    '1qOs8ajvWQGq4F4ClL_RbRA3hul4': 'Q',
    'ai-career-program-for-experienced-engineers': 'H',
    '18b-0Xg-iTwW-qVdOXFow3w3hul4': 'Q',
    '-temporary-slug-45e6d038-071c-4c82-9eae-c664fa342a92': 'Q',
    'hs': 'Q',
    'contact-us': 'R',
    'press': 'I',
    '-temporary-slug-2c0727dd-b118-46ff-a8ea-696379a06821': 'Q',
    'bootcamp': 'C',
    'tensorflow-from-basics-to-mastery': 'K',
    'ai-career-program-for-university-graduates': 'H',
    'privacy': 'I',
    'page': 'A',
    'climate': 'L',
    '1uPx8FCrfQ3qWp_tv8WmCuA3hul4': 'Q',
    'ai-nots': 'M',
    'dl-notes': 'N',
    'ai-notes': 'M',
    '1sw5nCSRdRPWcbhr3a_03jg3hul4': 'Q',
    'ai-internship': 'H',
    'machine-learning-yearning-cn': 'E',
    'notes': 'M',
    'translate_c': 'Q',
    'terms-of-use': 'I',
    'newsletter-archive-test': 'O',
}

In [18]:
# Copy the parsed paths: the encoding step overwrites `paths`, and without the
# copy it would also overwrite url_segments['path'], so re-running this cell
# would compute unique_paths from already-encoded values.
paths = url_segments['path'].copy()

# unique first path segments (the text between the first two '/'), in order of
# first appearance
unique_paths = pd.DataFrame([i.split('/') for i in \
                            url_segments['path'].unique()])[1].unique()

In [19]:
# convert to pandas dataframe
_paths = pd.DataFrame(unique_paths, columns=["unique_paths"])

# append path encodings; the dictionary values are paired with the unique
# segments by position, so `dic3` must list them in the same order
_paths['encoded_paths'] = pd.Series(list(dic3.values()))

# first path segment of every URL, e.g. '/blog/some-post/' -> 'blog', '/' -> ''
first_segment = paths.str.split('/').str[1]

# Apply the encodings to the data set.
# Each path is matched on its exact first segment. The earlier substring/regex
# match was order dependent: a short segment such as 'events' or 'notes' also
# captured every not-yet-encoded path that merely contained that text.
for segment, code in zip(_paths['unique_paths'], _paths['encoded_paths']):
  # '' is the site root ('/'), which is labelled from the host name in the next
  # cell; non-string entries come from paths that have no first segment
  if not isinstance(segment, str) or segment == '':
    continue
  paths[first_segment == segment] = code

In [20]:
# Any entry that has a host name but only the root path must still be
# represented in the data set; otherwise many data points are lost and go
# unaccounted for.

# category letter for a visit to a site's root page, keyed by host name
netloc_dic = {
    'www.deeplearning.ai': 'A',
    'blog.deeplearning.ai': 'B',
    'share.hsforms.com': 'P',
    'info.deeplearning.ai': 'I',
    'www.mlyearning.org': 'E',
    'deeplearning.ai': 'A',
    'translate.googleusercontent.com': 'A',
}

# rows whose path is still the raw '/' are labelled from their host instead;
# hosts missing from netloc_dic become NaN
is_root = paths == '/'
locs = url_segments.loc[is_root, 'netloc'].map(netloc_dic)
paths.loc[is_root] = locs

In [21]:
# Add the Event 2 categories as a new column (aligned on the index) and show
# the table with gaps displayed as 0; X itself keeps its NaN values.
# NOTE(review): `paths` is indexed by position in this section's list of unique
# URLs, so rows line up with X by that position, not by contact -- confirm.
X = X.assign(event2=paths)
X.fillna(0)

Unnamed: 0,event1,event2
0,S,S
1,B,B
2,G,D
3,D,D
4,G,A
...,...,...
14364,G,0
14365,G,0
14366,B,0
14367,G,0


### Event 3

In [22]:
# Keep only the rows that have a third page view (missing values were replaced
# by 0 when the CSVs were loaded).
# A boolean mask selects rows by label, so the result stays correct even if the
# frame's index is not the default 0..n-1 range.
pageviews = pageview_hist[pageview_hist['Last Page Seen Previous Value (2)'] != 0]

# same filter on the event history as in the Event 1 section
events = event_hist[event_hist['Last Registered Event Date Current Value'] != 0]

print('pageviews shape:', pageviews.shape)
print('events shape:', events.shape)

pageviews shape: (24318, 9)
events shape: (56081, 92)


In [23]:
# row labels of the filtered page views
idx = pageviews.index.to_list()

# short column name -> column of the page-view export it is copied from
# NOTE(review): '...' is used verbatim as the source column of the fourth
# event; confirm it matches the real header in pageview_hist.csv.
sequence_columns = {
    'ID': 'Contact ID',
    'event1': 'Last Page Seen Current Value',
    't1': 'Last Page Seen Change Date',
    'event2': 'Last Page Seen Previous Value (1)',
    't2': 'Last Page Seen Change Date (1)',
    'event3': 'Last Page Seen Previous Value (2)',
    't3': 'Last Page Seen Change Date (2)',
    'event4': '...',
}

# URL sequence and change dates per row, indexed like `pageviews`
url_sequences = pd.DataFrame(
    {short: pageviews[source] for short, source in sequence_columns.items()}
)

# display the first row of url_sequences
print("URL SEQUENCES DF")
url_sequences.head(1)

URL SEQUENCES DF


Unnamed: 0,ID,event1,t1,event2,t2,event3,t3,event4
82,2354,https://www.deeplearning.ai/generative-adversa...,2020-09-30 17:17,https://www.deeplearning.ai/,2020-09-30 17:16,https://www.deeplearning.ai/blog/working-ai-bu...,2020-09-10 17:00,0


In [24]:
# unique URLs to parse, one row each
# NOTE(review): this is the Event 3 section, yet the URLs come from 'event1'.
# Switching to 'event3' would also require re-ordering dic2, whose values are
# matched to the unique paths by position -- confirm the intent.
m = pd.DataFrame({'url': url_sequences['event1'].unique()})

# list of indices for sample tracking
idx = m.index.to_list()

# list of column names, in the field order of urllib's ParseResult
df_columns = ['scheme', 'netloc', 'path',
              'params', 'query', 'fragment']

# Parse every URL exactly once (it used to be parsed six times, once per
# column) and unpack the six-field result directly into the columns.
url_segments = pd.DataFrame([tuple(urlparse(url)) for url in m['url']],
                            columns=df_columns, index=idx)

# display the first parsed URL
print('PARSED URLS DF')
url_segments.head(1)

PARSED URLS DF


Unnamed: 0,scheme,netloc,path,params,query,fragment
0,https,www.deeplearning.ai,/generative-adversarial-networks-specialization/,,,


In [25]:
# Category letter for each first path segment in the Event 3 section.
# ORDER MATTERS: the encoding cell pairs `dic2.values()` with the unique path
# segments by position, so entries must stay in the order in which the segments
# first appear in the data.
# Two values were brought in line with the legend and the other dictionaries:
#   'ai-for-medicine' -> 'D' (was 'M', the "ai notes" category)
#   'contact-us'      -> 'R' (was 'I'; legend: "R : contact")
# NOTE(review): 'ai-internship' is 'C' here but 'H' in dic3/dic4 -- confirm
# which is intended.
dic2 = {
    'generative-adversarial-networks-specialization': 'D',
    'natural-language-processing-specialization': 'D',
    'blog': 'B',
    'thebatch': 'G',
    'machine-learning-yearning': 'E',
    'events': 'C',
    'forums': 'F',
    '': 'Q',
    'deep-learning-specialization': 'D',
    'careers': 'H',
    'ai-for-medicine': 'D',
    'about-us': 'I',
    'course-signup': 'J',
    'tensorflow-in-practice': 'K',
    '1w-3kMnSXR9ynOeSC5vJWFQ3hul4': 'Q',
    'tensorflow-data-and-deployment': 'K',
    '1qOs8ajvWQGq4F4ClL_RbRA3hul4': 'Q',
    'become-a-deeplearning-ai-events-ambassador': 'S',
    'ai-for-everyone': 'M',
    'hs': 'Q',
    'contact-us': 'R',
    '-temporary-slug-45e6d038-071c-4c82-9eae-c664fa342a92': 'Q',
    'press': 'I',
    'ai-career-program-for-experienced-engineers': 'H',
    '-temporary-slug-2c0727dd-b118-46ff-a8ea-696379a06821': 'Q',
    '18b-0Xg-iTwW-qVdOXFow3w3hul4': 'Q',
    'tensorflow-from-basics-to-mastery': 'K',
    'ai-career-program-for-university-graduates': 'H',
    'bootcamp': 'C',
    'privacy': 'I',
    'page': 'A',
    'climate': 'L',
    'ai-nots': 'M',
    'dl-notes': 'N',
    '1uPx8FCrfQ3qWp_tv8WmCuA3hul4': 'Q',
    'ai-notes': 'M',
    '1sw5nCSRdRPWcbhr3a_03jg3hul4': 'Q',
    'ai-internship': 'C',
    'translate_c': 'Q',
    'newsletter-archive-test': 'O',
}

In [26]:
# Copy the parsed paths: the encoding step overwrites `paths`, and without the
# copy it would also overwrite url_segments['path'], so re-running this cell
# would compute unique_paths from already-encoded values.
paths = url_segments['path'].copy()

# unique first path segments (the text between the first two '/'), in order of
# first appearance
unique_paths = pd.DataFrame([i.split('/') for i in \
                            url_segments['path'].unique()])[1].unique()

In [27]:
# convert to pandas dataframe
_paths = pd.DataFrame(unique_paths, columns=["unique_paths"])

# append path encodings; the dictionary values are paired with the unique
# segments by position, so `dic2` must list them in the same order
_paths['encoded_paths'] = pd.Series(list(dic2.values()))

# first path segment of every URL, e.g. '/blog/some-post/' -> 'blog', '/' -> ''
first_segment = paths.str.split('/').str[1]

# Apply the encodings to the data set.
# Each path is matched on its exact first segment. The earlier substring/regex
# match was order dependent: 'events' precedes the ambassador slug in dic2, so
# '/become-a-deeplearning-ai-events-ambassador/' was encoded as an event page.
for segment, code in zip(_paths['unique_paths'], _paths['encoded_paths']):
  # '' is the site root ('/'), which is labelled from the host name in the next
  # cell; non-string entries come from paths that have no first segment
  if not isinstance(segment, str) or segment == '':
    continue
  paths[first_segment == segment] = code

In [28]:
# category letter for a visit to a site's root page, keyed by host name
netloc_dic = {
    'www.deeplearning.ai': 'A',
    'blog.deeplearning.ai': 'B',
    'share.hsforms.com': 'P',
    'info.deeplearning.ai': 'I',
    'www.mlyearning.org': 'E',
    'deeplearning.ai': 'A',
    'translate.googleusercontent.com': 'A',
}

# URLs that point at a site root still carry the raw path '/'; label those rows
# from their host instead. Hosts missing from netloc_dic become NaN.
is_root = paths == '/'
locs = url_segments.loc[is_root, 'netloc'].map(netloc_dic)
paths.loc[is_root] = locs

In [29]:
# Add the Event 3 categories as a new column (aligned on the index) and show
# the table with gaps displayed as 0; X itself keeps its NaN values.
# NOTE(review): `paths` is indexed by position in this section's list of unique
# URLs, so rows line up with X by that position, not by contact -- confirm.
X = X.assign(event3=paths)
X.fillna(0)

Unnamed: 0,event1,event2,event3
0,S,S,D
1,B,B,D
2,G,D,B
3,D,D,B
4,G,A,B
...,...,...,...
14364,G,0,0
14365,G,0,0
14366,B,0,0
14367,G,0,0


### Event 4

In [58]:
# Keep only the rows that have a fourth page view (missing values were replaced
# by 0 when the CSVs were loaded).
# A boolean mask selects rows by label, so the result stays correct even if the
# frame's index is not the default 0..n-1 range.
# NOTE(review): '...' is used verbatim as the column name of the fourth event;
# confirm it matches the real header in pageview_hist.csv.
pageviews = pageview_hist[pageview_hist['...'] != 0]

print('pageviews shape:', pageviews.shape)

pageviews shape: (16463, 9)


In [59]:
# row labels of the filtered page views
idx = pageviews.index.to_list()

# short column name -> column of the page-view export it is copied from
# NOTE(review): '...' is used verbatim as the source column of the fourth
# event; confirm it matches the real header in pageview_hist.csv.
sequence_columns = {
    'ID': 'Contact ID',
    'event1': 'Last Page Seen Current Value',
    't1': 'Last Page Seen Change Date',
    'event2': 'Last Page Seen Previous Value (1)',
    't2': 'Last Page Seen Change Date (1)',
    'event3': 'Last Page Seen Previous Value (2)',
    't3': 'Last Page Seen Change Date (2)',
    'event4': '...',
}

# URL sequence and change dates per row, indexed like `pageviews`
url_sequences = pd.DataFrame(
    {short: pageviews[source] for short, source in sequence_columns.items()}
)

# display the first row of url_sequences
print("URL SEQUENCES DF")
url_sequences.head(1)

URL SEQUENCES DF


Unnamed: 0,ID,event1,t1,event2,t2,event3,t3,event4
92,233,https://www.deeplearning.ai/natural-language-p...,2020-09-29 06:56,https://www.deeplearning.ai/,2020-09-29 06:55,https://www.deeplearning.ai/events/,2020-09-23 07:55,https://www.deeplearning.ai/thebatch/


In [71]:
# unique URLs to parse, one row each
# NOTE(review): this is the Event 4 section, yet the URLs come from 'event1'.
# Switching to 'event4' would also require re-ordering dic4, whose values are
# matched to the unique paths by position -- confirm the intent.
m = pd.DataFrame({'url': url_sequences['event1'].unique()})

# list of indices for sample tracking
idx = m.index.to_list()

# list of column names, in the field order of urllib's ParseResult
df_columns = ['scheme', 'netloc', 'path',
              'params', 'query', 'fragment']

# Parse every URL exactly once (it used to be parsed six times, once per
# column) and unpack the six-field result directly into the columns.
url_segments = pd.DataFrame([tuple(urlparse(url)) for url in m['url']],
                            columns=df_columns, index=idx)

# display the first parsed URL
print('PARSED URLS DF')
url_segments.head(1)

PARSED URLS DF


Unnamed: 0,scheme,netloc,path,params,query,fragment
0,https,www.deeplearning.ai,/natural-language-processing-specialization/,,,


In [72]:
# Copy the parsed paths: the encoding step overwrites `paths`, and without the
# copy it would also overwrite url_segments['path'], so re-running this cell
# would compute unique_paths from already-encoded values.
paths = url_segments['path'].copy()

# unique first path segments (the text between the first two '/'), in order of
# first appearance
unique_paths = pd.DataFrame([i.split('/') for i in \
                            url_segments['path'].unique()])[1].unique()

In [62]:
# Category letter for each first path segment in the Event 4 section.
# ORDER MATTERS: the encoding cell pairs `dic4.values()` with the unique path
# segments by position, so entries must stay in the order in which the segments
# first appear in the data.
# Three values were brought in line with the legend and the other dictionaries:
#   'course-signup' -> 'J' (was 'P', the "event registration" category)
#   'ai-notes'      -> 'M' (was 'N', the "dl notes" category)
#   'contact-us'    -> 'R' (was 'I'; legend: "R : contact")
# NOTE(review): 'ai-internship' is 'H' here but 'C' in dic/dic2 -- confirm
# which is intended.
dic4 = {
    'natural-language-processing-specialization': 'D',
    'blog': 'B',
    'thebatch': 'G',
    'machine-learning-yearning': 'E',
    'forums': 'F',
    '': 'Q',
    'deep-learning-specialization': 'D',
    'careers': 'H',
    'generative-adversarial-networks-specialization': 'D',
    'ai-for-medicine': 'D',
    'events': 'C',
    'about-us': 'I',
    'course-signup': 'J',
    'tensorflow-in-practice': 'K',
    '1w-3kMnSXR9ynOeSC5vJWFQ3hul4': 'Q',
    'tensorflow-data-and-deployment': 'K',
    '1qOs8ajvWQGq4F4ClL_RbRA3hul4': 'Q',
    'ai-for-everyone': 'M',
    'hs': 'Q',
    'contact-us': 'R',
    'become-a-deeplearning-ai-events-ambassador': 'S',
    '-temporary-slug-45e6d038-071c-4c82-9eae-c664fa342a92': 'Q',
    'press': 'I',
    '-temporary-slug-2c0727dd-b118-46ff-a8ea-696379a06821': 'Q',
    'ai-career-program-for-experienced-engineers': 'H',
    'tensorflow-from-basics-to-mastery': 'K',
    'ai-career-program-for-university-graduates': 'H',
    'bootcamp': 'C',
    '18b-0Xg-iTwW-qVdOXFow3w3hul4': 'Q',
    'privacy': 'I',
    'page': 'A',
    'climate': 'L',
    'ai-nots': 'M',
    'dl-notes': 'N',
    'ai-notes': 'M',
    '1sw5nCSRdRPWcbhr3a_03jg3hul4': 'Q',
    '1uPx8FCrfQ3qWp_tv8WmCuA3hul4': 'Q',
    'ai-internship': 'H',
    'translate_c': 'Q',
    'newsletter-archive-test': 'O',
}

In [73]:
# convert to pandas dataframe
_paths = pd.DataFrame(unique_paths, columns=["unique_paths"])

# append path encodings; the dictionary values are paired with the unique
# segments by position, so `dic4` must list them in the same order
_paths['encoded_paths'] = pd.Series(list(dic4.values()))

# first path segment of every URL, e.g. '/blog/some-post/' -> 'blog', '/' -> ''
first_segment = paths.str.split('/').str[1]

# Apply the encodings to the data set.
# Each path is matched on its exact first segment. The earlier substring/regex
# match was order dependent: a short segment such as 'events' or 'notes' also
# captured every not-yet-encoded path that merely contained that text.
for segment, code in zip(_paths['unique_paths'], _paths['encoded_paths']):
  # '' is the site root ('/'), which is labelled from the host name in the next
  # cell; non-string entries come from paths that have no first segment
  if not isinstance(segment, str) or segment == '':
    continue
  paths[first_segment == segment] = code

In [75]:
# category letter for a visit to a site's root page, keyed by host name
netloc_dic = {
    'www.deeplearning.ai': 'A',
    'blog.deeplearning.ai': 'B',
    'share.hsforms.com': 'P',
    'info.deeplearning.ai': 'I',
    'www.mlyearning.org': 'E',
    'deeplearning.ai': 'A',
    'translate.googleusercontent.com': 'A',
}

# URLs that point at a site root still carry the raw path '/'; label those rows
# from their host instead. Hosts missing from netloc_dic become NaN.
is_root = paths == '/'
locs = url_segments.loc[is_root, 'netloc'].map(netloc_dic)
paths.loc[is_root] = locs

In [77]:
# Add the Event 4 categories, replace the gaps with 0 and save the table.
# fillna returns a new frame, so its result has to be kept; previously it was
# discarded and the CSV was written with the missing values still in place.
X['event4'] = paths
X = X.fillna(0)
X.to_csv('X.csv')