### Library Functions

In [1]:
import boto3
import re
import json
import spacy
import math
import os.path
import sys, os, unicodedata
import pandas as pd
import smart_open

from boto3 import Session
from collections import defaultdict
from datetime import datetime, timedelta
from pandas.io.json import json_normalize

In [2]:
# File Path Constants
URL_IGNORE_LIST_PATH                         = './configuration/url-ignore-list.json'
KEYWORD_IGNORE_LIST_PATH                     = './configuration/keyword-ignore-list.txt'
USER_SEGMENT_LIST_PATH                       = './configuration/part-00000-2aa20d63-3e3a-47d2-8bed-d199cef5b814-c000.json'
DATERANGE_CONFIGURATION_PATH                 = './configuration/daterange.txt'
CUSTOMER_LIFERAY_MANUALLY_GENERATED_KEYWORDS = './configuration/customer-lr-manual-keywords.csv'
WWW_LIFERAY_MANUALLY_GENERATED_KEYWORDS      = './configuration/www-lr-manual-keywords.csv'
AMAZON_WEB_SERVICE_E3_BASE_FOLDER            = r'C:\Users\liferay\Documents\analytics data\export'
INDIVIDUAL_OUTPUT_DIRECTORY                  = './output/individual/'
SEGMENT_OUTPUT_DIRECTORY                     = './output/segment/'

In [3]:
MINIMUM_TOPIC_OF_INTEREST_THRESHOLD_SCORE = .1
DECAY_MULTIPLIER_BASE = .90

# https://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings/11066687#11066687
PUNCTUATION_UNICODE_TABLE = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
NON_ENGLISH_URL_REGEX = re.compile('\/zh(_CN)?\/'
                                   '|\/fr(_FR)?\/'
                                   '|\/de(_DE)?\/'
                                   '|\/it(_IT)?\/'
                                   '|\/ja(_JP|-JP)?\/'
                                   '|\/pt(-br|_BR|_PT)?\/'
                                   '|\/es(-es|_ES)?\/'
                                   '|\/ru\/')
WWW_OR_CUSTOMER_LIFERAY_URL_REGEX = re.compile(r'^https://www\.liferay|^https://customer\.liferay')
BOT_AND_CRAWLER_REGEX = re.compile('((.*)(bot|Bot)(.*)'
                                   '|(.*)spider(.*)'
                                   '|(.*)crawler(.*)'
                                   '|HubSpot'
                                   '|CloudFlare\-AlwaysOnline'
                                   '|WkHTMLtoPDF)')
PARENTHESIS_REGEX = re.compile(u'\(.*?\)')
BANNED_KEYWORDS_LIST = []
INTEREST_CALCULATION_WINDOW_TIMEDELTA = timedelta(30)

DATE_RANGE_OPTIONS = {
    'day'   : timedelta(1),
    'week'  : timedelta(7),
    'month' : timedelta(30)
}

UTM_PARAMETERS = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content']
HUBSPOT_PARAMETERS = ['_hsenc', '_hsmi', '__hstc', '__hssc', '__hsfp']
GOOGLE_ANALYTICS_PARAMETERS = ['_ga', '_gac']
URL_REDIRECT_PARAMETERS = ['redirect', '_3_WAR_osbknowledgebaseportlet_redirect']
ALL_OPTIONAL_URL_PARAMETERS = UTM_PARAMETERS + HUBSPOT_PARAMETERS + GOOGLE_ANALYTICS_PARAMETERS + URL_REDIRECT_PARAMETERS

with open(KEYWORD_IGNORE_LIST_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        BANNED_KEYWORDS_LIST.append(line.strip())

nlp = spacy.load('en')

# Populate URL Ignore List
URL_IGNORE_LIST_MATCH = []
URL_IGNORE_LIST_CONTAINS = []

with open(URL_IGNORE_LIST_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        json_result = json.loads(line)      
        comparison_type = json_result['Type']
        
        if comparison_type == 'match':
            URL_IGNORE_LIST_MATCH = json_result['URLs']
        elif comparison_type == 'contains':
            URL_IGNORE_LIST_CONTAINS = json_result['URLs']
        else:
            print("UNEXPECTED TYPE: {}".format(comparison_type))

START_DATE_STRING = 0
END_DATE_STRING = 0
START_DATE_DATETIME = 0
END_DATE_DATETIME = 0
CALCULATE_YESTERDAY_ONLY = False

In [4]:
# Read configuration file for start/end dates
with open(DATERANGE_CONFIGURATION_PATH, 'r', encoding='utf-8') as f:
    
    # First parameter is to calculate 'all' or only 'yesterday' topics of interest
    for line in f:
        # Ignore lines starting with a pound-sign
        if line.startswith('#'):
            continue
        else:
            if line.strip() == 'yesterday':
                CALCULATE_YESTERDAY_ONLY = True
            break
                
    # Second parameter is the start date
    for line in f:
        # Ignore lines starting with a pound-sign
        if line.startswith('#'):
            continue
        else:
            START_DATE_STRING = line.strip()
            START_DATE_DATETIME = datetime.strptime(line.strip(), '%Y%m%d')
            break
            
    # Third parameter is for end date
    for line in f:
        # Ignore lines starting with a pound-sign
        if line.startswith('#'):
            continue
        else:
            if line == 'yesterday':
                END_DATE_DATETIME = (datetime.today() - timedelta(1))
                END_DATE_STRING = END_DATE_DATETIME.strftime('%Y%m%d')
            else:
                END_DATE_STRING = line.strip()
                END_DATE_DATETIME = datetime.strptime(line.strip(), '%Y%m%d')

In [5]:
if True:
    print(START_DATE_STRING)
    print(END_DATE_STRING)

    print(START_DATE_DATETIME)
    print(END_DATE_DATETIME)

    print(CALCULATE_YESTERDAY_ONLY)

20180518
20180520
2018-05-18 00:00:00
2018-05-20 07:44:40.317892
False


#### Augment Tokenizer
The tokenizer fails on many hypenated words, so I wanted to augment it to work better.
Examples: 

* State-of-the-art collaboration platform targets quality patient care.
* Share files with a simple drag-and-drop. Liferay Sync transforms the Liferay platform into a central and secure easy-to-use document sharing service.
* Importing/Exporting Pages and Content - portal - Knowledge | "Liferay

In [6]:
import spacy
from spacy.attrs import *

#from spacy.symbols import ORTH, POS, TAG

# Source: https://github.com/explosion/spaCy/issues/396


nlp = spacy.load('en')
nlp.tokenizer.add_special_case(u'state-of-the-art', [{ORTH: 'state-of-the-art',
                                                      LEMMA: 'state-of-the-art', 
                                                      LOWER: 'state-of-the-art',
                                                      SHAPE: 'xxxxxxxxxxxxxxxx',
                                                      POS: 'ADJ', 
                                                      TAG: 'JJ'}])
nlp.tokenizer.add_special_case(u'State-of-the-art', [{ORTH: 'State-of-the-art',
                                                      LEMMA: 'state-of-the-art', 
                                                      LOWER: 'state-of-the-art',
                                                      SHAPE: 'xxxxxxxxxxxxxxxx',
                                                      POS: 'ADJ', 
                                                      TAG: 'JJ'}])
nlp.tokenizer.add_special_case(u'drag-and-drop', [{ORTH: 'drag-and-drop',
                                                      LEMMA: 'drag-and-drop', 
                                                      LOWER: 'drag-and-drop',
                                                      SHAPE: 'xxxxxxxxxxxxx',
                                                      POS: 'ADJ', 
                                                      TAG: 'JJ'}])

In [7]:
# Library Functions

import re
import langdetect
import string
from collections import OrderedDict
from langdetect.lang_detect_exception import ErrorCode, LangDetectException
from string import printable


def playFinishedSound():
    """
    This is for alerting me that something has finished executing.
    This will play a sound.
    """
    from pygame import mixer
    mixer.init()
    mixer.music.load('./configuration/finished.mp3')
    mixer.music.play()

def replace_punctuation(text):
    """
    The purpose of this function is to replace non-ASCII punctuation with its equivalent.
    """
    return text.replace("?", "'")

def segmentWordsIntoKeyWordPhraseList(words, debug=False):

    phrase_list = []
    
    if debug: print("\nOriginal Sentence: {}".format(words))
    # First segment the words by '|' or '-'
    split_words = re.split(r'[\|]| \- ', words)
    split_words = [s.strip() for s in split_words]
    cleaned_up_and_split_sentences = []
    
    # Search for instances of acronymns surrounded in parenthesis. Ex: (DXP)
    # Remove those, and add it automatically to the phrase list
    for sentence in split_words:
        terms_within_parenthesis = [term[1:-1] for term in re.findall(PARENTHESIS_REGEX, sentence)]
        phrase_list += terms_within_parenthesis
        if debug: print(terms_within_parenthesis)
            
        remaining_text = ''.join(re.split(PARENTHESIS_REGEX, sentence))
        cleaned_up_and_split_sentences.append(remaining_text)
        if debug: print(remaining_text)
        
    for sentence in cleaned_up_and_split_sentences:
        if debug: print("Sentence: {}".format(sentence))
        doc = nlp(sentence)
        for chunk in doc.noun_chunks:
            if debug: print("\tText: {} \n\tRoot: {} \n\tRoot Dependency: {} \n\tRoot Head: {}".format(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text))
            
            text = chunk.text
            if debug:
                print(text)
                print("\tPOS: {}".format(chunk[0].pos_))
                print("\tTag: {}".format(chunk[0].tag_))
                print("\tChunk[0]: {}".format(chunk[0]))
                
            # Skip keywords that contain CD (Cardinal Numbers) for now
            if 'CD' in [c.tag_ for c in chunk]:
                print("Skipping, contains CD")
                continue
            
            # Skip URLs
            url_found = False
            for token in chunk:
                if debug: print(token)
                if token.like_url:
                    url_found = True
                    print("Skipping, URL Detected! ({})".format(text))
                    break
                    
            if url_found:
                continue
            
            # We'll skip a phrase for now if it contains a number
            # E.g. Free download: Gartner's evaluation of 21 digital 
            # experience platform (DXP) providers based on their completeness of vision and ability to execute
            
            # CD - [5 Critical Things] Nobody Tells You About Building a Journey Map
            # Recursively remove until no more? - These six customer experience trends will shape business in 2018
            if chunk[0].tag_ in ['DT', 'PRP$', 'WP', 'PRP', 'WRB', 'CD', ':']:
                if debug: print("Starting 'ignore word' found in: {}".format(chunk))
                #text = ' '.join(s.text for s in chunk[1:])
                
                unwanted_text = chunk[0].text
                if debug: print("Unwanted text: {}".format(unwanted_text))
                text = chunk[1:].text
                
                # If we shrunk it down to nothing
                if not text:
                    continue
            
            # Removes invisible characters
            printable_string = ''.join(char for char in text.strip() if char in printable)
            
            # Converts string to lower case; if matches criteria
            # Note: Keep acroynmns the same, check if 2 or more letters, and all caps
            printable_string = modifyCapitalizationOfWords(printable_string)
            
            #if 'blog' in printable_string:
            #    print("Original Sentence: [{}]".format(words))
            #    print("Blog Word: [{}]".format(printable_string))
            
            if text == chunk.root.text:
                phrase_list.append(printable_string)
            else:
                phrase_list.append(printable_string)
                #phrase_list.append(chunk.root.text.lower())
            
    if debug: print("Final list: {}".format(phrase_list))
    return phrase_list
    
def modifyCapitalizationOfWords(text):
    """
    This function will take the given noun phrase, and adjust captialization as necessary.
    Currently it only retains acronymn capitalization.
    I should ventually add a proper noun list as well.
    """
    
    updated_text = [word if (len(word) >=2) and (word.upper() == word) else word.lower() for word in text.split()]
    
    return ' '.join(updated_text)
    
def isEnglish(text, debug=False):
    
    # Empty String
    if not text.strip():
        return False
    
    try:
        text.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        if debug:
            print("Failed Unicode Detector")
        return False

    try:
        possible_language_list = langdetect.detect_langs(text)
        
        if debug:
            print(possible_language_list)
        
        for entry in possible_language_list:
            if ((entry.lang == 'en') and (entry.prob > .50)):
                return True
    
        return False

    except LangDetectException:
        print("**** Language Exception caught!")
        display("Original Text: [{}]".format(text))
    
    return True

In [8]:
def get_list_of_date_folders(start_date='20180227', end_date='20180326'):
    start_date = datetime.strptime(start_date, '%Y%m%d')
    end_date = datetime.strptime(end_date, '%Y%m%d')
    step = timedelta(days=1)

    list_of_date_folder_names = []

    while start_date <= end_date:
        date_string = start_date.date().strftime('%Y%m%d')
        list_of_date_folder_names.append(date_string)
        start_date += step

    return list_of_date_folder_names

def read_json_as_list(full_file_path):
    all_web_browsing_history = []

    with open(full_file_path, 'r', encoding='utf-8') as f:
        for counter, line in enumerate(f):
            dict_entry = json.loads(line)       
            all_web_browsing_history.append(dict_entry)
                
    return all_web_browsing_history
                

def convert_string_of_json_to_df(list_of_json):
    start_time = datetime.now()
    df = json_normalize(list_of_json)
    print("\tExecution Time: {}".format(datetime.now() - start_time))
    return df

def get_s3_keys(s3, bucket):
    """Get a list of keys in an S3 bucket."""
    keys = []
    resp = s3.list_objects_v2(Bucket=bucket)
    for obj in resp['Contents']:
        keys.append(obj['Key'])
    return keys

### Populate Segment Info

In [9]:
# Populate Segment Information
segment_lookup_df = pd.DataFrame()
json_list = read_json_as_list(USER_SEGMENT_LIST_PATH)
segment_lookup_df = json_normalize(json_list)
display(segment_lookup_df)
segment_lookup_df = segment_lookup_df.set_index(['identifier', 'datasource', 'datasourceindividualpk'])['segmentnames'].apply(pd.Series).stack()
segment_lookup_df = pd.DataFrame(segment_lookup_df)
segment_lookup_df = segment_lookup_df.reset_index().rename(columns={0 : 'segmentName'})

# Switch order of columns
segment_lookup_df = segment_lookup_df[['segmentName', 'identifier', 'datasource', 'datasourceindividualpk']]


if False:
    display(temp_df)
    for index, row in temp_df.groupby('segmentName'):
        print("index")
        display(index)
        print("row")
        display(row)
        print("identifier")
        display(row['identifier'])
        break
        


Unnamed: 0,datasource,datasourceindividualpk,datecreated,datemodified,demographicvaluesmap.additionalName,demographicvaluesmap.address,demographicvaluesmap.addressRegion,demographicvaluesmap.age,demographicvaluesmap.birthDate,demographicvaluesmap.browserInfo,...,demographicvaluesmap.salary,demographicvaluesmap.sfCampaign,demographicvaluesmap.sfCampaignId,demographicvaluesmap.telephone,demographicvaluesmap.whatAreYouBuilding,demographicvaluesmap.whatBusinessChallengesAreYouFacing,demographicvaluesmap.whatWouldYouLikeToTalkAbout,demographicvaluesmap.worksFor,identifier,segmentnames
0,AWI_0f2_q_9uZvuIRBN3,AWMcP1kSgcT3bCtqg0B0,2018-03-29T04:31:07.026Z,2018-05-04T16:12:43.025Z,,,[California],,"[Jan 1, 1970]",[Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11;...,...,,,,[323-555-1212],,,[hello from production!],"[Liferay, Inc.]",AWJEyFZzAxvlVqtdUejP,"[test-country-6, test-country-5, test-country-..."
1,AWI_0f2_q_9uZvuIRBN3,AWMrshzkYcRwJ2HMl9L,2018-03-21T18:19:58.147Z,2018-05-04T15:46:13.251Z,,,,,,,...,,,,,,,,,AWJJygFDNu2tno9FNhiS,"[LRDCOM UAT, test-country-8, test-country-5, t..."
2,AWJJqhlLEfU1zWepDky,ce900bd2-cff6-408e-9f31-2fd8df69aaf2,2018-03-20T19:42:55.167Z,2018-05-04T15:02:28.774Z,,,,,"[Jan 1, 1970]",,...,,,,,,,,,AWJE75a_XXGJek5_6fgG,"[LRDCOM UAT, test-country-8, test-country-5, t..."
3,AWI_0f2_q_9uZvuIRBN3,AWMEF885_nPDtHhujxtt,2018-03-29T04:31:06.833Z,2018-05-04T18:29:23.267Z,,,[null],,"[Jan 1, 1970]",[Mozilla/5.0 (iPhone; CPU iPhone OS 11_2 like ...,...,,,,[859-992-9397],,[Knowledge Management],[Testing form],[Liferay],AWJE6fWZXXGJek5_6dee,"[test-country-6, test-country-5, test-country-..."
4,AWI_0f2_q_9uZvuIRBN3,AWJmhhegDxZZoLqfGlvt,2018-03-20T19:05:29.835Z,2018-05-04T09:39:39.803Z,,,[null],,"[Jan 1, 1970]",[Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_...,...,,,,,[Call Center Application],,,[Liferay],AWJEzVPrAxvlVqtdUg5I,"[test-country-6, test-country-5, test-country-..."
5,AWI_0f2_q_9uZvuIRBN3,AWMV_xFj_nPDtHhuj3Xu,2018-03-20T18:59:34.384Z,2018-05-04T18:16:57.836Z,,,,,"[Jan 1, 1970]",,...,,,,,,,,,AWJExdwAxvlVqtdUeMD,"[test-country-8, test-country-5, test-country-..."
6,AWI_0f2_q_9uZvuIRBN3,AWMfjwNOgcT3bCtqg166,2018-03-28T00:06:48.880Z,2018-05-04T16:39:38.839Z,,[Thaltej],,,"[Jan 1, 1970]",,...,,,,[9825500947],,,,,AWJE8CAv526STMIypds6,"[test-country-6, test-country-5, LRDCOM UAT, t..."
7,AWI_0f2_q_9uZvuIRBN3,AWLieIKmsCV4XVL0I1xJ,2018-03-29T04:30:54.581Z,2018-05-04T09:57:15.658Z,,,[null],,"[Jan 1, 1970]",[Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_...,...,,[70170000001Letn],[70170000001Letn],[669943712],,,[Test sobre nuevo formulario],[Liferay],AWJEyQt2U_NIucVEYIP3,"[test-country-6, test-country-5, test-country-..."
8,AWJESY7Tq_9uZvuIRDlC,1084893,2018-03-20T19:04:16.309Z,2018-05-04T11:14:13.447Z,,,,,"[Jan 1, 1970]",,...,,,,,,,,,AWJEzDS1U_NIucVEYJcz,"[test-country-8, test-country-5, test-country-..."
9,AWJESY7Tq_9uZvuIRDlC,204980,2018-03-20T19:05:29.835Z,2018-05-04T09:39:39.803Z,,,[null],,"[Jan 1, 1970]",[Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_...,...,,,,,[Call Center Application],,,[Liferay],AWJEzVPrAxvlVqtdUg5I,"[test-country-6, test-country-5, test-country-..."


In [10]:
if False: display(segment_lookup_df)

### ETL Functions

In [66]:
import re
import numpy as np
import pandas as pd
from furl import furl

def show_dataframe_length_before_and_after(f, df):
    print("\tBefore: {}".format(len(df)))
    df = f(df)
    print("\tAfter: {}".format(len(df)))
    return df

def keep_only_unload_events(df):
    df = df[df['eventid'] == 'unload']
    return df

def remove_all_bots(df):
    df = df[~df['context.crawler'].str.contains('True', na=False)]
    df = df[~df['context.userAgent'].str.match(BOT_AND_CRAWLER_REGEX, na=False)]
    return df

def remove_non_english_urls(df):
    df = df[~df['context.url'].str.contains(NON_ENGLISH_URL_REGEX, na=False)]
    return df

def populate_url_ignore_list(df):
    
    URL_IGNORE_LIST_MATCH_REGEX_STRING    = '|'.join(['^{}$'.format(s.strip()) for s in URL_IGNORE_LIST_MATCH])
    URL_IGNORE_LIST_CONTAINS_REGEX_STRING = '|'.join(URL_IGNORE_LIST_CONTAINS)

    # TODO: Maybe use 'normalized_url' only?
    df['Ignore URL'] = df['context.url'].str.match(URL_IGNORE_LIST_MATCH_REGEX_STRING) \
                     | df['context.og:url'].str.match(URL_IGNORE_LIST_MATCH_REGEX_STRING) \
                     | df['context.url'].str.match(URL_IGNORE_LIST_CONTAINS_REGEX_STRING) \
                     | df['context.og:url'].str.match(URL_IGNORE_LIST_CONTAINS_REGEX_STRING)
    return df

def remove_non_customer_www_lr_urls(df):
    df = df[df['context.url'].str.contains(WWW_OR_CUSTOMER_LIFERAY_URL_REGEX, na=False)]
    return df

def remove_empty_user_id_entries(df):
    df['userid'].replace('', np.nan, inplace=True)
    df.dropna(subset=['userid'], inplace=True)
    return df

def __removeUrlParameters(url, parameter_list):  
    f = furl(url)
    remaining_parameters = { k: f.args[k] for k in f.args if k not in parameter_list }
    f.args = remaining_parameters    
    return f.url

def populateNormalizedUrlField(df):
    df['normalized_url'] = df['context.og:url'].fillna(df['context.url'])
    df['normalized_url'] = df['normalized_url'].apply(lambda x: __removeUrlParameters(x, ALL_OPTIONAL_URL_PARAMETERS))
    return df

def replaceBlankSpacesWithNan(df):
    # '\s+' is 1 or more
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    return df

def filterUnwantedColumns(df):
    wanted_columns_list = ['eventdate', 
                           'analyticskey', 
                           'userid', 
                           'eventid', 
                           'Ignore URL',
                           'normalized_url',
                           'context.url', 'context.og:url', 
                           'context.title', 'context.og:title', 
                           'context.description', 'context.og:description', 
                           'context.keywords',
                           'context.contentLanguageId',
                           'eventproperties.scrollDepth', 
                           'eventproperties.viewDuration', 
                           'context.userAgent', 
                           'context.platformName', 
                           'context.browserName', 
                           'context.country', 
                           'context.region', 
                           'context.city', 
                           'clientip']
    df = df[wanted_columns_list]
    return df

def convertColumnsToAppropriateDataTypes(df):
    print("Converting eventdate to datetime objects")
    df['eventdate'] = pd.to_datetime(df['eventdate'])
    print("Converting viewDuration to int")
    df['eventproperties.viewDuration'] = pd.to_numeric(df['eventproperties.viewDuration'])
    return df

### Read JSON files and save as DataFrame

In [13]:
%%time

# This is in case an exception is thrown when accessing AWS, E3 data
RETRY_LIMIT = 3

# Plan go through list of directories, and parse in all the relevant JSON files.
BASE_S3_FILE_DIRECTORY = 's3://lcs-ec-analytics/'
start_date = START_DATE_STRING
end_date = END_DATE_STRING

list_of_date_folder_names = get_list_of_date_folders(start_date=start_date, end_date=end_date)
s3 = boto3.client('s3')
keys_list = get_s3_keys(s3, 'lcs-ec-analytics')
full_df = pd.DataFrame()

if True:
    os.write(1, "\nReading JSON files from AWS S3".encode())
    # For reading from AWS S3 sever
    for sub_folder_name in list_of_date_folder_names:
        matching_s3_file_list = [match for match in keys_list if ((sub_folder_name in match) 
                                                                  and (match.endswith('.json')) 
                                                                  and ('user_individual_segment' not in match))]


        if len(matching_s3_file_list) == 0:
            print("[WARNING] - No entries found for this date: {}".format(sub_folder_name))
            print("\n{}".format(matching_s3_file_list))
            os.write(1, "[WARNING] - No entries found for this date: {}".format(sub_folder_name).encode())
            os.write(1, "\n{}".format(matching_s3_file_list).encode())
            continue
        elif len(matching_s3_file_list) >= 2:
            print("[WARNING] - More than 1 entry found for this date: {}".format(sub_folder_name))
            print("Only reading from 1st entry")
            print("\n{}".format(matching_s3_file_list))
            os.write(1, "[WARNING] - More than 1 entry found for this date: {}".format(sub_folder_name).encode())
            continue

        file_path = BASE_S3_FILE_DIRECTORY + matching_s3_file_list[0]
        print("\n{}".format(file_path))
        os.write(1, "\n\t\n{}".format(file_path).encode())
        
        attempts = 0

        while attempts < RETRY_LIMIT:
            try:
                json_list = []
                for line in smart_open.smart_open(file_path):
                    line = line.decode("utf-8").strip()
                    dict_entry = json.loads(line)       
                    json_list.append(dict_entry)

                print("\tEntries: {}".format(len(json_list)))

                if len(json_list) == 0:
                    print("\t[WARNING] 0 entries detected! Skipping...")
                    os.write(1, "\t[WARNING] 0 entries detected! Skipping...".encode())
                    continue

                df = convert_string_of_json_to_df(json_list)

                # XXX: Workaround to improve memory usage
                df = keep_only_unload_events(df)

                full_df = full_df.append(df, ignore_index=True)
                break
                
            except KeyboardInterrupt:
                print("KeyboardInterrupt detected. Aborting...")
                raise KeyboardInterrupt
            except Exception as e: 
                print(str(e))
                print("Unexpected error detected!")                
                attempts += 1
                
                if attempts < RETRY_LIMIT:
                    print("\tRetrying ({})...".format(attempts))
                    os.write(1, "\tRetrying ({})...".format(attempts).encode())
                else:
                    print("\t[ERROR] - Unable to read file after {} retries! Skipping...".format(RETRY_LIMIT))
                    os.write(1, "\t[ERROR] - Unable to read file after {} retries! Skipping...".format(RETRY_LIMIT).encode())
    
# Only execute this if we're reading the JSON files locally
else:
    for sub_folder_name in list_of_date_folder_names:
        directory_name = os.path.join(AMAZON_WEB_SERVICE_E3_BASE_FOLDER, sub_folder_name)

        for filename in os.listdir(directory_name):
            full_directory_and_file_name = os.path.join(directory_name, filename)

            if filename.endswith(".json"): 
                try:
                    print("\n{}".format(full_directory_and_file_name))
                    json_list = read_json_as_list(full_directory_and_file_name)
                    print("\tEntries: {}".format(len(json_list)))
                    
                    if len(json_list) == 0:
                        print("[WARNING] 0 entries detected! Skipping...")
                        continue
                    
                    
                    df = convert_string_of_json_to_df(json_list)

                    # XXX: Workaround to improve memory usage
                    df = keep_only_unload_events(df)

                    full_df = full_df.append(df, ignore_index=True)
                except KeyboardInterrupt:
                    print("KeyboardInterrupt detected. Aborting...")
                except:
                    print("Unexpected error detected!") 
                    
playFinishedSound()


s3://lcs-ec-analytics/export/20180518/part-00000-cf86f54b-c73f-4929-9e2f-d755a9d92f35-c000.json
	Entries: 150773
	Execution Time: 0:00:24.934992

s3://lcs-ec-analytics/export/20180519/part-00000-0c1bdfcc-4d29-4673-b499-c3c658d40b10-c000.json
	Entries: 218835
	Execution Time: 0:00:34.879888

s3://lcs-ec-analytics/export/20180520/part-00000-9f1541d8-38c7-4b8c-87ec-aa483b9be604-c000.json
	Entries: 202868
	Execution Time: 0:00:32.097698
Wall time: 3min 15s


In [14]:
with pd.option_context('display.max_rows', 10, 'display.max_columns', None, 'display.max_colwidth', 50):
    display(full_df['context.contentLanguageId'].unique())
    display(full_df[full_df['context.contentLanguageId'] == 'ru-RU'])

array(['fr-FR', 'es-ES', 'en-US', 'it-IT', 'de-DE', 'zh-CN', 'pt-BR',
       'en-AU', 'ja-JP', nan], dtype=object)

Unnamed: 0,analyticskey,applicationid,clientip,context.browserName,context.canonicalUrl,context.city,context.contentLanguageId,context.country,context.crawler,context.description,context.devicePixelRatio,context.deviceType,context.fb:admins,context.keywords,context.languageId,context.og:description,context.og:image,context.og:title,context.og:type,context.og:url,context.platformName,context.referrer,context.region,context.screenHeight,context.screenWidth,context.title,context.url,context.userAgent,createdate,eventdate,eventid,eventproperties.articleId,eventproperties.depth,eventproperties.entryId,eventproperties.fieldName,eventproperties.fieldTitle,eventproperties.fileEntryId,eventproperties.fileEntryUUID,eventproperties.focusDuration,eventproperties.formId,eventproperties.formTitle,eventproperties.groupId,eventproperties.href,eventproperties.page,eventproperties.pageLoadTime,eventproperties.preview,eventproperties.recordId,eventproperties.scrollDepth,eventproperties.sessionId,eventproperties.src,eventproperties.tagName,eventproperties.text,eventproperties.title,eventproperties.version,eventproperties.viewDuration,partitionkey,userid


In [67]:
%%time

import warnings

os.write(1, "\nStarting ETL Process\n".encode())

# Surpress Warning Messages from "removing non-English URLs"
warnings.filterwarnings("ignore", 'This pattern has match groups')

print("Keeping only UNLOAD events")
etl_df = show_dataframe_length_before_and_after(keep_only_unload_events, full_df)

print("Removing Bots")
etl_df = show_dataframe_length_before_and_after(remove_all_bots, etl_df)

print("Removing Non-English URLs")
etl_df = show_dataframe_length_before_and_after(remove_non_english_urls, etl_df)

print("Removing non-customer, non-www URLs")
etl_df = show_dataframe_length_before_and_after(remove_non_customer_www_lr_urls, etl_df)

print("Removing empty userid entries")
etl_df = show_dataframe_length_before_and_after(remove_empty_user_id_entries, etl_df)

print("Populating normalized_url field")
etl_df = populateNormalizedUrlField(etl_df)

print("Populating URL Ignore List")
etl_df = show_dataframe_length_before_and_after(populate_url_ignore_list, etl_df)
print("Ignoring {} URLs".format(len(etl_df[etl_df['Ignore URL'] == True])))

print("Removing unwanted columns")
etl_df = filterUnwantedColumns(etl_df)

print("Converting columns to appropriate data types")
etl_df = convertColumnsToAppropriateDataTypes(etl_df)

print("Replacing blank spaces with NaN")
etl_df = replaceBlankSpacesWithNan(etl_df)

os.write(1, "\n\tFinished ETL\n".encode())

Keeping only UNLOAD events
	Before: 7812
	After: 7812
Removing Bots
	Before: 7812
	After: 7780
Removing Non-English URLs
	Before: 7780
	After: 5941
Removing non-customer, non-www URLs
	Before: 5941
	After: 5271
Removing empty userid entries
	Before: 5271
	After: 5271
Populating normalized_url field
Populating URL Ignore List
	Before: 5271
	After: 5271
Ignoring 1301 URLs
Removing unwanted columns
Converting columns to appropriate data types
Converting eventdate to datetime objects
Converting viewDuration to int
Replacing blank spaces with NaN
Wall time: 3.44 s


In [68]:
%%time

# Make a copy, and use it
clean_df = etl_df.copy()
clean_df.sort_values('eventdate')
display("Length: {}".format(len(clean_df)))

'Length: 5271'

Wall time: 17.5 ms


### Save URLs for Web Scraping

In [69]:
# Disable for production (for now)
if False:
    url_s = pd.Series(clean_df['normalized_url'].unique()).sort_values()
    print("Number of URLs: {}".format(len(url_s)))
    url_s.to_csv('./output/Unique Visitor URLs.csv', index=False)

### Create DataFrame: URL Lookup Information
This will be the centralized URL to information Data Frame.

In [70]:
%%time
os.write(1, "\tPopulating Lookup Tables".encode())
url_to_title          = clean_df.groupby(['normalized_url'])['context.title'].apply(set)
url_to_og_title       = clean_df.groupby(['normalized_url'])['context.og:title'].apply(set)
url_to_description    = clean_df.groupby(['normalized_url'])['context.description'].apply(set)
url_to_og_description = clean_df.groupby(['normalized_url'])['context.og:description'].apply(set)
url_to_keywords       = clean_df.groupby(['normalized_url'])['context.keywords'].apply(set)

Wall time: 376 ms


In [71]:
def createUrlToKeywordDf():
    columns = ['normalized_url',
           'analyticsclient.merged_title', 
           'analyticsclient.merged_description', 
           'analyticsclient.merged_keywords',
           'analyticsclient.generated_keywords']

    url_to_keyword_df = pd.DataFrame(columns=columns)
    url_to_keyword_df['normalized_url'] = clean_df['normalized_url'].unique()
    #display(url_to_keyword_df)
    return url_to_keyword_df

def generateKeywordsFromTitleDescriptionKeywords(title, og_title, description, og_description, keywords, debug=False):
    merged_title = title.union(og_title)
    merged_description = description.union(og_description)
    
    keywords_from_title = set()
    keywords_from_description = set()
    keywords_from_keywords = set()
    
    only_english_titles = set()
    only_english_descriptions = set()
    only_english_keyword_set = set()
    
    title_description_to_keyword_cache = defaultdict(int)

    for entry in merged_title:

        # Skip empty strings       
        if pd.isnull(entry):
            continue
            
        # remove weird HTML punct
        entry = replace_punctuation(entry)
        
        cached_result = title_description_to_keyword_cache[entry]
        
        if cached_result != 0:
            keywords_from_title.update(cached_result)
            only_english_titles.update([entry])
        elif isEnglish(entry):
            #print("isEnglish() passed")
            #print("entry: ", entry)
            keyword_phrase_list = segmentWordsIntoKeyWordPhraseList(entry, debug=False)
            keywords_from_title.update(keyword_phrase_list)
            only_english_titles.update([entry])
            #print("entry: {}".format(entry))
            #print("only_english_titles: {}".format(only_english_titles))
            
            # Update Cache:
            title_description_to_keyword_cache[entry] = keyword_phrase_list
        else:
            print("Non-English detected: [{}]".format(entry))
            title_description_to_keyword_cache[entry] = []
            continue
    
    for entry in merged_description:        
        # Skip empty strings
        if pd.isnull(entry):
            continue
            
        # remove punct
        entry = replace_punctuation(entry)
        
        cached_result = title_description_to_keyword_cache[entry]
        
        if cached_result != 0:
            keywords_from_description.update(cached_result)
            only_english_descriptions.update([entry])
        elif isEnglish(entry):
            keyword_phrase_list = segmentWordsIntoKeyWordPhraseList(entry)
            keywords_from_description.update(keyword_phrase_list)
            only_english_descriptions.update([entry])
            
            # Update Cache:
            title_description_to_keyword_cache[entry] = keyword_phrase_list
        else:
            print("Non-English detected: [{}]".format(entry))
            title_description_to_keyword_cache[entry] = []
            continue
        
    for entry in keywords:
        
        # Skip empty strings
        if pd.isnull(entry):
            continue
            
        if isEnglish(entry):
            split_list = [s.strip() for s in entry.split(',')]
            keywords_from_keywords.update(set(split_list if split_list else []))
            only_english_keyword_set.update(set(split_list if split_list else []))
        else:
            print("Non-English detected: [{}]".format(entry))
            continue
    
    # Debugging
    if debug:
        print("\n\tMerged Title: {} => {}".format(only_english_titles, keywords_from_title))
        print("\tMerged Descr: {} => {}".format(only_english_descriptions, keywords_from_description))
        print("\tKeywords:     {} => {}".format(only_english_keyword_set, keywords_from_keywords))
        
    # merge all sets together
    all_keywords_merged = keywords_from_keywords.union(keywords_from_title, keywords_from_description)
    if debug: print("\tAll Keywords: {}".format(all_keywords_merged))

    # We return the English list of inputs we processed, and the final keyword output
    return list(only_english_titles), list(only_english_descriptions), list(only_english_keyword_set), list(all_keywords_merged)

def populateUrlToKeywordDf(url_to_keyword_df, debug=False):
    unique_url_list = url_to_keyword_df['normalized_url'].unique()

    for counter, url in enumerate(unique_url_list):       
        title = url_to_title.get(url, set())
        og_title = url_to_og_title.get(url, set())
        description = url_to_description.get(url, set())
        og_description = url_to_og_description.get(url, set())
        keywords_set = url_to_keywords.get(url, set())

        if debug: 
            print('\n{} / {}'.format(counter, len(unique_url_list)))
            print('{}'.format(url))
        merged_title_list, merged_description_list, merged_keyword_list, generated_keyword_list = generateKeywordsFromTitleDescriptionKeywords(title, og_title, description, og_description, keywords_set)

        # Populate url_to_keyword_df, with keywords
        index = url_to_keyword_df.loc[url_to_keyword_df['normalized_url'] == url]
        if len(index.index.values) > 1:
            print("ERROR: There shouldn't be more than 1 entry for the URL list!")
            print("index: {}".format(index))
            print("index.index.values: {}".format(index.index.values))
            break

        if len(merged_title_list) > 0: 
            url_to_keyword_df.at[index.index.values[0], 'analyticsclient.merged_title'] = merged_title_list

        if len(merged_description_list) > 0: 
            url_to_keyword_df.at[index.index.values[0], 'analyticsclient.merged_description'] = merged_description_list

        if len(merged_keyword_list) > 0: 
            url_to_keyword_df.at[index.index.values[0], 'analyticsclient.merged_keywords'] = merged_keyword_list

        url_to_keyword_df.at[index.index.values[0], 'analyticsclient.generated_keywords'] = generated_keyword_list
        
        if counter % 100 == 0:
            print("{} / {}".format(counter, len(unique_url_list)))
        
    return url_to_keyword_df

def addKeywordBoosting(df, debug=True):
    www_lr_manual_keywords = pd.read_csv(WWW_LIFERAY_MANUALLY_GENERATED_KEYWORDS)
    customer_lr_manual_keywords = pd.read_csv(CUSTOMER_LIFERAY_MANUALLY_GENERATED_KEYWORDS)
    all_lr_manual_keywords = www_lr_manual_keywords.append(customer_lr_manual_keywords, ignore_index=True)
    all_lr_manual_keywords = all_lr_manual_keywords[['URL', 'Keywords']]
    all_lr_manual_keywords = all_lr_manual_keywords.dropna(how='any')
    all_lr_manual_keywords['Keywords'] = all_lr_manual_keywords['Keywords'].apply(lambda x: 
                                               [modifyCapitalizationOfWords(s.strip()) for s in x.split(',') if s.strip()])

    # Populate existing url-to-keyword lookup dataframe
    temp_df = pd.merge(df, all_lr_manual_keywords, how='left', left_on='normalized_url', right_on='URL')

    # Rename the "Keywords" column to "manual.keywords"
    temp_df.rename(columns={'Keywords' : 'manual.keywords'}, inplace=True)

    # Rearrange order of columns
    temp_df = temp_df[['normalized_url',
                       'analyticsclient.generated_keywords',
                       'manual.keywords',
                       'analyticsclient.merged_title',
                       'analyticsclient.merged_description', 
                       'analyticsclient.merged_keywords']]

    # Replace analyticsclient.generated_keywords [] with NaN
    temp_df.loc[temp_df['analyticsclient.generated_keywords'].str.len() == 0, 'analyticsclient.generated_keywords'] = np.nan

    # Filter out URLs where the "Automatically Generated Keywords" or "Manually generated Keywords" are missing
    if debug:
        print("Removing entries where both auto & manually generated keywords are missing")
        print("Before: {}".format(len(temp_df)))
    
    with pd.option_context('display.max_rows', 200, 'display.max_columns', None, 'display.max_colwidth', 50):
        display(temp_df)

    temp_df = temp_df[(~temp_df['analyticsclient.generated_keywords'].isnull()) | (~temp_df['manual.keywords'].isnull())]

    #with pd.option_context('display.max_rows', 200, 'display.max_columns', None, 'display.max_colwidth', 50):
    #    display(temp_df)

    if debug:
        print("After: {}".format(len(temp_df)))

    return temp_df

def generateUrlToKeywordDict(df, keyword_types=[''], use_banned_word_list=True, debug=True):
    """
    TODO:
    There will be multiple options for what type of keywords you can select from
    * manual - these are the tags manually added (there aren't that many of these)
    * title_description_keyword - these are the tags provided by the metadata
    * web_scraping - these are the tags generated by web scraping
    """
    import numpy
    
    # Add new empty column to df, for storing the combined keywords
    df['combined keywords'] = np.nan
    df['combined keywords'] = df['combined keywords'].astype(object)
    
    url_s = df['normalized_url'].unique()
    url_lookup_cache = dict()
    no_keywords_urls = []
    
    for counter, url in enumerate(url_s):
        
        if debug: 
            print("\n{} / {} - {}".format(counter, len(url_s), url))
        generated_keyword_list          = df.loc[df['normalized_url'] == url]['analyticsclient.generated_keywords'].values.tolist()
        manually_populated_keyword_list = df.loc[df['normalized_url'] == url]['manual.keywords'].values.tolist()

        # Filter [nan] scenarios
        if numpy.nan in generated_keyword_list:
            generated_keyword_list = []
        elif len(generated_keyword_list) >= 1:
            generated_keyword_list = generated_keyword_list[0]

        if numpy.nan in manually_populated_keyword_list:
            manually_populated_keyword_list = []
        elif len(manually_populated_keyword_list) >= 1:
            manually_populated_keyword_list = manually_populated_keyword_list[0]

        aggregate_keyword_list = list(set(generated_keyword_list).union(set(manually_populated_keyword_list)))

        if use_banned_word_list:
            aggregate_keyword_list = [w for w in aggregate_keyword_list if w.lower() not in BANNED_KEYWORDS_LIST]

        # Cache result
        url_lookup_cache[url] = aggregate_keyword_list
        if debug:
            print("\t{}".format(aggregate_keyword_list))
        
        if not aggregate_keyword_list:
            print("\tWarning: [{}] has 0 entries!".format(url))
            no_keywords_urls.append(url)
            
        # Add the entry back to the dataframe     
        index = df.loc[df['normalized_url'] == url]
        df.at[index.index.values[0], 'combined keywords'] = aggregate_keyword_list
            
    return url_lookup_cache, df, no_keywords_urls

# For Debugging
def lookUpKeywordBreakdownBasedOnUrl(url):
    title = url_to_title.get(url, set())
    og_title = url_to_og_title.get(url, set())
    description = url_to_description.get(url, set())
    og_description = url_to_og_description.get(url, set())
    keywords_set = url_to_keywords.get(url, set())
    
    print("Title: {}".format(title))
    print("og_title: {}".format(og_title))
    print("description: {}".format(description))
    print("og_description: {}".format(og_description))
    print("keywords: {}".format(keywords_set))
    
def generateKeywordToIndividualKeywordList(url_to_keyword_df):
    url_to_keyword_df = url_to_keyword_df[['normalized_url', 'combined keywords']]

    # Expand each normalized_url, into its own keyword row
    expanded_keywords_df = url_to_keyword_df['combined keywords'].apply(lambda x: pd.Series(x))

    url_to_unique_keyword_df = pd.DataFrame()

    for index, row in expanded_keywords_df.iterrows():
        row_df = row.dropna().to_frame(name='unique keyword')
        row_df['normalized_url'] = url_to_keyword_df['normalized_url'].loc[index]   
        url_to_unique_keyword_df = url_to_unique_keyword_df.append(row_df, ignore_index=True)

        if index % 500 == 0:
            print("{} / {}".format(index, len(expanded_keywords_df)))
    
    return url_to_unique_keyword_df

### Populate URL to Information Dataframe


I don't know why this is so resource intensive...
Maybe because of remove punctuation function?

In [72]:
%%time

os.write(1, "\nStarting keyword generation".encode())
url_to_keyword_df = createUrlToKeywordDf()
url_to_keyword_df = populateUrlToKeywordDf(url_to_keyword_df)

url_to_keyword_df = addKeywordBoosting(url_to_keyword_df)
url_lookup_cache, url_to_keyword_df, urls_without_keywords_list = generateUrlToKeywordDict(url_to_keyword_df)
url_to_unique_keyword_df = generateKeywordToIndividualKeywordList(url_to_keyword_df)
os.write(1, "\n\tFinished keyword generation".encode())

0 / 869
Non-English detected: [Liferay : vers une expérience digitale personnalisée]
Non-English detected: [Software per la digital experience, su misura per te.]
Non-English detected: [Liferay: Software de experiência digital sob medida para seu negócio]
Non-English detected: [Liferay: Digital Experience Software für Ihre Anforderungen.]
Non-English detected: [Liferay: Software de experiencia digital adaptado a tus necesidades.]
Non-English detected: [Liferay est un éditeur de logiciels open source permettant à ses clients de créer des expériences digitales personnalisées à travers le web, mobile et objets connectés.]
Non-English detected: [Portale, Intranets, Websites und durchgängige digitale Erlebnisse auf einer Plattform.]
Non-English detected: [Crie portais, intranets, sites e experiências conectadas com a plataforma mais flexível.]
Non-English detected: [Crea portali moderni, intranet, siti web ed esperienze connesse sulla piattaforma più flessibile disponibile sul mercato.]
Non

Non-English detected: [Venue | Liferay]
Non-English detected: [Venue | Liferay]
Non-English detected: [More than “free software,” open source describes a development culture and way of doing business that has changed the landscape of software development. This document outlines the benefits of open source software for the enterprise and also provides criteria for evaluating and choosing an open source solution.]
Non-English detected: [OSB WWW Marketing Events Users]
Non-English detected: [ČSOB]
Non-English detected: [Delvag nutzt Liferay für seine Webportale]
Non-English detected: [Nach dem Upgrade auf die aktuellste Version von Liferay Portal lassen sich bei der Delvag Group Inhalte für das Internet deutlich effizienter bearbeiten und die Mitarbeiterakzeptanz im internen Webportal hat sich erhöht. Die erweiterbare und flexible Liferay Plattform hat sich sehr gut bewährt.]
Non-English detected: [EATEL | Liferay]
Skipping, contains CD
Skipping, contains CD
Non-English detected: [Illin

Unnamed: 0,normalized_url,analyticsclient.generated_keywords,manual.keywords,analyticsclient.merged_title,analyticsclient.merged_description,analyticsclient.merged_keywords
0,https://customer.liferay.com/c/portal/layout?p...,"[documentation, portal, ""liferay, knowledge]",,"[Documentation - Knowledge - Portal, Documenta...",,
1,https://www.liferay.com/,"[connected experiences, digital experience sof...","[liferay, DXP]",[Liferay: Digital experience software tailored...,"[Build modern portals, intranets, websites and...",
2,https://www.liferay.com/contact-us,"[contact us, liferay]","[liferay company, contact us, company info]",[Contact Us | Liferay],,
3,https://customer.liferay.com/documentation/kno...,"[knowledge base, portal, ""liferay, knowledge]",,"[Knowledge Base - Knowledge - Portal, Knowledg...",,
4,https://customer.liferay.com/documentation,"[documentation, portal, ""liferay, knowledge]",,"[Documentation - Knowledge - Portal, Documenta...",,
5,https://www.liferay.com/digital-experience-pla...,"[liferay digital experience platform, usable d...","[liferay DXP, features, dxp software]",[Liferay Digital Experience Platform (DXP)],"[A single, consolidated platform for building ...",
6,https://customer.liferay.com/documentation/sea...,"[portal, ""liferay, knowledge, search]",,"[Search - Knowledge - Portal, Search - Knowled...",,
7,https://www.liferay.com/downloads/thanks-for-d...,"[others, millions, thanks, liferay community, ...",,[Thanks for Downloading Liferay],[You have joined millions of others in downloa...,
8,https://www.liferay.com/careers,"[crazy ship, liferay, careers, amp, open seas,...",,"[Liferay Jobs &amp; Careers, Liferay Jobs & Ca...","[At Liferay, we're looking for fiery-eyed zeal...",
9,https://www.liferay.com/downloads,"[liferay downloads, download liferay DXP, soft...","[liferay downloads, open source projects, life...",[Liferay Downloads - https://www.liferay.com/d...,"[Download Liferay DXP, Liferay Portal, and oth...",


After: 756

0 / 756 - https://customer.liferay.com/c/portal/layout?p_l_id=82890&p_v_l_s_g_id=0
	['documentation', 'portal']

1 / 756 - https://www.liferay.com/
	['connected experiences', 'digital experience software', 'DXP', 'intranets', 'modern portals', 'websites']

2 / 756 - https://www.liferay.com/contact-us
	['liferay company', 'company info', 'contact us']

3 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/2951285
	['knowledge base', 'portal']

4 / 756 - https://customer.liferay.com/documentation
	['documentation', 'portal']

5 / 756 - https://www.liferay.com/digital-experience-platform
	['liferay digital experience platform', 'usable digital experiences customers', 'liferay DXP', 'dxp software', 'DXP', 'single, consolidated platform']

6 / 756 - https://customer.liferay.com/documentation/search?p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=saml&p_p_id=1_WAR_osbknowledgebaseportlet
	['portal']

7 / 756 - https://www.liferay.com/downloads/thanks-f

50 / 756 - https://www.liferay.com/downloads/liferay-portal/additional-files-beta
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

51 / 756 - https://www.liferay.com/resource?folderId=3292406&title=Liferay+DXP+New+Features+Summary
	['liferay digital experience platform', "liferay dxp's key features", 'wide array', 'new features', 'overview', 'summary', 'improved functionality', 'application', 'liferay digital experience platform new features summary', 'modular architecture', 'updated audience']

52 / 756 - https://www.liferay.com/resource?folderId=13811&title=Three+Key+Strategies+for+Consistent+Customer+Experiences
	['ability', 'customers', 'information', 'brand', 'channels', 'consistent user experience', 'purchasing decisions', 'paper', 'other words', 'consistent customer experiences', 'digital blueprint']

53 / 756 - https://customer.liferay.com/documentation/7.0/deploy/-/official_documentation/deployment/using-lcs
	['dashboard', '

94 / 756 - https://customer.liferay.com/documentation/search?p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=cache-control&p_p_id=1_WAR_osbknowledgebaseportlet
	['portal']

95 / 756 - https://www.liferay.com/solutions/industries/banking
	['intranet', 'portal', 'financial services', 'liferay portal & intranet solutions', 'improved customer experience', 'website software', 'banks', 'website', 'wealth management', 'innovation', 'financial services intranet', 'financial firms']

96 / 756 - https://www.liferay.com/services/training
	['liferay training', 'start', 'liferay best practices', 'time', 'deliver results']

97 / 756 - https://www.liferay.com/services/training/topics/fundamentals
	['liferay fundamentals course', 'box', 'liferay training']

98 / 756 - https://customer.liferay.com/documentation/search?p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=web+content+folder&p_p_id=1_WAR_osbknowledgebaseportlet
	['portal']

99 / 756 - https://customer.liferay.com/documentatio


105 / 756 - https://www.liferay.com/home?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords=cookies+notification
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

106 / 756 - https://customer.liferay.com/web/knowledge/documentation
	['documentation', 'portal']

107 / 756 - https://customer.liferay.com/documentation/7.0/deploy/-/official_documentation/deployment/introduction-to-product-deployment
	['liferay DXP deployment', 'portal', 'deployment']

108 / 756 - https://www.liferay.com/downloads/liferay-portal/community-plugins/-/software_catalog/products/8340396
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

109 / 756 - https://www.liferay.com/downloads/liferay-portal/community-plugins/-/software_catalog/products/
	['connected experiences', 'digital experience software', 'intranets',

141 / 756 - https://www.liferay.com/en/web/events-ldsf-dach-new/home?p_p_col_pos=1&p_p_mode=view&p_p_id=56_INSTANCE_zdHKtlkEaekz&p_p_state=normal&p_p_col_count=9&p_p_lifecycle=0&p_p_col_id=column-1
	['home']

142 / 756 - https://www.liferay.com/en/web/events-ldsf-dach-new/home?_15_version=1.2&_15_folderId=0&p_p_state=maximized&_15_struts_action=/journal/edit_article&p_p_col_count=9&p_p_col_pos=3&_15_redirect=https://www.liferay.com:443/en/web/events-ldsf-dach-new/home?p_p_id=56_INSTANCE_46E6PClxyLk1%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26p_p_col_id=column-1%26p_p_col_pos=3%26p_p_col_count=9&p_p_mode=view&_15_articleId=231916275&p_p_auth=WQdk132g&_15_groupId=231915721&p_p_lifecycle=0&p_p_id=15&p_p_col_id=column-1
	['home']

143 / 756 - https://www.liferay.com/en/web/events-ldsf-dach-new/home?p_p_col_pos=3&p_p_mode=view&p_p_id=56_INSTANCE_46E6PClxyLk1&p_p_state=normal&p_p_col_count=9&p_p_lifecycle=0&p_p_col_id=column-1
	['home']

144 / 756 - https://www.liferay.com/en/web

167 / 756 - https://www.liferay.com/group/control_panel/manage?_15_struts_action=/journal/edit_article&p_p_lifecycle=0&p_p_state=maximized&p_p_id=15&controlPanelCategory=current_site.content&refererPlid=231354466&_15_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=dod4xydn%26p_p_id=15%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=10182%26refererPlid=231354466%26controlPanelCategory=current_site.content%26_15_advancedSearch=0%26_15_viewFolders=1%26_15_viewEntries=1%26_15_keywords=ldsf%26_15_status=%26_15_folderStart=0%26_15_searchArticleId=%26_15_searchType=1%26_15_folderEnd=20%26_15_type=%26_15_andOperator=%26_15_folderId=0%26_15_entryStart=0%26_15_displayStyle=%26_15_entryEnd=20%26_15_content=%26_15_title=%26_15_description=%26_15_struts_action=%252Fjournal%252Fsearch%26_15_searchFolderId=0%26_15_showSearchInfo=1&doAsGroupId=10182&p_p_mode=view&p_p_auth=dod4xydn&_15_folderId=16763354&_15_groupId=10182&_15_articleId=25524680
	['web conte


184 / 756 - https://www.liferay.com/en/group/events-demo-site/symposium?_15_version=1.5&_15_articleId=231932303&p_p_id=15&p_p_state=maximized&_15_struts_action=/journal/edit_article&p_p_col_count=8&p_p_col_pos=6&_15_redirect=https://www.liferay.com:443/en/group/events-demo-site/symposium?p_p_id=56_INSTANCE_w6oKvwCjhElT%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26p_p_col_id=column-1%26p_p_col_pos=6%26p_p_col_count=8&p_p_mode=view&p_p_auth=Uy31nAk9&_15_folderId=231932789&_15_groupId=231929011&p_p_lifecycle=0&p_p_col_id=column-1
	[]

185 / 756 - https://www.liferay.com/en/group/events-demo-site/symposium?p_p_col_pos=6&p_p_mode=view&p_p_id=56_INSTANCE_w6oKvwCjhElT&p_p_state=normal&p_p_col_count=8&p_p_lifecycle=0&p_p_col_id=column-1
	[]

186 / 756 - https://www.liferay.com/en/group/events-demo-site/symposium?_15_version=1.6&_15_articleId=231932303&p_p_id=15&p_p_state=maximized&_15_struts_action=/journal/edit_article&p_p_col_count=8&p_p_col_pos=6&_15_redirect=https://www.liferay.

	['portal', 'deployment', 'elasticsearch']

200 / 756 - https://customer.liferay.com/documentation/7.0/deploy/-/official_documentation/deployment/upgrading-to-liferay-7
	['quartz', 'upgrade', 'permission algorithm', 'deployment', 'checklist', 'upgrading', 'social office', 'liferay DXP', 'backup data', 'portal', 'index']

201 / 756 - https://customer.liferay.com/documentation/search?p_p_mode=view&_1_WAR_osbknowledgebaseportlet_mvcPath=/search/view.jsp&p_p_id=1_WAR_osbknowledgebaseportlet&p_p_state=normal&p_p_lifecycle=0&_1_WAR_osbknowledgebaseportlet_assetCategoryIds=80679
	['portal']

202 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/2848949
	['knowledge base', 'portal']

203 / 756 - https://www.liferay.com/web/events-symposium-spain-new
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

204 / 756 - https://www.liferay.com/web/events-symposium-spain
	['spain liferay symposium']

205 / 756 - https://www.liferay.

218 / 756 - https://www.liferay.com/group/control_panel/manage?_15_struts_action=/journal/edit_article&_15_structureId=496430&p_p_state=maximized&p_p_id=15&controlPanelCategory=current_site.content&p_p_lifecycle=0&_15_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=t3BHhr0f%26p_p_id=15%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=10182%26refererPlid=11198%26controlPanelCategory=current_site.content%26_15_entryEnd=20%26_15_displayStyle=%26_15_viewEntries=1%26_15_viewFolders=1%26_15_folderStart=0%26_15_action=browseFolder%26_15_struts_action=%252Fjournal%252Fview%26_15_folderEnd=20%26_15_entryStart=0%26_15_folderId=1721363&doAsGroupId=10182&p_p_mode=view&p_p_auth=t3BHhr0f&_15_folderId=1721363&_15_groupId=10182&refererPlid=11198
	['web content']

219 / 756 - https://customer.liferay.com/documentation/search?p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=server+administration&p_p_id=1_WAR_osbknowledgebaseportlet
	['server administ

234 / 756 - https://www.liferay.com/group/control_panel/manage?_174_p_u_i_d=3536791&_174_tabs2=current&_174_tabs1=users&p_p_state=maximized&p_p_id=174&controlPanelCategory=current_site.pages&_174_struts_action=/site_memberships_admin/edit_site_assignments&p_p_lifecycle=0&_174_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=7Tgt15fF%26p_p_id=174%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231354466%26controlPanelCategory=current_site.pages%26_174_cur=0&doAsGroupId=231929011&p_p_mode=view&p_p_auth=7Tgt15fF&_174_cur=0&_174_groupId=231929011&refererPlid=231354466
	['site memberships']

235 / 756 - https://www.liferay.com/group/control_panel/manage?_174_p_u_i_d=139518933&_174_tabs1=users&p_p_state=maximized&p_p_id=174&controlPanelCategory=current_site.pages&_174_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=7Tgt15fF%26p_p_id=174%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAs

245 / 756 - https://www.liferay.com/group/control_panel/manage?_174_tabs2=available&_174_tabs1=users&p_p_state=maximized&p_p_id=174&controlPanelCategory=current_site.pages&_174_struts_action=/site_memberships_admin/edit_site_assignments&p_p_lifecycle=0&_174_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=7Tgt15fF%26p_p_id=174%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231354466%26controlPanelCategory=current_site.pages%26_174_cur=0%26_174_cur=0&doAsGroupId=231929011&p_p_mode=view&p_p_auth=7Tgt15fF&refererPlid=231354466
	['site memberships']

246 / 756 - https://www.liferay.com/group/control_panel/manage?_174_tabs2=available&_174_tabs1=users&p_p_state=maximized&p_p_id=174&controlPanelCategory=current_site.pages&_174_struts_action=/site_memberships_admin/edit_site_assignments&p_p_lifecycle=0&_174_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=7Tgt15fF%26p_p_id=174%26p_p_lifecycle=0%26p_p

252 / 756 - https://www.liferay.com/group/control_panel/manage?_174_tabs2=available&_174_tabs1=users&p_p_state=maximized&p_p_id=174&controlPanelCategory=current_site.pages&_174_struts_action=/site_memberships_admin/edit_site_assignments&p_p_lifecycle=0&_174_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=7Tgt15fF%26p_p_id=174%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231354466%26controlPanelCategory=current_site.pages%26_174_cur=0%26_174_cur=0%26_174_cur=0&doAsGroupId=231929011&p_p_mode=view&p_p_auth=7Tgt15fF&_174_groupId=231929011&refererPlid=231354466
	['site memberships']

253 / 756 - https://www.liferay.com/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_orderByType=asc&_1_WAR_osbwwwmarketingeventsportlet_advancedSearch=false&_1_WAR_osbwwwmarketingeventsportlet_orderByCol=start-date&_1_WAR_osbwwwmarketingeventsportlet_resetCur=false&p_p_id=1_WAR_osbwwwmarketingeventsportlet&p_p_state=maximiz

261 / 756 - https://www.liferay.com/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_marketingEventSessionId=231902867&_1_WAR_osbwwwmarketingeventsportlet_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_id=1_WAR_osbwwwmarketingeventsportlet%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231869941%26refererPlid=160627076%26controlPanelCategory=current_site.content%26_1_WAR_osbwwwmarketingeventsportlet_mvcPath=%252Fadmin%252Fedit_marketing_event.jsp%26_1_WAR_osbwwwmarketingeventsportlet_tabs1=sessions%26_1_WAR_osbwwwmarketingeventsportlet_redirect=%252Fgroup%252Fcontrol_panel%252Fmanage%253FcontrolPanelCategory%253Dcurrent_site.content%2526doAsGroupId%253D231869941%2526p_p_id%253D1_WAR_osbwwwmarketingeventsportlet%2526p_p_lifecycle%253D0%2526p_p_mode%253Dview%2526p_p_state%253Dmaximized%2526refererPlid%253D160627076%26_1_WAR_osbwwwmarketingeventsportlet_marketingEventId=231868441%26_1_WAR_osbwwwmarketingeventsportlet_delta=20%26_1_

271 / 756 - https://www.liferay.com/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_id=1_WAR_osbwwwmarketingeventsportlet%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231869941%26refererPlid=160627076%26controlPanelCategory=current_site.content%26_1_WAR_osbwwwmarketingeventsportlet_mvcPath=%252Fadmin%252Fedit_marketing_event.jsp%26_1_WAR_osbwwwmarketingeventsportlet_tabs1=sessions%26_1_WAR_osbwwwmarketingeventsportlet_redirect=%252Fgroup%252Fcontrol_panel%252Fmanage%253FcontrolPanelCategory%253Dcurrent_site.content%2526doAsGroupId%253D231869941%2526p_p_id%253D1_WAR_osbwwwmarketingeventsportlet%2526p_p_lifecycle%253D0%2526p_p_mode%253Dview%2526p_p_state%253Dmaximized%2526refererPlid%253D160627076%26_1_WAR_osbwwwmarketingeventsportlet_marketingEventId=231868441%26_1_WAR_osbwwwmarketingeventsportlet_cur=1%26_1_WAR_osbwwwmarketingeventsportlet_delta=20%26_1_WAR_osbwwwmarketingeventsp

280 / 756 - https://www.liferay.com/en_AU/digital-experience-platform
	['liferay digital experience platform', 'usable digital experiences customers', 'DXP', 'single, consolidated platform']

281 / 756 - https://www.liferay.com/en_AU/resources?93350categoryIds=93352
	['IT', "liferay's resource library", 'IT and business leaders', 'whitepapers', 'business', 'leadership resources', 'ebooks', 'case studies']

282 / 756 - https://www.liferay.com/en/group/events-demo-site/symposium?_15_version=1.0&p_p_id=15&p_p_state=maximized&_15_struts_action=/journal/edit_article&p_p_lifecycle=0&p_p_col_pos=7&_15_redirect=https://www.liferay.com:443/en/group/events-demo-site/symposium?p_p_id=56_INSTANCE_x5KYEyx3YzAm%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26p_p_col_id=column-1%26p_p_col_pos=7&p_p_mode=view&_15_articleId=231934868&_15_folderId=231932941&_15_groupId=231929011&p_p_col_id=column-1&p_p_auth=Uy31nAk9
	[]

283 / 756 - https://www.liferay.com/group/control_panel?p_p_id=137&refererPl


305 / 756 - https://www.liferay.com/en/group/events-demo-site/symposium?p_p_col_pos=8&p_p_mode=view&p_p_id=56_INSTANCE_x5KYEyx3YzAm&p_p_state=normal&p_p_col_count=9&p_p_lifecycle=0&p_p_col_id=column-1
	[]

306 / 756 - https://www.liferay.com/en/group/control_panel/manage?_15_struts_action=/journal/edit_article&_15_structureId=231908325&p_p_state=maximized&p_p_id=15&controlPanelCategory=current_site.content&p_p_lifecycle=0&_15_redirect=https://www.liferay.com:443/en/group/control_panel/manage?p_p_auth=LI2kerFT%26p_p_id=15%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231929017%26controlPanelCategory=current_site.content%26_15_entryEnd=20%26_15_displayStyle=%26_15_viewEntries=1%26_15_viewFolders=1%26_15_folderStart=0%26_15_action=browseFolder%26_15_struts_action=%252Fjournal%252Fview%26_15_folderEnd=20%26_15_entryStart=0%26_15_folderId=231932789&doAsGroupId=231929011&p_p_mode=view&p_p_auth=LI2kerFT&_15_folderId=231932789&_15_groupId=231929

320 / 756 - https://www.liferay.com/group/control_panel/manage?_15_struts_action=/journal/edit_article&p_p_lifecycle=0&p_p_state=maximized&p_p_id=15&controlPanelCategory=current_site.content&refererPlid=231929015&_15_redirect=https://www.liferay.com:443/group/control_panel/manage?p_p_auth=BKLN7zz7%26p_p_id=15%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=10195%26refererPlid=231929015%26controlPanelCategory=current_site.content&doAsGroupId=10195&p_p_mode=view&p_p_auth=BKLN7zz7&_15_folderId=0&_15_groupId=10195
	['web content']

321 / 756 - https://www.liferay.com/group/control_panel/manage?doAsGroupId=10195&p_p_mode=view&p_p_auth=BKLN7zz7&p_p_id=15&p_p_state=maximized&p_p_lifecycle=1&controlPanelCategory=current_site.content&_15_struts_action=/journal/edit_article&refererPlid=231929015
	['web content']

322 / 756 - https://customer.liferay.com/documentation/7.0/deploy/-/official_documentation/deployment/managing-liferay-with-liferay-connected-services
	['dashboard

349 / 756 - https://customer.liferay.com/documentation/search?p_p_mode=view&_1_WAR_osbknowledgebaseportlet_mvcPath=/search/view.jsp&p_p_id=1_WAR_osbknowledgebaseportlet&p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=upgrading&p_p_lifecycle=0&_1_WAR_osbknowledgebaseportlet_permissionTypes=2&_1_WAR_osbknowledgebaseportlet_assetCategoryIds=80889
	['portal']

350 / 756 - https://customer.liferay.com/documentation/search?p_p_mode=view&_1_WAR_osbknowledgebaseportlet_mvcPath=/search/view.jsp&p_p_id=1_WAR_osbknowledgebaseportlet&p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=upgrading&p_p_lifecycle=0&_1_WAR_osbknowledgebaseportlet_permissionTypes=2&_1_WAR_osbknowledgebaseportlet_assetCategoryIds=80679
	['portal']

351 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/1466628
	['knowledge base', 'portal']

352 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/339518
	['knowledge base', 'portal']

353 / 756 - https://www.liferay.com/r

388 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/14673
	['knowledge base', 'portal']

389 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/124296
	['knowledge base', 'portal']

390 / 756 - https://customer.liferay.com/
	['documentation', 'portal']

391 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/2119411
	['knowledge base', 'portal']

392 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/29162
	['knowledge base', 'portal']

393 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/309969
	['SLO', 'knowledge base', 'idp', 'SSO', 'liferay SAML', 'liferay DXP', 'portal', 'SP']

394 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/14694
	['knowledge base', 'portal']

395 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/2752274
	['knowledge base', 'portal']

396 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/38442


409 / 756 - https://www.liferay.com/en/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_redirect=https://www.liferay.com:443/en/group/control_panel/manage?p_p_id=1_WAR_osbwwwmarketingeventsportlet%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231929017%26controlPanelCategory=current_site.content%26_1_WAR_osbwwwmarketingeventsportlet_mvcPath=%252Fadmin%252Fedit_marketing_event.jsp%26_1_WAR_osbwwwmarketingeventsportlet_tabs1=sponsors%26_1_WAR_osbwwwmarketingeventsportlet_redirect=%252Fen%252Fgroup%252Fcontrol_panel%252Fmanage%253Fp_p_id%253D1_WAR_osbwwwmarketingeventsportlet%2526p_p_lifecycle%253D0%2526p_p_state%253Dmaximized%2526p_p_mode%253Dview%2526doAsGroupId%253D231929011%2526refererPlid%253D231929017%2526controlPanelCategory%253Dcurrent_site.content%26_1_WAR_osbwwwmarketingeventsportlet_marketingEventId=231931129%26_1_WAR_osbwwwmarketingeventsportlet_delta1=20%26_1_WAR_osbwwwmarketingeventsportlet_keywords=%26_1_WAR_osb


412 / 756 - https://www.liferay.com/en/group/control_panel/manage?_20_groupId=231929011&doAsGroupId=231929011&p_p_mode=view&_20_folderId=231932562&_20_struts_action=/document_library/select_file_entry&p_p_id=20&p_p_state=pop_up&p_p_lifecycle=0&controlPanelCategory=current_site.content&refererPlid=231929017
	['documents', 'media']

413 / 756 - https://www.liferay.com/en/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_delta1=20&p_p_id=1_WAR_osbwwwmarketingeventsportlet&p_p_state=maximized&_1_WAR_osbwwwmarketingeventsportlet_cur1=1&_1_WAR_osbwwwmarketingeventsportlet_marketingEventId=231931129&controlPanelCategory=current_site.content&p_p_lifecycle=0&_1_WAR_osbwwwmarketingeventsportlet_redirect=/en/group/control_panel/manage?p_p_id=1_WAR_osbwwwmarketingeventsportlet%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231929017%26controlPanelCategory=current_site.content&doAsGroupId=231929011&_1_WAR_osbwwwmarketingeventsportlet_adva

419 / 756 - https://www.liferay.com/resources?_166_refererPortletName=183&_166_structureAvailableFields=_166_getAvailableFields&p_p_lifecycle=0&p_p_state=pop_up&_166_classNameId=10083&p_p_id=166&_166_redirect=https://www.liferay.com:443/resources?p_p_id=166%26p_p_lifecycle=0%26p_p_state=pop_up%26p_p_mode=view%26doAsGroupId=10182%26_166_refererPortletName=183%26_166_scopeTitle=Application%2BDisplay%2BTemplates%26_166_groupId=10182%26_166_showHeader=false%26_166_classNameId=10083%26_166_eventName=selectStructure%26_166_struts_action=%252Fdynamic_data_mapping%252Fview_template&_166_classPK=0&doAsGroupId=10182&p_p_mode=view&_166_struts_action=/dynamic_data_mapping/edit_template&_166_templateId=149699&_166_type=display&_166_scopeTitle=Application+Display+Templates&_166_groupId=10182
	['ebooks', 'whitepapers', "liferay's resource library", 'case studies', 'IT and business leaders']

420 / 756 - https://www.liferay.com/resources?_166_refererPortletName=183&p_p_lifecycle=0&p_p_state=pop_up&_16

433 / 756 - https://www.liferay.com/en/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_redirect=https://www.liferay.com:443/en/group/control_panel/manage?p_p_id=1_WAR_osbwwwmarketingeventsportlet%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231929011%26refererPlid=231929017%26controlPanelCategory=current_site.content%26_1_WAR_osbwwwmarketingeventsportlet_mvcPath=%252Fadmin%252Fedit_marketing_event.jsp%26_1_WAR_osbwwwmarketingeventsportlet_tabs1=sponsors%26_1_WAR_osbwwwmarketingeventsportlet_redirect=%252Fen%252Fgroup%252Fcontrol_panel%252Fmanage%253Fp_p_id%253D1_WAR_osbwwwmarketingeventsportlet%2526p_p_lifecycle%253D0%2526p_p_state%253Dmaximized%2526p_p_mode%253Dview%2526doAsGroupId%253D231929011%2526refererPlid%253D231929017%2526controlPanelCategory%253Dcurrent_site.content%26_1_WAR_osbwwwmarketingeventsportlet_marketingEventId=231931129%26_1_WAR_osbwwwmarketingeventsportlet_delta1=20%26_1_WAR_osbwwwmarketingeventsportlet_keywords=%26_1_WAR_osbw


437 / 756 - https://www.liferay.com/resource?folderId=1646951&title=Four+Types+of+Portals+That+Solve+Enterprise+Problems
	['internal service portals', 'portals', 'discover', 'social collaboration portals', 'that solve enterprise problems', 'self service portals', 'partner portals', 'digital business', 'leading enterprises', 'common problems']

438 / 756 - https://www.liferay.com/en/group/control_panel/manage?_1_WAR_osbwwwmarketingeventsportlet_redirect=https://www.liferay.com:443/en/group/control_panel/manage?p_p_id=1_WAR_osbwwwmarketingeventsportlet%26p_p_lifecycle=0%26p_p_state=maximized%26p_p_mode=view%26doAsGroupId=231887827%26refererPlid=231887897%26controlPanelCategory=current_site.content%26_1_WAR_osbwwwmarketingeventsportlet_mvcPath=%252Fadmin%252Fedit_marketing_event.jsp%26_1_WAR_osbwwwmarketingeventsportlet_tabs1=sponsors%26_1_WAR_osbwwwmarketingeventsportlet_redirect=%252Fen%252Fgroup%252Fcontrol_panel%252Fmanage%253Fp_p_id%253D1_WAR_osbwwwmarketingeventsportlet%2526p_p_lif

446 / 756 - https://www.liferay.com/en/group/events-demo-site/devcon?_86_portletResource=56_INSTANCE_Uryrcp87acn0&_86_resourcePrimKey=231929019_LAYOUT_56_INSTANCE_Uryrcp87acn0&_86_&_86_struts_action=/portlet_configuration/edit_configuration&p_p_lifecycle=0&p_p_state=pop_up&p_p_id=86&_86_returnToFullPageURL=/en/group/events-demo-site/devcon?p_p_id=56_INSTANCE_FOOTER%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26_56_INSTANCE_FOOTER_articleId=231929077%26_56_INSTANCE_FOOTER_articleId=231929077%26_56_INSTANCE_FOOTER_articleId=231929077%26_56_INSTANCE_FOOTER_articleId=231929077&p_p_col_count=7&p_p_col_pos=4&_86_redirect=/en/group/events-demo-site/devcon?p_p_id=56_INSTANCE_FOOTER%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26_56_INSTANCE_FOOTER_articleId=231929077%26_56_INSTANCE_FOOTER_articleId=231929077%26_56_INSTANCE_FOOTER_articleId=231929077%26_56_INSTANCE_FOOTER_articleId=231929077&yui_patched_v3_11_0_1_1526667650874_5935=1526667978197&p_p_col_id=column-1
	['web conte

490 / 756 - https://www.liferay.com/en_AU/web/events-symposium-north-america/home?p_p_col_id=column-1&_15_version=2.8&p_p_id=15&p_p_state=maximized&_15_struts_action=/journal/edit_article&p_p_col_count=3&_15_redirect=https://www.liferay.com:443/en_AU/web/events-symposium-north-america/home?p_p_id=56_INSTANCE_n9UACaVBnoqF%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26p_p_col_id=column-1%26p_p_col_count=3&p_p_mode=view&_15_articleId=231816004&_15_folderId=0&_15_groupId=231815926&p_p_lifecycle=0&p_p_auth=9VopNHWE
	['workshops', 'innovative solutions', 'experts', 'practical sessions', 'liferay symposium', 'few opportunities', 'world', 'business challenges', 'liferay symposium north america', 'register today', 'new orleans', 'fall', 'key engineers', "liferay's key developers", 'LA', 'industry leaders', 'best practices']

491 / 756 - https://www.liferay.com/en_AU/web/events-symposium-north-america
	['workshops', 'innovative solutions', 'experts', 'practical sessions', 'liferay sympo

523 / 756 - https://www.liferay.com/resource?title=emt
	['image', 'telecommunications company', 'online services']

524 / 756 - https://www.liferay.com/services/training/topics/system-admin
	['liferay training', 'production environment', 'system administrator course']

525 / 756 - https://www.liferay.com/en/group/events-demo-site/devcon?p_p_col_pos=10&_33_backURL=https://www.liferay.com:443/en/group/events-demo-site/devcon?p_p_id=33%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26p_p_col_id=column-1%26p_p_col_pos=10&p_p_mode=view&_33_struts_action=/blogs/edit_entry&p_p_id=33&p_p_state=normal&_33_redirect=https://www.liferay.com:443/en/group/events-demo-site/devcon?p_p_id=33%26p_p_lifecycle=0%26p_p_state=normal%26p_p_mode=view%26p_p_col_id=column-1%26p_p_col_pos=10&p_p_lifecycle=0&p_p_col_id=column-1
	['DEVCON']

526 / 756 - https://www.liferay.com/blog/en-us/customer-experience/six-customer-experience-trends-to-expect-in-2018
	['business', 'customer experience', 'liferay blogs']

548 / 756 - https://www.liferay.com/resources?_166_refererPortletName=183&_166_structureAvailableFields=_166_getAvailableFields&p_p_lifecycle=0&p_p_state=pop_up&_166_classNameId=10083&p_p_id=166&_166_redirect=https://www.liferay.com:443/resources?p_p_id=166%26p_p_lifecycle=0%26p_p_state=pop_up%26p_p_mode=view%26doAsGroupId=10182%26_166_refererPortletName=183%26_166_scopeTitle=Application%2BDisplay%2BTemplates%26_166_advancedSearch=false%26_166_tabs1=templates%26_166_keywords=resources%26_166_classNameId=10083%26_166_ddmTemplatesSearchContainerPrimaryKeys=195081155%252C193488165%252C1543144%252C849424%252C846218%252C198961%252C149699%252C18068%26_166_andOperator=true%26_166_formDate=1526676707864%26_166_cmd=%26_166_deleteTemplateIds=%26_166_description=%26_166_name=%26_166_struts_action=%252Fdynamic_data_mapping%252Fview_template%26_166_classPK=0%26_166_redirect=https%253A%252F%252Fwww.liferay.com%253A443%252Fresources%253Fp_p_id%253D166%2526p_p_lifecycle%253D0%2526p_p_state%253Dpop_up%

565 / 756 - https://customer.liferay.com/documentation/knowledge-base/-/kb/923703
	['knowledge base', 'portal']

566 / 756 - https://www.liferay.com/solutions?_15_struts_action=/journal/edit_article&p_p_mode=view&p_p_auth=8FLyz51t&p_p_id=15&p_p_state=pop_up&p_p_lifecycle=1
	['intranets', 'single open source platform', 'WCM', 'enterprise portals', 'mobile development platform']

567 / 756 - https://www.liferay.com/solutions?_15_defaultLanguageId=en_US&_15_structureId=166312677&p_p_state=pop_up&_15_struts_action=/journal/edit_article&_15_toLanguageId=it_IT&p_p_lifecycle=0&p_p_mode=view&p_p_auth=8FLyz51t&yui_patched_v3_11_0_1_1526679247068_7796=1526679457000&_15_groupId=10182&p_p_id=15&_15_articleId=231706365
	['intranets', 'single open source platform', 'WCM', 'enterprise portals', 'mobile development platform']

568 / 756 - https://www.liferay.com/solutions?_15_defaultLanguageId=en_US&_15_structureId=166312677&p_p_state=pop_up&_15_struts_action=/journal/edit_article&_15_toLanguageId=es_

588 / 756 - https://www.liferay.com/home?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords=7.1+alpha
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

589 / 756 - https://www.liferay.com/home?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords=7.1+beta
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

590 / 756 - https://customer.liferay.com/documentation/search?p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=GoGo&p_p_id=1_WAR_osbknowledgebaseportlet
	['portal', 'reference', 'felix gogo shell']

591 / 756 - https://www.liferay.com/en/resources/l?title=going-to-nbfc-100-tech-summit
	['nbfc-100-tech']

592 / 756 - https://customer.liferay.com/documentation/7.0/develop/tutorials/-/

637 / 756 - https://www.liferay.com/resources?_166_refererPortletName=183&_166_structureAvailableFields=_166_getAvailableFields&p_p_lifecycle=0&p_p_state=pop_up&_166_classNameId=10083&p_p_id=166&_166_redirect=https://www.liferay.com:443/resources?p_p_id=166%26p_p_lifecycle=0%26p_p_state=pop_up%26p_p_mode=view%26doAsGroupId=10182%26_166_refererPortletName=183%26_166_scopeTitle=Application%2BDisplay%2BTemplates%26_166_advancedSearch=false%26_166_tabs1=templates%26_166_keywords=resources%26_166_classNameId=10083%26_166_ddmTemplatesSearchContainerPrimaryKeys=195081155%252C193488165%252C1543144%252C849424%252C846218%252C198961%252C149699%252C18068%26_166_andOperator=true%26_166_formDate=1526687054788%26_166_cmd=%26_166_deleteTemplateIds=%26_166_description=%26_166_name=%26_166_struts_action=%252Fdynamic_data_mapping%252Fview_template%26_166_classPK=0%26_166_redirect=https%253A%252F%252Fwww.liferay.com%253A443%252Fresources%253Fp_p_id%253D166%2526p_p_lifecycle%253D0%2526p_p_state%253Dpop_up%

	['liferay digital experience platform', 'platform', 'order', 'liferay DXP', 'businesses', 'new technology', 'digital transformation', 'process', 'upgrade', 'customer experience initiatives', 'liferay portal', 'features companies', 'DXP', 'transformation', 'digital leaders', 'workplace', 'specific challenges', 'cloud technology', 'incoming changes', 'digital tools', 'business', 'innovation', 'opportunity']

677 / 756 - https://www.liferay.com/home?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords=forum
	['connected experiences', 'digital experience software', 'intranets', 'modern portals', 'websites']

678 / 756 - https://www.liferay.com/resource?title=council-of-europe
	["liferay's portal", 'europe', 'highly effective communication tool', 'council']

679 / 756 - https://www.liferay.com/services/certification/professional-developer/6.2
	[]

680 / 756 - https://www.liferay.com/en_AU/blog

716 / 756 - https://www.liferay.com/downloads?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords=bundled+with+tomcat+6.1
	['liferay downloads', 'download liferay DXP', 'liferay portal']

717 / 756 - https://www.liferay.com/downloads?p_p_mode=view&p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000
	['liferay downloads', 'download liferay DXP', 'liferay portal']

718 / 756 - https://www.liferay.com/downloads?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords=download+bundled+with+tomcat+6.1+ee
	['liferay downloads', 'download liferay DXP', 'liferay portal']

719 / 756 - https://www.liferay.com/downloads?p_p_lifecycle=0&p_p_state=maximized&p_p_id=1_WAR_googlesearchapplianceportlet_INSTANCE_0000&_1_WAR_googlesearchapplianceportlet_INSTANCE_0000_keywords

	['workshops', 'innovative solutions', 'experts', 'practical sessions', 'liferay symposium', 'few opportunities', 'world', 'business challenges', 'liferay symposium north america', 'register today', 'new orleans', 'fall', 'key engineers', "liferay's key developers", 'LA', 'industry leaders', 'best practices']

747 / 756 - https://www.liferay.com/careers?p=job/oN4Q6fwx
	['careers', 'liferay jobs']

748 / 756 - https://www.liferay.com/careers?p=job/oLSz5fw1
	['careers', 'liferay jobs']

749 / 756 - https://customer.liferay.com/documentation/search?p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=Upgrade&p_p_id=1_WAR_osbknowledgebaseportlet
	['portal']

750 / 756 - https://customer.liferay.com/documentation/search?p_p_mode=view&_1_WAR_osbknowledgebaseportlet_mvcPath=/search/view.jsp&p_p_id=1_WAR_osbknowledgebaseportlet&p_p_state=normal&_1_WAR_osbknowledgebaseportlet_keywords=Upgrade&p_p_lifecycle=0&_1_WAR_osbknowledgebaseportlet_assetCategoryIds=80889
	['portal']

751 / 756 - http

In [73]:
# Save URLs without any keywords
# This is meant to be a debugging output

print("Checking if directory (/output/) exists")
if not os.path.exists('./output/'):
    print("Creating directory /output/")
    os.makedirs('./output/')

os.write(1, "Saving URLs with no keywords".encode())
with open('./output/URLs with NO keywords.txt', 'w', encoding='utf-8') as w:
    for counter, url in enumerate(sorted(urls_without_keywords_list)):
        w.write("{}) {}\n".format(counter, url))

Checking if directory (/output/) exists


In [74]:
from collections import defaultdict, Counter       

def compute_score_with_df(user_visits_df, global_visits_counter, total_global_visits, start_date, debug=False):
    """
    Description: This will take the sites that a user has visited, and perform TF-IDF calculations
    to obtain an output score. Note that we also factor in global visits as well.
    calculate_inverse_document_frequency
    Input: 
    user_visits_df - This is the dataframe corresponding to an individual's activites
    global_visits_counter - This is a Counter for all user's activites
    
    Output:
    ranked_interest_df - Ranked interests. Format: Topic of Interest, Score, Corresponding URLs Visited
    user_visits_df - The user df, but added with keywords associated with the link
    """
    
    
    keyword_to_logscore = calculateIdf(user_visits_df, global_visits_counter, total_global_visits, debug=False)
    
    columns = ['Topic of Interest', 'Score', 'Corresponding URLs Visited']
    ranked_interest_df = pd.DataFrame(columns=columns)

    # Iterate through all URLs the user has visited
    for index, entry in user_visits_df.iterrows():
        
        url = entry['normalized_url']        
        aggregate_keyword_list = url_lookup_cache.get(url, [])
        
        # Exponential Decay Factor - Calculate multiplier
        event_date = entry['eventdate']
        multiplier = calculateDecayMultiplier(event_date, start_date)
        
        # Iterate through the individual keywords extracted from the URL
        for keyword in aggregate_keyword_list:
            
            if not keyword:
                print("ERROR, EMPTY KEYWORD DETECTED!")
                print("URL: {}".format(url))
                print("aggregate_keyword_list: {}".format(aggregate_keyword_list))

            existing_row = ranked_interest_df[ranked_interest_df['Topic of Interest'] == keyword]

            if existing_row.empty:
                row = ranked_interest_df.shape[0]               
                ranked_interest_df.loc[row] = [keyword, (keyword_to_logscore[keyword] * multiplier), np.NaN]
                ranked_interest_df['Corresponding URLs Visited'] = ranked_interest_df['Corresponding URLs Visited'].astype(object)
                ranked_interest_df.at[row, 'Corresponding URLs Visited'] = [url]
            else:
                                
                index = ranked_interest_df.index[ranked_interest_df['Topic of Interest'] == keyword]
                column = ranked_interest_df.columns.get_loc('Score')
                updated_score = ranked_interest_df.iloc[index, column].values[0] + (keyword_to_logscore[keyword] * multiplier)
                ranked_interest_df.iloc[index, column] = updated_score
                
                column = ranked_interest_df.columns.get_loc('Corresponding URLs Visited')
                updated_urls = ranked_interest_df.iat[index.values[0], column]
                updated_urls.append(url)                
                ranked_interest_df.iat[index.values[0], column] = updated_urls

    # Sort by logscore before returning
    ranked_interest_df['Score'] = pd.to_numeric(ranked_interest_df['Score'])
    ranked_interest_df.sort_values(by=['Score'], ascending=False, inplace=True)
    
    #
    user_visits_df = pd.merge(user_visits_df, url_to_keyword_df, how='left', on='normalized_url', copy=True)
    user_visits_df = user_visits_df.drop(['analyticsclient.generated_keywords', \
                                          'manual.keywords', \
                                          'analyticsclient.merged_title', \
                                          'analyticsclient.merged_description', \
                                          'analyticsclient.merged_keywords'], axis=1)
        
    return ranked_interest_df, user_visits_df  

# TODO: Future optimiziation, only count the user visited keywords
def calculateIdf(user_visits_df, global_keyword_counter, global_visit_count, 
                 user_weight=1.0, global_weight=2.0, 
                 debug=False, save_results=False):
    
    weighted_document_count = (user_visits_df.shape[0] * user_weight) + (global_visit_count * global_weight)
    user_keyword_counter = generateCounterFromDf(user_visits_df)
    
    idf_lookup_dict = {}
    
    # Multiply by weights
    # Note: We can "get away" with doing only the global weights for the user, 
    # since we're only doing TF-IDF calculations for that user.
    for key in user_keyword_counter.keys():
        # user_keyword_counter[key] = user_keyword_counter[key] * user_weight
        # global_keyword_counter[key] = global_keyword_counter[key] * global_weight        
        idf_lookup_dict[key] = math.log(weighted_document_count / ((user_keyword_counter[key] * user_weight)
                                                                   + (global_keyword_counter[key] * global_weight)))
        if debug or (idf_lookup_dict[key] < 0):
            print('{} : {} [{} / ({} + {})]'.format(key, idf_lookup_dict[key], 
                                                    weighted_document_count, 
                                                    user_keyword_counter[key], global_keyword_counter[key]))
            with pd.option_context('display.max_rows', 200, 'display.max_columns', None, 'display.max_colwidth', 50):
                display(user_visits_df)
                
            print(user_keyword_counter)
            return -1
    
    
    if save_results:
        if not os.path.exists('./debug/'):
            os.makedirs('./debug/')
        
        with open('./debug/IDF Dump.txt', 'w', encoding='utf-8') as w:
            idf_df = pd.DataFrame.from_dict(idf_lookup_dict, orient='index') #, columns=['Keyword', 'IDF Value']
            idf_df = idf_df.sort_values(0)
    
    return idf_lookup_dict

In [75]:
def calculateTopicsOfInterestOnDfOfUsers(filter_grouped_user_df, global_keyword_counter, global_visit_count, start_date, debug=False):
    """
    Inputs:
        filter_grouped_user_df - This contains all the users who we're trying to calculate the topics of interest for.
                                 This should be pre-filtered by your own specified date range.
        global_keyword_counter - This contains the "keyword to counts" for the entire population, within the 30-day window
    
    Outputs:
        user_to_topics_of_interest_df - This is the list of (userid, analyticskey) to (Topics of Interest, scores)
        keyword_to_url_df - This is the user input with keyword list attached to it
    
    """
    
    counter = 1
    user_to_results = dict()
    columns = ['User ID', 'Analytics Key', 'Topic of Interest', 'Score', 'Corresponding URLs Visited']
    user_to_topics_of_interest_df = pd.DataFrame(columns=columns)
    all_keywords_to_url_df = pd.DataFrame()

    for userid_and_analytics_key_tuple, group in filter_grouped_user_df.groupby(['userid', 'analyticskey']):

        user_id = userid_and_analytics_key_tuple[0]
        analytics_key = userid_and_analytics_key_tuple[1]
        
        if debug: 
            print("\n{}) User ID: {} Analytics Key: {}".format(counter, user_id, analytics_key)) 
        
        score_df, user_with_keyword_df = compute_score_with_df(group, global_keyword_counter, global_visit_count, start_date)
        compute_score_with_df
        score_df['User ID'] = user_id
        score_df['Analytics Key'] = analytics_key
        score_df = score_df[columns]
        user_to_topics_of_interest_df = user_to_topics_of_interest_df.append(score_df, ignore_index=True)

        if debug:
            display(user_with_keyword_df)
        
        all_keywords_to_url_df = all_keywords_to_url_df.append(user_with_keyword_df, ignore_index=True)
                
        if counter % 500 == 0:
            print('{} / {}'.format(counter, len(filter_grouped_user_df['userid'].unique())))

        counter += 1
    
    return user_to_topics_of_interest_df, all_keywords_to_url_df

In [76]:
from datetime import timedelta, datetime

def extractDateRange(df, start_date, date_range='day', debug=False):
    """
    Description:
    This takes in a dataframe, and extracts the rows where the eventdate field is within the date range specified.
    Note that the start_date is inclusive, so if you ask for start_date = Jan 1, and range='day', you get all the 
    data from only Jan 1.
    """
        
    end_date = start_date + DATE_RANGE_OPTIONS.get(date_range, date_range)
    
    if debug:
        print("Start Date: {}".format(start_date))
        print("Date Range: {}".format(date_range))
        print("End Date:   {}".format(end_date))
    
    df = df[(df['eventdate'] > start_date) & (df['eventdate'] < end_date)].sort_values(by='eventdate', ascending=True)
    
    if debug:
        print("Earliest Reported Date: {}".format(df.iloc[0]['eventdate']))
        print("Latest Reported Date:   {}".format(df.iloc[-1]['eventdate']))
    
    return df

# Testing code for function above:
#start_date = datetime(2018, 3, 14)
#end_date = datetime(2018, 4, 1)
#date_range = timedelta(30)

#temporary_df = extractDateRange(clean_df, start_date=start_date, date_range='week', debug=True)

#display(temporary_df)


def calculateDecayMultiplier(event_date, start_date, debug=False):
    day_difference = (start_date - event_date).days
    multiplier = DECAY_MULTIPLIER_BASE ** day_difference
    
    if debug:
        print("Start Date:   {}".format(start_date))
        print("Current Date: {}".format(event_date))
        print("Difference:   {}".format(day_difference))
        print("Multiplier:   {}".format(multiplier))
    
    return multiplier

In [77]:
def generateCounterFromDf(df):
    """
    This function takes in the URL history from 'normalized_url'
    and generates the counts of each keyword as a Counter-object
    """

    list_of_list_of_urls = df['normalized_url'].tolist()
    keyword_list = [url_lookup_cache.get(entry, []) for entry in list_of_list_of_urls]
    flat_list = [entry for sublist in keyword_list for entry in sublist]
    keyword_counter = Counter(flat_list)
    
    return keyword_counter


def calculateInfoForAllIndividualUsers(user_df, global_df, start_date, end_date, time_period='day', debug=False):
    """
    This function will iterate through all the users from user_df, and return all the individual's scores
    """
    
    current_date = start_date
    all_users_to_topic_of_interest_df = pd.DataFrame()
    
    # TODO: PERFORMANCE BOOST IDEA: Limit the date range for the user_df and global_df
    
    
    
    while current_date < end_date:
        print("current_date: {}".format(current_date))
        current_execution_time = datetime.now()
        
        # We want to look 30-days back for calcuations
        date_range_filtered_user_df = extractDateRange(user_df, 
                                                       start_date=(current_date - INTEREST_CALCULATION_WINDOW_TIMEDELTA), 
                                                       date_range=(INTEREST_CALCULATION_WINDOW_TIMEDELTA + timedelta(1)), 
                                                       debug=False)
        date_range_filtered_global_df = extractDateRange(global_df, 
                                                         start_date=(current_date - INTEREST_CALCULATION_WINDOW_TIMEDELTA), 
                                                         date_range=(INTEREST_CALCULATION_WINDOW_TIMEDELTA + timedelta(1)), 
                                                         debug=False)
               
        # Obtain number of visits by all users
        global_visit_counter = date_range_filtered_global_df.shape[0]
        
        # Generate counts for global_df
        global_keyword_counter = generateCounterFromDf(date_range_filtered_global_df)
        
        # do Interest calculations for individuals
        user_to_topics_of_interest_df, user_keyword_subset_df = calculateTopicsOfInterestOnDfOfUsers(date_range_filtered_user_df, 
                                                                                                     global_keyword_counter,
                                                                                                     global_visit_counter,
                                                                                                     (current_date + timedelta(1)))        
        user_to_topics_of_interest_df['currdate'] = current_date

        # append to larger list
        all_users_to_topic_of_interest_df = all_users_to_topic_of_interest_df.append(user_to_topics_of_interest_df, ignore_index=True)

        current_date += timedelta(1)
        
        total_execution_time_for_loop = datetime.now() - current_execution_time
        print("Time Elapsed: {}".format(total_execution_time_for_loop))
        
    return all_users_to_topic_of_interest_df

def calculateInfoForAllIndividualUsersSaveToJSON(user_to_toi_and_score, save_directory, debug=False):
    """
    
    
    """
    partition_key = datetime.today().strftime('%Y%m%d0000')
    
    # Create Output Directory if it doesn't already exist
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)
    
    # Gameplan:
    # - Go through date/userid/analyticskey
    # - Go through each keyword & score
    # - Find all URLs & Counts that correspond to the keyword
    # - Save info as a JSON entry
    
    for curr_date, row in user_to_toi_and_score.groupby(['currdate']):

        full_directory_and_file_name = os.path.join(save_directory, curr_date.strftime('%Y%m%d') + '.json')
        output_file = open(full_directory_and_file_name, 'w', encoding='utf-8')
        
        partition_key = datetime.today().strftime('%Y%m%d0000')
        curr_date_string = curr_date.strftime("%Y-%m-%d")

        counter = 1
        current_execution_time = datetime.now()
        print("currdate: {}".format(curr_date_string))
        
        for userid_and_analytics_key, row2 in row.groupby(['User ID', 'Analytics Key']):
            user_id = userid_and_analytics_key[0]
            analytics_key = userid_and_analytics_key[1]
            
            if debug:
                print("User ID: {}".format(user_id))
                print("Analytics Key: {}".format(analytics_key))
                
            row2 = row2.sort_values(by=['Score'], ascending=False)
            
            user_to_keyword_info_list = []
                
            for toi_score, row3 in row2.groupby(['Topic of Interest', 'Score']):
                topic_of_interest = toi_score[0]
                score = toi_score[1]

                if debug:
                    print("\t{}: {}".format(topic_of_interest, score))
                
                # Generates [URL, View Count]
                url_to_view_count_df = row3['Corresponding URLs Visited'].apply(lambda x: pd.Series(x).value_counts()).T.reset_index()
                url_to_view_count_df.rename(columns={url_to_view_count_df.columns[0] : 'url', url_to_view_count_df.columns[1] : 'visitCount'}, inplace=True)
                
                if debug:
                    display(url_to_view_count_df)
                
                url_to_visit_count_list = []
                
                for index, url_visit_count in url_to_view_count_df.iterrows():
                    url = url_visit_count['url']
                    visit_count = url_visit_count['visitCount']
                    
                    if debug:
                        print("URL: {}".format(url))
                        print("visitCount: {}".format(visit_count))
                        
                    url_to_visit_count_list.append(
                        OrderedDict([('url', url), 
                                     ('visitCount', visit_count)]))


                user_to_keyword_info_list.append(
                    OrderedDict([('name', topic_of_interest),
                                 ('score', score), 
                                 ('pagesVisited', url_to_visit_count_list)]))

            json_text = json.dumps(
                OrderedDict([('analyticsKey', analytics_key), 
                            ('partitionKey', partition_key),
                            ('userid', user_id),
                            ('Current Date', curr_date_string),
                            ('interests', user_to_keyword_info_list)]))

            output_file.write("{}\n".format(json_text))
            
            counter += 1
            
            if counter % 1000 == 0:
                print("{} / {}".format(counter, len(row2)))
                
        total_execution_time_for_loop = datetime.now() - current_execution_time
        print("Time Elapsed: {}".format(total_execution_time_for_loop))
            
        output_file.close()

In [78]:
def calculateInfoForAllSegmentsSaveToJSON(segment_to_toi_and_score_df, user_to_toi_and_score, score_threshold, save_directory):
    
    partition_key = datetime.today().strftime('%Y%m%d0000')
    
    # Create Output Directory if it doesn't already exist
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)
    
    for curr_date, row in segment_to_toi_and_score_df.groupby(['currdate']):
        curr_date_string = curr_date.strftime("%Y-%m-%d")
        print("currdate: {}".format(curr_date_string))
        
        full_directory_and_file_name = os.path.join(save_directory, curr_date.strftime('%Y%m%d') + '.json')
        output_file = open(full_directory_and_file_name, 'w', encoding='utf-8')
        
        user_to_toi_and_score_filtered_by_date_df = user_to_toi_and_score[user_to_toi_and_score['currdate'] == curr_date]
        
        for segment_id, row2 in row.groupby(['segmentIdentifier']):
            print("\tsegmentIdentifier: {}".format(segment_id))
            user_to_toi_and_score_filtered_by_date_and_segment_id_df = getSegmentEntriesDf(user_to_toi_and_score_filtered_by_date_df, segment_id)
            #display(user_to_toi_and_score_filtered_by_date_and_segment_id_df)

            # Create veritcal list of [topic of interest, score]
            keyword_to_score_df = row2.drop(labels=['currdate', 'segmentIdentifier'], axis=1, inplace=False).T.reset_index().copy()
            keyword_to_score_df.columns.values[0] = 'Topic of Interest'
            keyword_to_score_df.columns.values[1] = 'Score'

            #with pd.option_context('display.max_rows', 1000, 'display.max_columns', None, 'display.max_colwidth', 2000):
            #    display(keyword_to_score_df)
                
            keyword_to_url_json_string_list = []

            # Iterate through the current date + segment users, to figure out corresponding URLs
            for index, row3 in keyword_to_score_df.iterrows():
                topic_of_interest = row3['Topic of Interest']
                score = row3['Score']

                

                # Skip NaN values
                if math.isnan(score):
                    #print("Skipping...")
                    continue
                    
                print("\t\t{} : {}".format(topic_of_interest, score))

                # Find corresponding users whose individual scores exceed the threshold
                # Need URL, uniqueVisitsCount
                url_to_counts = getUrlAndUniqueVisitsCount(user_to_toi_and_score_filtered_by_date_and_segment_id_df, topic_of_interest, score_threshold)
                
                # If it's an emtpy list
                if url_to_counts.empty:
                    continue
                
                
                #display(url_to_counts)
                url_to_view_count_list = []
    
                for url, view_count in url_to_counts.items():
                    url_to_view_count_entry = OrderedDict([('url', url),
                                                           ('visitCount', view_count)])
                    url_to_view_count_list.append(url_to_view_count_entry)
                    
                keyword_entry = OrderedDict([('name', topic_of_interest), 
                                             ('score', score), 
                                             ('pagesVisited', url_to_view_count_list)])
                keyword_to_url_json_string_list.append(keyword_entry)

            json_text = json.dumps(OrderedDict([('partitionKey', partition_key),
                                                ('segmentIdentifier', segment_id),
                                                ('currdate', curr_date_string), 
                                                ('interests', keyword_to_url_json_string_list)]))
            output_file.write("{}\n".format(json_text))
            
        output_file.close()
    
def getSegmentEntriesDf(df, segmentIdentifier):
    """
    This will only return rows that match the segmentIdentifier
    """
    
    only_segment_id_entries_df = pd.merge(segment_lookup_df, df, how='inner', left_on='datasourceindividualpk', right_on='User ID', sort=True)#.drop('value', 1)
    #display(only_segment_id_entries_df)
    
    return only_segment_id_entries_df
    
    
def getUrlAndUniqueVisitsCount(df, topic_of_interest, minimum_score_threshold, debug=False):
    
    url_to_unique_visits = OrderedDict()
    
    toi_df = df[(df['Topic of Interest'] == topic_of_interest) 
                & (df['Score'] >= minimum_score_threshold)]
    
    if toi_df.empty:
        return toi_df
    
    expanded_url_list = toi_df.set_index(['User ID'])['Corresponding URLs Visited'].apply(pd.Series).stack()
    expanded_url_list = pd.DataFrame(expanded_url_list).reset_index().drop(labels=['level_1'], axis=1, inplace=False)
    expanded_url_list.rename(columns={0 : 'Corresponding URLs Visited'}, inplace=True)
    # We are only getting unique: (userid, url) pairs
    no_duplicates_df = expanded_url_list.drop_duplicates(subset=['User ID', 'Corresponding URLs Visited'])
    count_url_visits = no_duplicates_df['Corresponding URLs Visited'].value_counts()
    
    if debug:
        display(toi_df)
        display(expanded_url_list)
        display(no_duplicates_df)
        display(count_url_visits)

    for url_count_tuple in count_url_visits.iteritems():
        url = url_count_tuple[0]
        count = url_count_tuple[1]
        url_to_unique_visits[url] = count
    
    return count_url_visits


In [79]:
def calculateSegmentWithDf(user_to_topic_of_interest_df, MINIMUM_SCORE_THRESHOLD):    
    user_to_toi_filtered_by_minimum_score = user_to_topic_of_interest_df[user_to_topic_of_interest_df['Score'] > MINIMUM_SCORE_THRESHOLD]
    keyword_to_count = user_to_topic_of_interest_df.groupby('Topic of Interest').count()
    keyword_to_count['Logscore'] = keyword_to_count['User ID'].apply(lambda x: math.log1p(x))
    keyword_to_count = keyword_to_count[['Logscore']]

    return keyword_to_count  

def calculateSegmentInfoFromIndividualDf(segment_name, user_to_toi_df, score_threshold, debug=False):
    """
    This will calculate the interest scores, and
    """
    
    user_to_toi_with_date_df = pd.DataFrame()
    
    # Filter by date
    for index, row in user_to_toi_df.groupby('currdate'):
        
        if debug:
            print("currdate: {}".format(index))
            display(row)
            
        segment_to_topic_of_interest_df = calculateSegmentWithDf(row, score_threshold)
        segment_to_topic_of_interest_transposed_df = segment_to_topic_of_interest_df.T
        segment_to_topic_of_interest_transposed_df['currdate'] = index
        user_to_toi_with_date_df = user_to_toi_with_date_df.append(segment_to_topic_of_interest_transposed_df, ignore_index=True)
        
        if debug:
            display(user_to_toi_with_date_df)

    # Move currdate column to front
    currdate_column = user_to_toi_with_date_df['currdate']
    user_to_toi_with_date_df.drop('currdate', axis=1, inplace=True)
    user_to_toi_with_date_df.insert(0, 'currdate', currdate_column)
    
    # Add Segment Name column
    user_to_toi_with_date_df.insert(1, 'segmentIdentifier', segment_name)
    
    return user_to_toi_with_date_df

def calculateAllSegmentInfo(user_to_toi_df, debug=False):
    """
    This function will return a DataFrame of all segments Topic of Interests and Scores
    """
    
    all_segment_info_df = pd.DataFrame()
    
    # Gameplan:
    # - Iterate through the list of segments
    #   * Filter user_to_toi_df so we only get the users from that segment
    # - Calculate segment toi & scores for that segment
    for segmentName, row in segment_lookup_df.groupby('segmentName'):
        display("segmentName: {}".format(segmentName))
        filtered_user_df = pd.merge(row, user_to_toi_df, how='inner', left_on='datasourceindividualpk', right_on='User ID')
        print("\tMembers: {}".format(filtered_user_df['User ID'].unique()))
        if debug:
            display(filtered_user_df)
            
        if filtered_user_df.shape[0] == 0:
            print("[WARNING] - Segment has 0 users! Skipping...")
            continue

        segment_toi_to_score_df = calculateSegmentInfoFromIndividualDf(segmentName, filtered_user_df, MINIMUM_TOPIC_OF_INTEREST_THRESHOLD_SCORE)
        all_segment_info_df = all_segment_info_df.append(segment_toi_to_score_df, ignore_index=True)
    
    
    
    # Move currdate & segmentIdentifier to front
    currdate_column = all_segment_info_df['currdate']
    segment_id_column = all_segment_info_df['segmentIdentifier']
    all_segment_info_df.drop('currdate', axis=1, inplace=True)
    all_segment_info_df.drop('segmentIdentifier', axis=1, inplace=True)
    all_segment_info_df.insert(0, 'currdate', currdate_column)
    all_segment_info_df.insert(1, 'segmentIdentifier', segment_id_column)
    
    return all_segment_info_df

In [80]:
def printUsefulUserDfInformation(df):
    print("Rows:         {}".format(df.shape[0]))
    print("Unique Users: {}".format(len(df.groupby('userid'))))
    print("Unique URLs:  {}".format(len(df.groupby('normalized_url'))))

## Pipeline with Output saved as JSON files

Steps:
* Filter out the group of users you want as a dataframe
* Pass in date range for calculations
* Write output files
 * Individual -> Topic of Interest (individual topics of interest.json)
 * Entire Segment -> Topic of Interest (segment topics of interest.json)
 * Segment URLs Contribution -> Topic of Interest (daily URL contribution to topics of interest.json)

In [81]:
%%time

MINIMUM_VIEWING_TIME_CONSIDERED = 10000

start_date = START_DATE_DATETIME
end_date = END_DATE_DATETIME
#start_date = datetime(2018, 4, 1)
#end_date = datetime(2018, 5, 1)

printUsefulUserDfInformation(clean_df)

print("\nFiltering out Ignore URLs")
global_df = clean_df[(~clean_df['Ignore URL'])]
printUsefulUserDfInformation(global_df)

print("\nFiltering out views not exceeding {}ms".format(MINIMUM_VIEWING_TIME_CONSIDERED))
global_df = global_df[global_df['eventproperties.viewDuration'] >= MINIMUM_VIEWING_TIME_CONSIDERED]
printUsefulUserDfInformation(global_df)

print("\nFiltering out views not within time range")
print((start_date - INTEREST_CALCULATION_WINDOW_TIMEDELTA))
print(end_date + timedelta(days=1))
global_df = global_df[(global_df['eventdate'] >= (start_date - INTEREST_CALCULATION_WINDOW_TIMEDELTA))
                  & (global_df['eventdate'] <= (end_date + timedelta(days=1)))]
printUsefulUserDfInformation(global_df)

global_df = global_df.sort_values(by='eventdate')

Rows:         5271
Unique Users: 1362
Unique URLs:  869

Filtering out Ignore URLs
Rows:         3970
Unique Users: 995
Unique URLs:  707

Filtering out views not exceeding 10000ms
Rows:         3413
Unique Users: 936
Unique URLs:  591

Filtering out views not within time range
2018-04-18 00:00:00
2018-05-21 07:44:40.317892
Rows:         3341
Unique Users: 911
Unique URLs:  587
Wall time: 409 ms


In [82]:
# Filter 
printUsefulUserDfInformation(global_df)
user_df = global_df.copy()

Rows:         3341
Unique Users: 911
Unique URLs:  587


In [83]:
%%time

print("Filtering out users who did not have at least 5 unique views")
user_df = user_df.groupby('userid').filter(lambda x: len(x) > 5)
printUsefulUserDfInformation(user_df)

Filtering out users who did not have at least 5 unique views
Rows:         1768
Unique Users: 84
Unique URLs:  328
Wall time: 587 ms


In [84]:
with pd.option_context('display.max_rows', 5, 'display.max_columns', None, 'display.max_colwidth', 50):
    display(user_df)

Unnamed: 0,eventdate,analyticskey,userid,eventid,Ignore URL,normalized_url,context.url,context.og:url,context.title,context.og:title,context.description,context.og:description,context.keywords,context.contentLanguageId,eventproperties.scrollDepth,eventproperties.viewDuration,context.userAgent,context.platformName,context.browserName,context.country,context.region,context.city,clientip
5967,2018-04-29 21:10:12.132,www.liferay.com,AWMTMaO8_nPDtHhuj1tL,unload,False,https://www.liferay.com/downloads,https://www.liferay.com/downloads,https://www.liferay.com/downloads,Liferay Downloads,Liferay Downloads,"Download Liferay DXP, Liferay Portal, and othe...","Download Liferay DXP, Liferay Portal, and othe...",,en-US,,701447,Mozilla/5.0 (Windows NT 6.3; Win64; x64) Apple...,Windows,Chrome,Cameroon,Sud,Ambam,41.202.207.4
5968,2018-04-29 21:10:12.132,www.liferay.com,AWMTMaO8_nPDtHhuj1tL,unload,False,https://www.liferay.com/downloads,https://www.liferay.com/downloads,https://www.liferay.com/downloads,Liferay Downloads,Liferay Downloads,"Download Liferay DXP, Liferay Portal, and othe...","Download Liferay DXP, Liferay Portal, and othe...",,en-US,,701447,Mozilla/5.0 (Windows NT 6.3; Win64; x64) Apple...,Windows,Chrome,Cameroon,Sud,Ambam,41.202.207.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7750,2018-05-20 23:05:30.572,www.liferay.com,AWNwlswGaEjFe68kAsd_,unload,False,https://www.liferay.com/en_AU/subscription-ser...,https://www.liferay.com/en_AU/subscription-ser...,https://www.liferay.com/en_AU/subscription-ser...,Liferay Enterprise Subscription and Profession...,Liferay Enterprise Subscription and Profession...,Our subscription includes access to Liferay Di...,Our subscription includes access to Liferay Di...,,en-AU,,56983,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,Windows,Unknown,Australia,New South Wales,Epping,101.2.169.36
7754,2018-05-20 23:06:18.720,www.liferay.com,AWNwlswGaEjFe68kAsd_,unload,False,https://www.liferay.com/en_AU/downloads/ee-lic...,https://www.liferay.com/en_AU/downloads/ee-lic...,https://www.liferay.com/en_AU/downloads/ee-lic...,Liferay Portal Enterprise Edition (EE) License,Liferay Portal Enterprise Edition (EE) License,Learn more about the commercial license for Li...,Learn more about the commercial license for Li...,,en-AU,,51279,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,Windows,Unknown,Australia,New South Wales,Epping,101.2.169.36


In [85]:
%%time
print("Calculating all Individual User's Info")
os.write(1, "\nCalculating all Individual User's Info".encode())
user_to_toi_and_score = calculateInfoForAllIndividualUsers(user_df, global_df, (start_date + timedelta(30)), (start_date + timedelta(37)), 'day', False)

Calculating all Individual User's Info
current_date: 2018-06-17 00:00:00
Time Elapsed: 0:01:08.556327
current_date: 2018-06-18 00:00:00
Time Elapsed: 0:00:17.075878
current_date: 2018-06-19 00:00:00
Time Elapsed: 0:00:08.589478
current_date: 2018-06-20 00:00:00
Time Elapsed: 0:00:00.031522
current_date: 2018-06-21 00:00:00
Time Elapsed: 0:00:00.026014
current_date: 2018-06-22 00:00:00
Time Elapsed: 0:00:00.032023
current_date: 2018-06-23 00:00:00
Time Elapsed: 0:00:00.026517
Wall time: 1min 34s


In [86]:
%%time
print("Saving Individual Info to JSON file")
os.write(1, "\nSaving Individual Info to JSON file".encode())
calculateInfoForAllIndividualUsersSaveToJSON(user_to_toi_and_score, INDIVIDUAL_OUTPUT_DIRECTORY, debug=False)

Saving Individual Info to JSON file
currdate: 2018-06-17
Time Elapsed: 0:00:06.812181
currdate: 2018-06-18
Time Elapsed: 0:00:02.672429
currdate: 2018-06-19
Time Elapsed: 0:00:01.326503
Wall time: 10.9 s


In [87]:
%%time
print("Calculating all Segment Info")
os.write(1, "\nCalculating all Segment Info".encode())
all_segment_info_df = calculateAllSegmentInfo(user_to_toi_and_score, debug=False)

Calculating all Segment Info


'segmentName: Contact Sales Form'

	Members: []


'segmentName: DACH'

	Members: ['AWIJ9CHnIxLIzylnnAN5']


'segmentName: Developers'

	Members: []


'segmentName: Gartner MQ Form'

	Members: ['AWIJ9CHnIxLIzylnnAN5']


'segmentName: Gartner MQ Form 2'

	Members: []


'segmentName: Jul'

	Members: []


'segmentName: Julio testing'

	Members: []


'segmentName: Known Individuals'

	Members: []


'segmentName: LRDCOM UAT'

	Members: []


'segmentName: Liferay Developers'

	Members: []


'segmentName: Rik'

	Members: ['AWKrpL_MPBlhtEFni2A' 'AWIJ9CHnIxLIzylnnAN5']


'segmentName: Spaniards'

	Members: []


'segmentName: Technology'

	Members: []


'segmentName: US Customers'

	Members: []


'segmentName: Wealthy'

	Members: []


'segmentName: country-test2'

	Members: ['AWKrpL_MPBlhtEFni2A']


'segmentName: test-country-10'

	Members: ['AWKrpL_MPBlhtEFni2A']


'segmentName: test-country-2'

	Members: ['AWIJ9CHnIxLIzylnnAN5']


'segmentName: test-country-3'

	Members: ['AWIJ9CHnIxLIzylnnAN5']


'segmentName: test-country-4'

	Members: ['AWIJ9CHnIxLIzylnnAN5']


'segmentName: test-country-5'

	Members: ['AWKrpL_MPBlhtEFni2A' 'AWIJ9CHnIxLIzylnnAN5']


'segmentName: test-country-6'

	Members: ['AWIJ9CHnIxLIzylnnAN5']


'segmentName: test-country-7'

	Members: ['AWKrpL_MPBlhtEFni2A']


'segmentName: test-country-8'

	Members: ['AWKrpL_MPBlhtEFni2A']


'segmentName: test-country-9'

	Members: ['AWIJ9CHnIxLIzylnnAN5']
Wall time: 561 ms


In [88]:
%%time
print("Saving Segment Info to JSON file")
os.write(1, "\nSaving Segment Info to JSON file".encode())
calculateInfoForAllSegmentsSaveToJSON(all_segment_info_df, 
                                      user_to_toi_and_score, 
                                      0, #MINIMUM_TOPIC_OF_INTEREST_THRESHOLD_SCORE,
                                      SEGMENT_OUTPUT_DIRECTORY)

Saving Segment Info to JSON file
currdate: 2018-06-17
	segmentIdentifier: DACH
		home : 0.6931471805599453
	segmentIdentifier: Gartner MQ Form
		home : 0.6931471805599453
	segmentIdentifier: Rik
		IT : 0.6931471805599453
		IT and business leaders : 0.6931471805599453
		business : 0.6931471805599453
		case studies : 0.6931471805599453
		ebooks : 0.6931471805599453
		free software upgrades : 0.6931471805599453
		home : 0.6931471805599453
		information : 0.6931471805599453
		leadership resources : 0.6931471805599453
		liferay digital experience platform : 0.6931471805599453
		liferay enterprise subscription : 0.6931471805599453
		liferay's resource library : 0.6931471805599453
		medimpact : 0.6931471805599453
		members : 0.6931471805599453
		professional support : 0.6931471805599453
		subscription : 0.6931471805599453
		subscription-only plug-ins : 0.6931471805599453
		whitepapers : 0.6931471805599453
	segmentIdentifier: country-test2
		IT : 0.6931471805599453
		IT and business leaders : 

		liferay enterprise subscription : 0.6931471805599453
		liferay's resource library : 0.6931471805599453
		medimpact : 0.6931471805599453
		members : 0.6931471805599453
		professional support : 0.6931471805599453
		subscription : 0.6931471805599453
		subscription-only plug-ins : 0.6931471805599453
		whitepapers : 0.6931471805599453
	segmentIdentifier: test-country-8
		IT : 0.6931471805599453
		IT and business leaders : 0.6931471805599453
		business : 0.6931471805599453
		case studies : 0.6931471805599453
		ebooks : 0.6931471805599453
		free software upgrades : 0.6931471805599453
		information : 0.6931471805599453
		leadership resources : 0.6931471805599453
		liferay digital experience platform : 0.6931471805599453
		liferay enterprise subscription : 0.6931471805599453
		liferay's resource library : 0.6931471805599453
		medimpact : 0.6931471805599453
		members : 0.6931471805599453
		professional support : 0.6931471805599453
		subscription : 0.6931471805599453
		subscription-only plug-in

In [89]:
os.write(1, "\n\nEverything is finished!".encode())

23