In [None]:
import re
import json
import langid
import requests
import pandas as pd


from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from morfeusz2 import Morfeusz
from advertools import url_to_df 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Silence urllib3 warnings (the requests further down are sent with verify=False).
# Failed URL requests are not taken into account either way.
requests.packages.urllib3.disable_warnings()

# Input: exported browsing history, read when the data is imported.
BROWSING_HISTORY_JSON = 'data/BrowserHistory.json'
# Output: per-domain wordbags, written at the very end of the notebook.
WORDBAGS_JSON = 'data/BrowsingHistoryWordbags.json'
# Path of the chromedriver binary handed to selenium's webdriver.Chrome.
CHROMEDRIVER_EXEC = r'..\LocalPathVariables\Chromedriver_bin\chromedriver.exe'

## Importing the browsing history

We'll import the data spanning from 19.10.2021 until 19.03.2022, with the latter marking the beginning of this project.

After reading the data into a dataframe we limit it to the chosen timespan and to accessible URLs (beginning with "http" / "https"). We also keep only the columns of interest, namely title, url and time_usec.

We then reset the indexing to match the current record count. Zero will be the first index.

In [None]:
# Read the exported history; the context manager guarantees the file handle
# is closed even if parsing fails.
with open(BROWSING_HISTORY_JSON, encoding='utf-8') as history_file:
    data = json.load(history_file)
df = pd.json_normalize(data, record_path=['Browser History'])

# 1634601600000000 microseconds since the Unix epoch is 2021-10-19 00:00:00 UTC,
# the start of the analysed timespan. Rows older than that are discarded.
too_old = df['time_usec'] < 1634601600000000
# Keep only addresses that *begin* with http:// or https://. A substring test
# would also let through non-web URLs that merely embed a web address.
is_web_url = df['url'].str.match(r'https?://', na=False)

# Keep the columns of interest and renumber the rows from zero.
df = df.loc[~too_old & is_web_url, ['title', 'url', 'time_usec']].reset_index(drop=True)

## Partitioning of URLs

As we want to separate the domain from the whole address, we use the advertools.url_to_df() function, which creates a dataframe in which each row holds the individual sections of one URL. We'll only use the netloc field, as the domain is what we're after.

In [None]:
# Split every URL into its components and keep only the domain (netloc) column.
df_add = url_to_df(df.url).loc[:, ['netloc']]
# Attach the domain to each history record; rows are matched on the index.
df = df.join(df_add, how='left')

## Volume reduction

In this section we reduce the overall volume of our dataset for the purpose of noise cutting. We're implementing the solutions derived in the README file.

### Limiting repetitions

We're cutting short the lurking periods, thus getting closer to representing the actual "entrances". I also decided to treat all Google search queries as noise. I admit it may be bold of me to think that a great deal of potential information isn't lost this way, but I justify it by saying that even if we treated each query as a separate entrance (instead of bucketing them under the google.com domain, which wouldn't be informative due to the large variety of searched topics), each included page might greatly broaden the word space we'd have to choose features from, making the similarity matrix even more sparse.

Later we drop marked rows along with the temporary "del" column.

It's also worth creating a table with the browsing history share per domain before removing repetitions, for the sake of later analysis.


In [None]:
# Number of history records per domain, captured before any repetitions are
# removed so that later analysis can refer to the unreduced counts.
df_amounts = df['netloc'].value_counts()

# A record is an "entrance" when the following record belongs to a different
# domain, i.e. it closes a run of consecutive visits to the same domain.
# shift(-1) yields NaN for the final record, so that record always closes its run.
closes_run = df['netloc'] != df['netloc'].shift(-1)
# Google searches are treated as noise wherever they occur, including the very
# last record (which an index-based loop over i / i+1 cannot test).
is_google = df['netloc'] == 'www.google.com'

# Temporary marker column: True for the rows that survive the reduction.
df['del'] = closes_run & ~is_google
df = df[df['del']][['time_usec', 'title', 'url', 'netloc']]

### Reduction of outliers

I decided to drop all entries for sites that I've visited no more than 23 times during the last 6 months. It'd be hard to speak of a routine if I didn't visit a site at least 5 times per month, so it's reasonable to cut it off here.

In [None]:
# Count the remaining entrances per domain and keep only the records of
# domains that were entered more than 23 times.
visits_per_domain = df['netloc'].value_counts()
routine_domains = visits_per_domain.index[visits_per_domain > 23]
df = df[df['netloc'].isin(routine_domains)]

## Downloading

The buckets dataframe is created to keep just two types of information: the domain name and a wordbag field, to which we will append the trimmed and prepared contents of those of its subdirectories that I've visited. In order to cut down on the number of iterations we create a set called history, which stores the full URLs of sites whose content has already been downloaded.

To keep it even simpler we only include sites whose response code is equal to 200 (meaning the request succeeded). It later turns out that a few sites, ylilauta.org for example, are protected against web-crawling bots, so we'll have to label them manually.

The process of stripping the text has three steps. Firstly, we strip it of HTML tags. Secondly, we analyse each character to keep only normal letters.
Thirdly, each word is checked for uppercase letters appearing inside it.
There's a large probability that those would be leftovers of variable names from code. Additionally, we keep only words whose length is between 4 and 20 characters, lowercasing them all afterwards.
It's all finished by stemming with the use of two lemmatisation libraries: Morfeusz() for Polish and PorterStemmer for English, the choice between which is made by the classify() function from the langid library applied to the first 20 words from the site.

In [None]:
# Seconds to wait for the status-code probe; without a timeout a single
# unresponsive server would stall the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

# A word with an ASCII uppercase letter anywhere after its first character is
# most likely a leftover identifier from code (camelCase) rather than prose.
INNER_UPPERCASE = re.compile(r'.\w*[A-Z]\w*')

# Stopword lists are loaded once, as sets, instead of being re-read and
# scanned linearly for every single word.
# NOTE(review): confirm that a 'polish' list is present in the local NLTK data.
STOPWORDS_PL = frozenset(stopwords.words('polish'))
STOPWORDS_EN = frozenset(stopwords.words('english'))


def _extract_words(html):
    """Return the lowercased candidate words found in the text of *html*."""
    soup = BeautifulSoup(html, 'html.parser')
    # Join the text nodes with a space: joining with '' glues the last word of
    # one element to the first word of the next one.
    text = ' '.join(soup.stripped_strings)
    # Character analysis: every character that is not alphanumeric, or that is
    # a digit, becomes a separator.
    cleaned = ''.join(ch if (ch.isalnum() or ch == ' ') and not ch.isdigit() else ' '
                      for ch in text)
    # Keep words of 4 to 20 characters without an inner uppercase letter.
    return [word.lower() for word in cleaned.split()
            if 4 <= len(word) <= 20 and not INNER_UPPERCASE.match(word)]


def _stem_words(words, morfeusz, porter):
    """Return *words* without stopwords, reduced to base forms, as one string.

    The language is guessed by langid from the first 20 words: Polish text is
    processed with *morfeusz*, everything else with the *porter* stemmer.
    """
    if langid.classify(' '.join(words[:20]))[0] == 'pl':
        # The first interpretation's [2][1] field is taken as the base form;
        # anything after a ':' in it is cut off.
        return ' '.join(next(iter(morfeusz.analyse(word)))[2][1].split(':')[0]
                        for word in words if word not in STOPWORDS_PL)
    return ' '.join(porter.stem(word) for word in words if word not in STOPWORDS_EN)


# One row per domain; 'wordbag' accumulates the processed text of its pages.
df_buckets = pd.DataFrame({'netloc' : df['netloc'].unique(),
                           'wordbag': ''})
# Full URLs that have already been handled, so each page is fetched only once.
history = set()

# The analysers are created once and reused for every page.
morfeusz = Morfeusz()
porter = PorterStemmer()

op = webdriver.ChromeOptions()
op.add_argument('headless')
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object - confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_EXEC, options=op)

try:
    for url, netloc in zip(df['url'], df['netloc']):
        if url in history:
            continue
        history.add(url)

        # Cheap probe for the status code before spinning up the browser.
        # The crawl is best effort, so any failure just skips the page, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            status = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT).status_code
        except Exception as err:
            print(f'Skipping {url}: {err}')
            continue
        if status != 200:
            continue

        print(f'Conducting {url} ...') # The script is verbose.

        driver.get(url)
        # Scroll down step by step so lazily loaded content gets rendered.
        for _ in range(10):
            sleep(0.35)
            driver.execute_script('return scrollBy(0, 400);')

        words_lst = _extract_words(driver.page_source)
        text = _stem_words(words_lst, morfeusz, porter)

        # Every netloc in df has a bucket, as df_buckets is built from df.
        df_buckets.loc[df_buckets['netloc'] == netloc, 'wordbag'] += f' {text} '
finally:
    # Always shut the headless browser down, even when the crawl is aborted.
    driver.quit()


In [None]:
# Bare expression: as the last statement of a notebook cell it renders the
# per-domain wordbag table for inspection.
df_buckets

### Saving

We save it in JSON format for use in a separate analysis script.

In [None]:
# Serialise the per-domain wordbags to the WORDBAGS_JSON path using pandas'
# default JSON layout.
df_buckets.to_json(WORDBAGS_JSON)