In [10]:
import re
import json
import langid
import datetime
import requests
import pandas as pd
import advertools as adv

from bs4 import BeautifulSoup
from morfeusz2 import Morfeusz
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4.element import Comment

# Silence urllib3 warnings for the whole session; the scraping loop further
# down calls requests.get(..., verify=False).
requests.packages.urllib3.disable_warnings()

# Browser history export that is loaded into `df`.
BROWSING_HISTORY_JSON = 'data/BrowserHistory.json'
# Word-bag documents that are read back into `df_contents`.
DOCUMENTS_JSON = 'data/BrowsingHistoryWordbags.json'

In [11]:
def tag_visible(element):
    """Tell whether a parsed text node should count as visible page content.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is a `Comment`.

    Args:
        element: a text node exposing `.parent.name`.

    Returns:
        bool: True for visible text, False otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    inside_hidden_tag = element.parent.name in hidden_parents
    # The parent test runs first; the Comment test is only reached for nodes
    # that sit inside a rendered tag.
    return not (inside_hidden_tag or isinstance(element, Comment))

# Data Preparation

The goal is to analyse the time series arising from my web browser history in order to predict my usual daily routine. Along the way we'll make use of a clustering algorithm to automatically label the visited sites as leisure-oriented (for example: YouTube, Facebook or Coub), math- or programming-oriented (Stack Overflow, various documentation sites, Wikipedia), etc.

This creates a demand for two types of data: each site's contents as a document, and a time series of visited site types keyed by timestamp. Properly gathering and preparing both of them will be challenging.

Let's start with the sites. As we'll feed them to a clustering algorithm via some document vectorisation method, we need to consider the following:

1. The less sparse the similarity matrix (made up of the documents' feature vectors), the better the clustering algorithm will perform. As each feature vector entry is associated with one word (globally, since every document's feature vector has to "make a spot" for its own value for that word), we should keep only the most important words. Our measure of importance will be dictated by:
    - The need to represent site's visible content, not the structure. We need to get rid of any code contained in downloaded .html file.
    - Our decision not to take context into account (for simplicity, as we feel confident that a general document clustering will do), so we don't need to include punctuation or special characters in general.
    - The need to reduce noise inside the document. We'll filter for stopwords.
    - Generality of a word. We should count different forms of one word as several occurrences of that word. We will stem each word.

2. Downloading should be performed in a way optimising the end-goal timeseries, so:
    - We need to get rid of noise in the browsing history. We should omit sites that don't meet a visit-frequency threshold.
    - We should get rid of repetitions. We can do it in two ways:
        + We can create "domain buckets" that represent a domain with its various paths as one entry (to be explained in detail later) along with a general content profile of each. This serves not only to reduce the download volume, but also to make the clustering process more stable.
        + We can skip the lengthy lurking process on one domain bucket and leave just one representative, later filling the gap between timestamps with copies of it.
    - We need to drop records of sites that we cannot access.
    
After such processing, we should achieve a nice, dense similarity matrix.

Additionally, we'll make a comparison with a clustering based on the pages' titles. Their preparation will be the same as for the sites within the scope of the second point above, but word filtering and generalisation won't be as restrictive.

## Importing the browsing history

We'll import the data spanning from 19.10.2021 until (but not exactly) 19.03.2022, which marks the beginning of this project.

After reading the data into a dataframe we limit it to the chosen timespan and to accessible URLs (beginning with "http" / "https"). We also keep only the columns of interest, that is the title, url and time_usec.

We then reset the indexing to match the current record count. It's worth noting that we begin at 0.

In [12]:
# Read the browser history export; the context manager closes the file handle
# (the previous json.load(open(...)) left it open).
with open(BROWSING_HISTORY_JSON, encoding="utf-8") as history_file:
    data = json.load(history_file)
df = pd.json_normalize(data, record_path=['Browser History'])

# Discard visits made before the cut-off timestamp (microseconds since the epoch).
# NOTE(review): 1645228800000000 us is 2022-02-19 00:00:00 UTC, while the prose
# above mentions 19.10.2021 - confirm which start date is intended.
df = df[~(df.time_usec < 1645228800000000)]

# Keep only URLs that *begin* with http:// or https://. str.match anchors at the
# start of the string, whereas the previous str.contains also accepted URLs
# that merely embed such an address (e.g. in a query string).
df = df[df.url.str.match(r'https?://', na=False)]

# Keep the columns of interest and renumber the rows from 0; later cells index
# the frame by position.
df = df[['title', 'url', 'time_usec']].reset_index(drop=True)

## Partitioning of URLs

As we want to separate the domain from the whole address, we use the advertools.url_to_df() function, which creates a dataframe where each row holds the components of one URL. We'll only use the netloc field, as it's the domain we are looking for.

In [13]:
# Split every visited URL into its components and keep only the domain part.
url_parts = adv.url_to_df(df.url)
df_add = url_parts[['netloc']]
# Attach the domain to the history rows (index-aligned join).
# Re-running this cell on an already joined frame would collide on 'netloc'.
df = df.join(df_add)

## Volume reduction

In this section we reduce the overall volume of our dataset for the purpose of noise cutting. We're implementing the solutions derived in both points above.

In [14]:
# Record the per-domain visit counts before any rows are removed.
df_amounts = df['netloc'].value_counts()

# Collapse every run of consecutive visits to the same domain down to its last
# row and drop www.google.com entries altogether. The comparison with the next
# row is vectorised via shift(-1): the final row has no successor, so it is
# never treated as a repeat. Unlike the previous list-based version, a trailing
# www.google.com row is now dropped too, and an empty frame no longer raises a
# length mismatch.
netloc = df['netloc']
repeats_next = netloc.eq(netloc.shift(-1)).fillna(False).astype(bool)
is_google = netloc.eq('www.google.com').fillna(False).astype(bool)

# Keep the surviving rows with only the columns used downstream.
df = df.loc[~(repeats_next | is_google), ['time_usec', 'title', 'url', 'netloc']]

In [None]:
# Per-domain visit counts captured before the repeat / google filtering.
df_amounts

In [None]:
# Per-domain visit counts of the current (filtered) frame, for comparison with df_amounts.
df['netloc'].value_counts()

In [15]:
# Drop domains that are visited (as a separate session) too rarely.
# NOTE(review): the original remark spoke of "fewer than 5 times per month",
# but the comparison keeps only domains with MORE than 5 rows - confirm the
# intended threshold.
visits_per_netloc = df['netloc'].value_counts()
is_frequent = df['netloc'].map(visits_per_netloc) > 5
df = df[is_frequent]

In [None]:
REQUEST_TIMEOUT_S = 30  # give up on unresponsive hosts instead of hanging the cell

# Expensive NLP resources (stop-word sets, stemmers) are built lazily on first
# use and then reused, instead of being rebuilt for every word / every page.
_nlp_resources = {}


def _cached(key, factory):
    """Return the resource stored under `key`, creating it with `factory()` once."""
    if key not in _nlp_resources:
        _nlp_resources[key] = factory()
    return _nlp_resources[key]


def _fetch_html(url):
    """Download `url` and return its body text, or None when it is unusable.

    None is returned when the request raises or the status code is not 200.
    Only `Exception` subclasses are caught, so interrupting the kernel
    (KeyboardInterrupt) still stops the loop.
    """
    try:
        # verify=False is kept from the original cell: certificate checks are
        # deliberately skipped and the matching warnings are silenced above.
        page = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT_S)
    except Exception as error:
        print(f'Skipping {url}: {error}')
        return None
    return page.text if page.status_code == 200 else None


def _visible_text(html):
    """Join all text nodes of `html` that `tag_visible` accepts, space-separated."""
    soup = BeautifulSoup(html, 'html.parser')
    text_nodes = soup.find_all(string=True)
    return ' '.join(node.strip() for node in filter(tag_visible, text_nodes))


def _extract_words(text):
    """Turn raw text into a list of lower-cased candidate words.

    Character pass: everything that is not alphanumeric or a space, and every
    digit, becomes a space. Word pass: tokens containing an uppercase A-Z
    after their first character and tokens longer than 20 characters are dropped.
    """
    cleaned = ''.join(ch if (ch.isalnum() or ch == ' ') and not ch.isdigit() else ' '
                      for ch in text)
    return [word.lower() for word in cleaned.split()
            if not (re.match(r'.\w*[A-Z]\w*', word) or len(word) > 20)]


def _normalise_words(words):
    """Remove stop words and reduce the remaining words to a base form.

    The language is guessed by langid from the first 20 words. For Polish the
    lemma of the first Morfeusz interpretation is used (text after ':' is cut
    off); otherwise the Porter stemmer with English stop words is applied.

    Returns:
        str: the processed words joined with single spaces.
    """
    if langid.classify(' '.join(words[:20]))[0] == 'pl':
        stop_words = _cached('stopwords:polish', lambda: frozenset(stopwords.words('polish')))
        lemmatizer = _cached('morfeusz', Morfeusz)
        return ' '.join(next(iter(lemmatizer.analyse(word)))[2][1].split(':')[0]
                        for word in words if word not in stop_words)
    stop_words = _cached('stopwords:english', lambda: frozenset(stopwords.words('english')))
    stemmer = _cached('porter', PorterStemmer)
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


# One bucket per domain; its word bag accumulates the text of every page
# visited on that domain.
df_buckets = pd.DataFrame({'netloc': df['netloc'].unique(),
                           'wordbag': ''})

for _, row in df.iterrows():
    html = _fetch_html(row['url'])
    if html is None:
        continue
    print(f'Conducting {row.url} ...')
    text = _normalise_words(_extract_words(_visible_text(html)))
    # Every netloc in df has a bucket (the buckets were built from df above),
    # so the mask always selects exactly that domain's row.
    df_buckets.loc[df_buckets['netloc'] == row['netloc'], 'wordbag'] += f' {text} '



Conducting https://www.youtube.com/watch?v=Aff2g5mVt1Q ...
Conducting https://www.youtube.com/ ...
Conducting https://www.linkedin.com/notifications/ ...
Conducting https://www.linkedin.com/ ...
Conducting https://login.uj.edu.pl/login?service=https%3A%2F%2Fwww.usosweb.uj.edu.pl%2Fkontroler.php%3F_action%3Dlogowaniecas%2Findex&locale=pl ...
Conducting https://en.wikipedia.org/wiki/Data_mining ...
Conducting https://en.wikipedia.org/wiki/Statistical_learning_theory ...
Conducting https://nofluffjobs.com/pl/praca-it ...
Conducting https://www.kaggle.com/learn ...
Conducting https://en.wikipedia.org/wiki/Continuous_uniform_distribution ...
Conducting https://en.wikipedia.org/wiki/Iterated_integral ...
Conducting https://en.wikipedia.org/wiki/Central_limit_theorem ...
Conducting https://pl.wikipedia.org/wiki/Wariancja ...
Conducting https://en.wikipedia.org/wiki/Monte_Carlo_method ...
Conducting https://en.wikipedia.org/wiki/Volume_of_an_n-ball ...
Conducting https://math.stackexchange.com

In [9]:
# Display the per-domain word bags.
df_buckets

Unnamed: 0,netloc,wordbag
0,www.youtube.com,informacja centrum prasowy prawa autorski sko...
1,ylilauta.org,
2,www.linkedin.com,third parti custom partner servic provid use ...
3,www.usosweb.uj.edu.pl,
4,login.uj.edu.pl,informacja aktualizacja przeglądarka szanowny...
5,en.wikipedia.org,data mine wikipedia free encyclopedia jump na...
6,www.canva.com,
7,nofluffjobs.com,requir page oferti praci profil firm mastercl...
8,www.kaggle.com,
9,www.overleaf.com,


In [None]:
# Scratch check of the masked in-place append used when filling the buckets.
# It runs on a copy so the placeholder text never ends up in the real
# df_buckets, which is what gets written to disk.
text = 'diupa'
scratch_buckets = df_buckets.copy()
scratch_buckets.loc[scratch_buckets['netloc'] == 'www.canva.com', 'wordbag'] += f' {text} '

In [None]:
# Persist the per-domain word bags as JSON.
# NOTE(review): this path differs from DOCUMENTS_JSON
# ('data/BrowsingHistoryWordbags.json'), the file read back later - confirm
# that this is intended.
df_buckets.to_json('data/WordbagsBuckets.json')

In [None]:
# Prefix every word bag with metadata: page title, transition type and the
# weekday / time of the visit.
# NOTE(review): df_finale is not defined in any visible cell, and
# 'page_transition' is not among the columns kept in df - confirm where this
# frame comes from.
# datetime.fromtimestamp() takes a single number, not a whole Series, so the
# timestamp (microseconds, hence the division) is converted row by row; the
# result stays in local time as before.
visit_time = df_finale['time_usec'].map(
    lambda usec: datetime.datetime.fromtimestamp(usec / 1000000.0).strftime('%A %H:%M:%S')
)
df_finale['wordbag'] = df_finale['title'].astype(str) + ' ' \
                    + df_finale['page_transition'].astype(str) + ' ' \
                    + visit_time + ' ' \
                    + df_finale['wordbag'].astype(str)

In [None]:
# Load the word-bag documents from DOCUMENTS_JSON.
# NOTE(review): no visible cell writes this file (the buckets are saved to
# 'data/WordbagsBuckets.json') - confirm where it is produced.
df_contents = pd.read_json(DOCUMENTS_JSON)