In [1]:
import re
import json
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from io import StringIO

# Pages are fetched later in this notebook with verify=False; silence the
# urllib3 warnings so they do not flood the cell output on every request.
requests.packages.urllib3.disable_warnings()

# Path to the browsing-history export (JSON) that the notebook loads.
BROWSING_HISTORY_JSON = 'data/BrowserHistory.json'

# Data Preparation

As the goal is to apply a clustering algorithm to my browsing history in order to see whether it's possible to characterise my daily (or even weekly) browsing routine, we'll first need to investigate the browsing-history export itself (a JSON file). It is very likely that we'll only find the timestamp and the web address of each visit. That creates the need to look into each website's content, which in turn calls for a script that downloads and prepares the data.

Later we'll have to combine each batch of data (the timestamp and the particular site's details, along with the page content) into a single entry to feed the algorithm.

## Importing the browsing history
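We'll import the data spanning from 19.02.2022 until (roughly) 19.03.2022, which is today. Note that, for testing, the cell below keeps only the entries from 12.03.2022 00:00 UTC onwards, i.e. the last week.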


In [2]:
# Visits older than this timestamp (microseconds since the Unix epoch) are
# discarded. 1647043200000000 us == 2022-03-12 00:00:00 UTC, i.e. we only test
# on roughly one week of history.
MIN_TIME_USEC = 1647043200000000

# Read the export through a context manager so the file handle is closed
# deterministically; the encoding is explicit so non-ASCII page titles decode
# the same way on every platform.
with open(BROWSING_HISTORY_JSON, encoding='utf-8') as history_file:
    data = json.load(history_file)

df = pd.json_normalize(data, record_path=['Browser History'])

# Rows to discard because they are older than the test window.
is_too_old = df['time_usec'] < MIN_TIME_USEC

# Only valid web links: the scheme must appear at the START of the URL
# (`str.match` anchors at the beginning). A plain substring test would also
# accept e.g. 'chrome://...?next=https://...'. na=False treats a missing URL
# as "not a web link" instead of propagating NaN into the mask.
is_web_url = df['url'].str.match(r'https?://', na=False)

# Build a new, filtered frame instead of dropping in place, so re-running this
# cell always yields the same result. The original row labels are preserved.
df = df[~is_too_old & is_web_url].copy()

In [3]:
# Preview the filtered browsing history.
df

Unnamed: 0,favicon_url,page_transition,title,url,client_id,time_usec
0,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675449005646
1,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675445981493
2,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675437100568
3,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675435904327
4,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675427134517
...,...,...,...,...,...,...
2354,https://pl.wikipedia.org/static/favicon/wikipe...,AUTO_BOOKMARK,"Gramatyka języka fińskiego – Wikipedia, wolna ...",https://pl.wikipedia.org/wiki/Gramatyka_j%C4%9...,CVEyydd4EXv6yEd2mZeQ2g==,1647074739727571
2355,https://www.youtube.com/s/desktop/6007d895/img...,TYPED,YouTube,https://www.youtube.com/,CVEyydd4EXv6yEd2mZeQ2g==,1647074717965150
2356,https://pia-hs.dsuj.pl/img/favicon.svg,TYPED,Hotspot > Logowanie,https://pia-hs.dsuj.pl/login?dst=http%3A%2F%2F...,CVEyydd4EXv6yEd2mZeQ2g==,1647074696486888
2357,,AUTO_TOPLEVEL,Połączenie z siecią Wi-Fi,https://www.overleaf.com/project/6064f7b0f3d64...,CVEyydd4EXv6yEd2mZeQ2g==,1647074695024254


In [65]:
# Independent copy of the last 20 rows of the filtered history.
df_trim = df.tail(20).copy()

In [None]:
# Preview the trimmed frame.
df_trim

In [4]:
MAX_CONTENT_CHARS = 3500  # number of leading characters of page text kept per visit
REQUEST_TIMEOUT_S = 10    # seconds; without a timeout one stalled server blocks the whole loop


def _fetch_page_text(url):
    """Download ``url`` and return ``(status_code, text)`` for it.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple of (int, str)
        * ``(200, text)`` - the page's visible text with newlines removed,
          truncated to ``MAX_CONTENT_CHARS`` characters.
        * ``(status, '')`` - for any non-200 response.
        * ``(0, '')`` - when the download or the parsing raised an exception
          (connection error, timeout, invalid URL, ...).

    The function never raises for ordinary errors, so a single bad URL cannot
    abort the whole crawl.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate validation.
        # Kept because the notebook deliberately silences the related
        # warnings, but the downloaded content is not authenticated.
        page = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT_S)
        if page.status_code != 200:
            return page.status_code, ''
        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser happens to be installed, and results differ between machines.
        soup = BeautifulSoup(page.text, 'html.parser')
        return page.status_code, soup.get_text().replace('\n', '')[:MAX_CONTENT_CHARS]
    except Exception:
        # Best effort: record the failure as status 0 with empty content.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt through,
        # so the long-running loop can still be stopped by hand.
        return 0, ''


response_codes = []
contents = []

# Each URL gets its own fresh (status, content) pair. Previously the content
# variable was only overwritten on a 200 response, so a page answering e.g.
# 404 silently inherited the text of the page fetched before it.
for url in df.url:
    status_code, content = _fetch_page_text(url)
    response_codes.append(status_code)
    contents.append(content)

# New frame: the original columns plus the HTTP status and the page text.
df_contents = df.assign(response_code=response_codes, content=contents)



In [5]:
# Preview the history with the added response_code and content columns.
df_contents

Unnamed: 0,favicon_url,page_transition,title,url,client_id,time_usec,response_code,content
0,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675449005646,200,Logowanie – Konta Google Jedno konto. Wszystk...
1,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675445981493,200,Logowanie – Konta Google Jedno konto. Wszystk...
2,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675437100568,200,Logowanie – Konta Google Jedno konto. Wszystk...
3,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675435904327,200,Logowanie – Konta Google Jedno konto. Wszystk...
4,https://www.google.com/favicon.ico,LINK,Google Takeout,https://takeout.google.com/?pli=1,4/Ko9O98IRCCudQQfWle4g==,1647675427134517,200,Logowanie – Konta Google Jedno konto. Wszystk...
...,...,...,...,...,...,...,...,...
2354,https://pl.wikipedia.org/static/favicon/wikipe...,AUTO_BOOKMARK,"Gramatyka języka fińskiego – Wikipedia, wolna ...",https://pl.wikipedia.org/wiki/Gramatyka_j%C4%9...,CVEyydd4EXv6yEd2mZeQ2g==,1647074739727571,200,"Gramatyka języka fińskiego – Wikipedia, wolna ..."
2355,https://www.youtube.com/s/desktop/6007d895/img...,TYPED,YouTube,https://www.youtube.com/,CVEyydd4EXv6yEd2mZeQ2g==,1647074717965150,200,YouTubeInformacjeCentrum prasowePrawa autorski...
2356,https://pia-hs.dsuj.pl/img/favicon.svg,TYPED,Hotspot > Logowanie,https://pia-hs.dsuj.pl/login?dst=http%3A%2F%2F...,CVEyydd4EXv6yEd2mZeQ2g==,1647074696486888,404,
2357,,AUTO_TOPLEVEL,Połączenie z siecią Wi-Fi,https://www.overleaf.com/project/6064f7b0f3d64...,CVEyydd4EXv6yEd2mZeQ2g==,1647074695024254,403,


In [6]:
# Save the history together with the fetched response codes and page
# contents to a JSON file.
df_contents.to_json('data/BrowsingHistoryUpdatedWeek.json')

In [None]:
# NOTE(review): this cell has no execution count and references `self`,
# `result_levels` and `slugify`, none of which are defined or imported in the
# visible part of the notebook. It looks like a fragment pasted from a class
# method elsewhere; unless those names are defined outside this view it will
# raise NameError on a fresh kernel. Confirm the intent or remove the cell.

# Re-reads the raw export and rebinds `df`, replacing the filtered frame
# built earlier in the notebook.
df = pd.read_json(BROWSING_HISTORY_JSON)
# Rename 'results.id' to the level name selected by `self.lvl`, and 'val' to
# '<slugified parent_id>-<name>'.
df.rename(columns={'results.id' :  result_levels[self.lvl],
                   'val'        :  slugify(self.parent_id, allow_unicode=True) + '-' + self.name},
         inplace=True)
# Drop the 'attrId' column, then index the frame by (level name, 'year').
# NOTE(review): the columns 'results.id', 'val', 'attrId' and 'year' do not
# appear in the browsing-history frames shown above - verify.
df.drop('attrId', axis=1, inplace=True)
df.set_index([result_levels[self.lvl],'year'], inplace=True)