In [1]:
# library imports
import csv
import nltk
import pandas as pd
import numpy as np
import matplotlib

In [2]:
# import NDP file (~3GB of plain text)
ndp_file = "/Users/caoimherooney/Desktop/227-www-ndp-ca.txt"

In [3]:
# load the NDP plain text as a dataframe
df = pd.read_csv(ndp_file, sep = ",", usecols=[0,1,2,3], header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE)
df = df.sort_values(0)

In [4]:
# there is too much data in this dataframe!
# let's focus just on the homepage
# find a homepage to focus on for the diffs
homepages = df.loc[df[2] == "http://www.ndp.ca/"]

In [5]:
# let's begin by comparing texts
# we can load the SequenceMatcher library

from difflib import SequenceMatcher

In [6]:
# byte-wise metric
# compares two pages character by character
# returns 1 if there is any change at all between two pages

bytewise = []
for y in range(len(homepages)-1):
    first_page = "".join(homepages.iloc[y].tolist()) # convert webpage text into one long string
    second_page = "".join(homepages.iloc[y+1].tolist()) 
    if len(first_page) == len(second_page):
        for k in range(len(first_page)):
            if first_page[k] != second_page[k]:
                bytewise.append(1)
                break   
            elif (k == len(first_page) - 1) and (first_page[k] == second_page[k]):
                bytewise.append(0)
    else:
        bytewise.append(1)

# output is the list of metric values: either 0 (no change) or 1 (any change)
print(bytewise)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
# TF.IDF metric
# compares the occurrence of chosen words in each page
# generates vectors weighted by such occurrences and calculates cosine distance between them

import re
import random
import numpy as np

tfidf = []
for y in range(len(homepages)-1):
    first_page = re.sub("[^\w]", " ",  "".join(homepages.iloc[y].tolist())).split()
    second_page = re.sub("[^\w]", " ",  "".join(homepages.iloc[y+1].tolist())).split()

    # generate random list of words of length >= 5 that appear in first page
    random_words = []
    for i in range(10):
        rand = random.choice(first_page)
        while len(rand) <= 4:
            rand = random.choice(first_page)    
        random_words.append(rand)
    
    # count how many times random words appear in each page
    v1 = []
    v2 = []
    for j in range(len(random_words)):
        v1.append(first_page.count(random_words[j]))
        v2.append(second_page.count(random_words[j]))
    
    if np.linalg.norm(v2) == 0:
        D = 1
    else:
        D = 1 - np.inner(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    tfidf.append(D)

# output is the list of metric values between 0 (very similar) and 1 (very different)
print(tfidf)

[-2.220446049250313e-16, 2.220446049250313e-16, 0.021054989627438903, 0.0, 0.0540946970730829, 0.42264973081037416, -2.220446049250313e-16, 0.0, 0.4522774424948339, 0.3930230213331162, 0.44529980377477085, 0.25464400750007, 0.14160492472104802, 0.23623738417402662, 0.19096016504410962, 0.2499999999999999, 0.24738219099361825, 0.25, 0.4522774424948339, 0.28388512596056714, 0.2132042075305568, 0.26620061429465713, 0.3377338214674781, 0.3675444679663241, 0.20000000000000018, 0.10557280900008414, 0.12294198069297069, 0.08013378899220003, 0.07417990022744858, 0.20740607609878292, 0.4522774424948339, 0.14188366967896693, 0.09861218113400261, 0.125, 0.03923107716947705, 0.15384615384615374, 0.16333997346592444, 0.21553545944726393, 1, 0.4023856953328032, 0.3675444679663241, 0.16794970566215628, 0.5635642195280153, 0.47521642859401036, 0.4729537233052702, 0.23528088709812756, 0.37639043553767637, 1, 0.683772233983162, 0.683772233983162, 0.029274656605849048, 0.12294198069297069, 0.052377445526

In [8]:
# edit distances metric
# number of edits that are required to transform one sentence into another.
# we normalise this number by the total number of characters in each of the two pages 
# ie deleting all of page 1 and adding all of page 2

edit_distances = []
for y in range(len(homepages)-1):
    first_page = "".join(homepages.iloc[y].tolist()) # convert webpage text into one long string
    second_page = "".join(homepages.iloc[y+1].tolist()) 
    len_first = len(first_page)
    len_second = len(second_page)
    
    distance = nltk.edit_distance(first_page, second_page, transpositions=False)
    edit_distances.append(distance / (len_first + len_second))

# output is the list of metric values between 0 (very similar) and 1 (very different)
print(edit_distances)

[0.002325581395348837, 0.0011627906976744186, 0.0011627906976744186, 0.0011627906976744186, 0.002325581395348837, 0.1267605633802817, 0.0010141987829614604, 0.002028397565922921, 0.41021548284118114, 0.23275862068965517, 0.4267399267399267, 0.3869346733668342, 0.22781065088757396, 0.5180851063829788, 0.5, 0.3263525305410122, 0.31356693620844567, 0.2268199233716475, 0.3921951219512195, 0.398422090729783, 0.20814132104454686, 0.22344610542879623, 0.4748858447488584, 0.5545073375262054, 0.21248499399759904, 0.12749213011542498, 0.12493268712977922, 0.15263983272347098, 0.15626560159760358, 0.20020964360587, 0.2292358803986711, 0.26662444585180495, 0.14487632508833923, 0.14931906614785992, 0.14557889594528578, 0.1411351079859367, 0.14461994076999013, 0.11724806201550388, 0.8921124206708976, 0.8498727735368957, 0.26651818856718634, 0.23558368495077356, 0.36596523330283626, 0.3399915002124947, 0.25690140845070425, 0.25314285714285717, 0.2769709543568465, 0.8818525519848771, 0.832920792079207

In [16]:
# word distance metric 
# calculates how many of the words on a page have changed
# split option allows us to consider if splitting the strings into words makes a difference

word_distance = []
word_distance_split = []
for y in range (0,len(homepages)-1):
    first_page = homepages.iloc[y].tolist()
    second_page = homepages.iloc[y+1].tolist()
    seq = SequenceMatcher(None, first_page[3],second_page[3])
    seq_split = SequenceMatcher(None, first_page[3].split(),second_page[3].split())
    distance = 1-seq.ratio()
    distance_split = 1-seq_split.ratio()
    word_distance.append(distance)
    word_distance_split.append(distance_split)
    
# output is the list of metric values between 0 (very similar) and 1 (very different)
print(word_distance)
print(word_distance_split)
    


[0.0, 0.0, 0.0, 0.0, 0.0, 0.2673733804475854, 0.0, 0.0, 0.7981340118744699, 0.7294281729428174, 0.7347740667976425, 0.7003257328990228, 0.5727699530516432, 0.7251732101616628, 0.6795180722891566, 0.7332089552238805, 0.7305101058710298, 0.5905767668562145, 0.7266035751840167, 0.6744680851063829, 0.5684039087947883, 0.5538847117794486, 0.6807980049875312, 0.8590909090909091, 0.4183417085427136, 0.3471615720524017, 0.37745372966909707, 0.3561718325176726, 0.40902021772939345, 0.5070883315158125, 0.30817610062893086, 0.36345514950166113, 0.3917147351861563, 0.40262361251261347, 0.35022807906741005, 0.323943661971831, 0.39856557377049184, 0.3346733668341708, 0.9591836734693877, 0.9606741573033708, 0.7391987431264728, 0.510385756676558, 0.6761363636363636, 0.6972356296621325, 0.606114050558495, 0.5906921241050119, 0.6839266450916937, 0.9695121951219512, 0.9455040871934605, 0.7764127764127764, 0.8716119828815977, 0.2813141683778234, 0.256857855361596, 0.23469387755102045, 0.6083650190114068, 