In [1]:
# library imports
import csv
import nltk
import pandas as pd
import numpy as np

In [2]:
# import NDP file (~3GB of plain text)
ndp_file = "/Users/ianmilligan1/Desktop/227-www-ndp-ca.csv"

In [3]:
# load the NDP plain text as a dataframe
df = pd.read_csv(ndp_file, sep = ",", usecols=[0,1,2,3], header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE)
df = df.sort_values(0)

In [4]:
# there is too much data in this dataframe!
# let's focus just on the homepage
# find a homepage to focus on for the diffs
homepages = df.loc[df[2] == "http://www.ndp.ca/"]

In [5]:
# how many homepages are there?
len(homepages)

90

In [6]:
# let's look at the first ROW of the dataframe
# this should be, given sorting, the first crawl

homepages.iloc[0].tolist()

['(20051007',
 'www.ndp.ca',
 'http://www.ndp.ca/',
 "NDP | The New Democratic Party of Canada ABOUT ›\xa0Jack Layton ›\xa0Our Caucus ›\xa0Our History ›\xa0Contact Us GET INVOLVED ›\xa0Your Riding ›\xa0Campaigns ›\xa0Events ›\xa0Youth RESOURCES ›\xa0Press Room ›\xa0Speeches & Articles ›\xa0Downloads ›\xa0e.NDP ›\xa0More... SEARCH ›\xa0Français\xa0 › NDP Budget Details › Jack's Budget Speech › Quotes on NDP Budget › Session in Review Privacy Policy |\xa0Jobs |\xa0RSS © 2005 New Democratic Party"]

In [7]:
# let's just strip the text out
# this is the fourth column in the dataframe
# (which is 3 in Python's start from 0 numbering)

alltext = homepages[3].tolist()

# alltext now contains all of the homepage text

In [8]:
# let's begin by comparing texts
# we can load the SequenceMatcher library

from difflib import SequenceMatcher

In [9]:
# let's now iterate through each pair of pages, diffing them 
# with each other. 
# we then create a list of lists, which can feed into a DF
#

final = []
for y in range (0,len(homepages)-1):
    first_page = homepages.iloc[y].tolist()
    second_page = homepages.iloc[y+1].tolist()
    seq = SequenceMatcher(None, first_page[3],second_page[3])
    distance = seq.ratio()*100
    data = first_page[0],first_page[2],second_page[0],second_page[2],distance
    final.append(list(data))

In [11]:
# now in final we have the results.
# let's turn it into a dataframe

ndp_diffs_sequence = pd.DataFrame(final, columns=['origin_date','origin_url','target_date','target_url','diff'])

In [12]:
# here I sort the dataframes
# in this example, the lower diff numbers are most different

sorted = ndp_diffs_sequence.sort_values('diff')

In [13]:
sorted

Unnamed: 0,origin_date,origin_url,target_date,target_url,diff
88,(20180724,http://www.ndp.ca/,(20180807,http://www.ndp.ca/,0.000000
86,(20180505,http://www.ndp.ca/,(20180724,http://www.ndp.ca/,0.000000
85,(20180505,http://www.ndp.ca/,(20180505,http://www.ndp.ca/,0.000000
87,(20180724,http://www.ndp.ca/,(20180724,http://www.ndp.ca/,0.000000
82,(20180205,http://www.ndp.ca/,(20180205,http://www.ndp.ca/,0.000000
81,(20171105,http://www.ndp.ca/,(20180205,http://www.ndp.ca/,0.000000
61,(20131105,http://www.ndp.ca/,(20140205,http://www.ndp.ca/,0.000000
62,(20140205,http://www.ndp.ca/,(20140505,http://www.ndp.ca/,0.000000
70,(20160205,http://www.ndp.ca/,(20160505,http://www.ndp.ca/,0.000000
71,(20160505,http://www.ndp.ca/,(20160505,http://www.ndp.ca/,0.000000
