# Exploring Diffs in the NDP Plain Text File

In [1]:
# library imports
import csv
import os

import nltk
import numpy as np
import pandas as pd

In [3]:
# location of the NDP plain-text export (~3GB of plain text).
# The path is machine-specific, so it can be overridden by setting the
# NDP_FILE environment variable; when that is unset, the original local
# path is used unchanged.
ndp_file = os.environ.get("NDP_FILE", "/Users/ianmilligan1/Desktop/227-www-ndp-ca.csv")

In [4]:
# load the NDP plain text as a dataframe, then order the rows by column 0
# (judging by the record shown further down, column 0 holds the crawl date).
#
# - header=None: the file has no header row, so columns are addressed by
#   integer position (0-3) throughout the notebook.
# - usecols=[0,1,2,3]: only the first four fields are kept.
# - quoting=csv.QUOTE_NONE: quote characters are treated as ordinary text.
# - on_bad_lines="warn": rows with too many fields are skipped with a warning.
#   This replaces error_bad_lines=False, which was deprecated in pandas 1.3
#   and removed in pandas 2.0; use "skip" instead to drop them silently.
df = pd.read_csv(
    ndp_file,
    sep=",",
    usecols=[0, 1, 2, 3],
    header=None,
    on_bad_lines="warn",
    quoting=csv.QUOTE_NONE,
)
df = df.sort_values(0)

In [5]:
# narrow the frame down to a single page to diff: every record whose
# URL field (column 2) is exactly the NDP homepage
is_homepage = df[2] == "http://www.ndp.ca/"
homepages = df.loc[is_homepage]

In [7]:
# number of homepage records that were found
len(homepages.index)

90

In [8]:
# inspect a single record as a plain list of its fields;
# the frame was sorted on column 0, so this should be the
# first crawl in the collection
first_record = homepages.iloc[0]
first_record.tolist()

['(20051007',
 'www.ndp.ca',
 'http://www.ndp.ca/',
 "NDP | The New Democratic Party of Canada ABOUT ›\xa0Jack Layton ›\xa0Our Caucus ›\xa0Our History ›\xa0Contact Us GET INVOLVED ›\xa0Your Riding ›\xa0Campaigns ›\xa0Events ›\xa0Youth RESOURCES ›\xa0Press Room ›\xa0Speeches & Articles ›\xa0Downloads ›\xa0e.NDP ›\xa0More... SEARCH ›\xa0Français\xa0 › NDP Budget Details › Jack's Budget Speech › Quotes on NDP Budget › Session in Review Privacy Policy |\xa0Jobs |\xa0RSS © 2005 New Democratic Party"]

In [45]:
# pair every homepage record with the record that follows it and
# measure the edit distance between their texts (column 3).
# each pair becomes one row:
#   [first date, first url, second date, second url, distance]
# the resulting list of lists can feed straight into a DF
#

records = [homepages.iloc[pos].tolist() for pos in range(len(homepages))]
final = [
    [
        earlier[0],
        earlier[2],
        later[0],
        later[2],
        nltk.edit_distance(earlier[3], later[3]),
    ]
    for earlier, later in zip(records, records[1:])
]

In [48]:
# one row per consecutive pair of records; column order matches the
# order in which each row of `final` was assembled
diff_columns = ['origin_date','origin_url','target_date','target_url','diff']
ndp_diffs = pd.DataFrame(data=final, columns=diff_columns)

In [49]:
# rank the pairs so the largest diffs come first

ndp_diffs.sort_values(by='diff', ascending=False)

Unnamed: 0,origin_date,origin_url,target_date,target_url,diff
64,(20140805,http://www.ndp.ca/,(20141105,http://www.ndp.ca/,1475
65,(20141105,http://www.ndp.ca/,(20150206,http://www.ndp.ca/,1420
62,(20140205,http://www.ndp.ca/,(20140505,http://www.ndp.ca/,1232
56,(20120803,http://www.ndp.ca/,(20121103,http://www.ndp.ca/,1231
38,(20080711,http://www.ndp.ca/,(20080930,http://www.ndp.ca/,981
47,(20100503,http://www.ndp.ca/,(20100803,http://www.ndp.ca/,932
67,(20150505,http://www.ndp.ca/,(20150805,http://www.ndp.ca/,932
42,(20090502,http://www.ndp.ca/,(20090802,http://www.ndp.ca/,799
43,(20090802,http://www.ndp.ca/,(20091102,http://www.ndp.ca/,798
76,(20161105,http://www.ndp.ca/,(20170205,http://www.ndp.ca/,773
