Update to Python 3 #69

Closed
wants to merge 2 commits
12 changes: 6 additions & 6 deletions scripts/combine_grouped_links_files.py
@@ -4,7 +4,7 @@
 Output is written to stdout.
 """
 
-from __future__ import print_function
+
 
 import io
 import sys
@@ -14,7 +14,7 @@
 # Validate input arguments.
 if len(sys.argv) < 2:
     print('[ERROR] Not enough arguments provided!')
-    print('[INFO] Usage: {0} <outgoing_links_file> <incoming_links_file>'.format(sys.argv[0]))
+    print(('[INFO] Usage: {0} <outgoing_links_file> <incoming_links_file>'.format(sys.argv[0])))
     sys.exit()
 
 OUTGOING_LINKS_FILE = sys.argv[1]
@@ -31,16 +31,16 @@
 # Create a dictionary of page IDs to their incoming and outgoing links.
 LINKS = defaultdict(lambda: defaultdict(str))
 for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'r')):
-    [source_page_id, target_page_ids] = line.rstrip('\n').split('\t')
+    [source_page_id, target_page_ids] = line.decode('UTF-8').rstrip('\n').split('\t')
     LINKS[source_page_id]['outgoing'] = target_page_ids
 
 for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'r')):
-    [target_page_id, source_page_ids] = line.rstrip('\n').split('\t')
+    [target_page_id, source_page_ids] = line.decode('UTF-8').rstrip('\n').split('\t')
     LINKS[target_page_id]['incoming'] = source_page_ids
 
 # For each page in the links dictionary, print out its incoming and outgoing links as well as their
 # counts.
-for page_id, links in LINKS.iteritems():
+for page_id, links in list(LINKS.items()):
     outgoing_links = links.get('outgoing', '')
     outgoing_links_count = 0 if outgoing_links is '' else len(
         outgoing_links.split('|'))
@@ -52,4 +52,4 @@
     columns = [page_id, str(outgoing_links_count), str(
         incoming_links_count), outgoing_links, incoming_links]
 
-    print('\t'.join(columns))
+    print(('\t'.join(columns)))
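The .decode('UTF-8') calls added above are needed because gzip.open(..., 'r') yields bytes under Python 3. As a minimal sketch of an alternative approach (not part of this PR; file names below are placeholders), opening the archive in text mode yields str lines directly, so no per-line decode is required:

# Sketch only: text-mode gzip reading instead of per-line .decode('UTF-8').
import gzip
from collections import defaultdict

LINKS = defaultdict(lambda: defaultdict(str))

def load_grouped_links(path, direction):
    # 'rt' plus an explicit encoding makes gzip.open return str lines.
    with gzip.open(path, 'rt', encoding='UTF-8') as links_file:
        for line in links_file:
            page_id, grouped_ids = line.rstrip('\n').split('\t')
            LINKS[page_id][direction] = grouped_ids

# Hypothetical usage:
# load_grouped_links('outgoing_links.txt.gz', 'outgoing')
# load_grouped_links('incoming_links.txt.gz', 'incoming')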
11 changes: 5 additions & 6 deletions scripts/prune_pages_file.py
@@ -5,17 +5,16 @@
 Output is written to stdout.
 """
 
-from __future__ import print_function
+
 
 import io
 import sys
 import gzip
-from sets import Set
 
 # Validate input arguments.
 if len(sys.argv) < 3:
     print('[ERROR] Not enough arguments provided!')
-    print('[INFO] Usage: {0} <pages_file> <redirects_file>'.format(sys.argv[0]))
+    print(('[INFO] Usage: {0} <pages_file> <redirects_file>'.format(sys.argv[0])))
     sys.exit()
 
 PAGES_FILE = sys.argv[1]
@@ -32,13 +31,13 @@
 # Create a dictionary of redirects.
 REDIRECTS = {}
 for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-    [source_page_id, _] = line.rstrip('\n').split('\t')
+    [source_page_id, _] = line.decode('UTF-8').rstrip('\n').split('\t')
     REDIRECTS[source_page_id] = True
 
 # Loop through the pages file, ignoring pages which are marked as redirects but which do not have a
 # corresponding redirect in the redirects dictionary, printing the remaining pages to stdout.
 for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-    [page_id, page_title, is_redirect] = line.rstrip('\n').split('\t')
+    [page_id, page_title, is_redirect] = line.decode('UTF-8').rstrip('\n').split('\t')
 
     if is_redirect == '0' or page_id in REDIRECTS:
-        print('\t'.join([page_id, page_title, is_redirect]))
+        print(('\t'.join([page_id, page_title, is_redirect])))
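The deleted "from sets import Set" line is an unused Python 2 import; the sets module no longer exists in Python 3. Since REDIRECTS is only used for membership tests here, a built-in set would serve the same purpose. A small sketch under that assumption (helper name and file handling are illustrative, not taken from this PR):

# Sketch only: collect redirect source page IDs into a built-in set.
import gzip

def load_redirect_source_ids(redirects_path):
    source_ids = set()
    with gzip.open(redirects_path, 'rt', encoding='UTF-8') as redirects_file:
        for line in redirects_file:
            source_page_id, _ = line.rstrip('\n').split('\t')
            source_ids.add(source_page_id)
    return source_ids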
9 changes: 4 additions & 5 deletions scripts/replace_titles_and_redirects_in_links_file.py
@@ -10,7 +10,6 @@
 import io
 import sys
 import gzip
-from sets import Set
 
 # Validate inputs
 if len(sys.argv) < 4:
@@ -35,23 +34,23 @@
     sys.exit()
 
 # Create a set of all page IDs and a dictionary of page titles to their corresponding IDs.
-ALL_PAGE_IDS = Set()
+ALL_PAGE_IDS = set()
 PAGE_TITLES_TO_IDS = {}
 for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-    [page_id, page_title, _] = line.rstrip('\n').split('\t')
+    [page_id, page_title, _] = line.decode('UTF-8').rstrip('\n').split('\t')
     ALL_PAGE_IDS.add(page_id)
     PAGE_TITLES_TO_IDS[page_title] = page_id
 
 # Create a dictionary of page IDs to the target page ID to which they redirect.
 REDIRECTS = {}
 for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-    [source_page_id, target_page_id] = line.rstrip('\n').split('\t')
+    [source_page_id, target_page_id] = line.decode('UTF-8').rstrip('\n').split('\t')
     REDIRECTS[source_page_id] = target_page_id
 
 # Loop through each line in the links file, replacing titles with IDs, applying redirects, and
 # removing nonexistent pages, writing the result to stdout.
 for line in io.BufferedReader(gzip.open(LINKS_FILE, 'r')):
-    [source_page_id, target_page_title] = line.rstrip('\n').split('\t')
+    [source_page_id, target_page_title] = line.decode('UTF-8').rstrip('\n').split('\t')
 
     source_page_exists = source_page_id in ALL_PAGE_IDS
 
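The rest of this file's diff is collapsed above. As a rough sketch of the replacement step the comments describe (titles to IDs, redirects applied, nonexistent pages dropped) — an assumption about the general pattern, not the script's exact code:

# Sketch only: resolve one link line; helper name and exact rules are illustrative.
def resolve_link(source_page_id, target_page_title,
                 all_page_ids, page_titles_to_ids, redirects):
    if source_page_id not in all_page_ids:
        return None  # Source page does not exist.
    target_page_id = page_titles_to_ids.get(target_page_title)
    if target_page_id is None:
        return None  # Target title does not exist.
    # If the target is itself a redirect, follow it to its target.
    return (source_page_id, redirects.get(target_page_id, target_page_id))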
15 changes: 7 additions & 8 deletions scripts/replace_titles_in_redirects_file.py
@@ -4,17 +4,16 @@
 Output is written to stdout.
 """
 
-from __future__ import print_function
+
 
 import io
 import sys
 import gzip
-from sets import Set
 
 # Validate input arguments.
 if len(sys.argv) < 3:
     print('[ERROR] Not enough arguments provided!')
-    print('[INFO] Usage: {0} <pages_file> <redirects_file>'.format(sys.argv[0]))
+    print(('[INFO] Usage: {0} <pages_file> <redirects_file>'.format(sys.argv[0])))
     sys.exit()
 
 PAGES_FILE = sys.argv[1]
@@ -29,18 +28,18 @@
     sys.exit()
 
 # Create a set of all page IDs and a dictionary of page titles to their corresponding IDs.
-ALL_PAGE_IDS = Set()
+ALL_PAGE_IDS = set()
 PAGE_TITLES_TO_IDS = {}
 for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-    [page_id, page_title, _] = line.rstrip('\n').split('\t')
+    [page_id, page_title, _] = line.decode('UTF-8').rstrip('\n').split('\t')
     ALL_PAGE_IDS.add(page_id)
     PAGE_TITLES_TO_IDS[page_title] = page_id
 
 # Create a dictionary of redirects, replace page titles in the redirects file with their
 # corresponding IDs and ignoring pages which do not exist.
 REDIRECTS = {}
 for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-    [source_page_id, target_page_title] = line.rstrip('\n').split('\t')
+    [source_page_id, target_page_title] = line.decode('UTF-8').rstrip('\n').split('\t')
 
     source_page_exists = source_page_id in ALL_PAGE_IDS
     target_page_id = PAGE_TITLES_TO_IDS.get(target_page_title)
@@ -50,7 +49,7 @@
 
 # Loop through the redirects dictionary and remove redirects which redirect to another redirect,
 # writing the remaining redirects to stdout.
-for source_page_id, target_page_id in REDIRECTS.iteritems():
+for source_page_id, target_page_id in list(REDIRECTS.items()):
     start_target_page_id = target_page_id
 
     redirected_count = 0
@@ -65,4 +64,4 @@
             target_page_id = None
 
     if target_page_id is not None:
-        print('\t'.join([source_page_id, target_page_id]))
+        print(('\t'.join([source_page_id, target_page_id])))
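The loop above walks chains of redirects with a redirected_count guard, most of which is collapsed in this view. A rough sketch of that pattern with a hop limit so circular redirects terminate — an assumption about the shape of the hidden lines, not the script's exact code:

# Sketch only: follow a redirect chain, giving up on cycles or very long chains.
def follow_redirects(redirects, target_page_id, max_hops=100):
    redirected_count = 0
    while target_page_id in redirects:
        target_page_id = redirects[target_page_id]
        redirected_count += 1
        if redirected_count > max_hops:
            return None
    return target_page_id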