In [2]:
import sys
import os
from time import time

DATAPATH = '/Users/balthasar/Google Drive/MTL_data/wsj/'
sys.path.append(DATAPATH)
sys.path.append('../')

from src.util import count_lines_many, data_from_many, unpack, bunch_paragraphs

datafiles = [DATAPATH + file for file in os.listdir(DATAPATH)]
print("Number of entries: %i" % (count_lines_many(datafiles)))

Number of entries: 46726


In [3]:
import re 
from bs4 import BeautifulSoup

data = data_from_many(datafiles)
url, html = unpack(next(data))

In [61]:
def get_paragraphs(html):
    soup = BeautifulSoup(html,'html.parser')
    paragraphs = [tag.get_text() for tag in 
        soup.findAll('title')+soup.findAll("h1",'wsj-article-headline')+soup.findAll("h2",'sub-head')+soup.findAll("p")]
    if len(paragraphs) != 0:
        return paragraphs
    else:
        return 'NOCONTENT'
    
label_re = re.compile('(?<=<meta name="article.type" content=").*(?=" />)')
def get_label(html):
    label = label_re.search(html)
    if label:
        return '/'.join(label.group().split(' - '))
    else:
        return "NOLABEL"
     

def preprocess(item):
    url, html = unpack(item)
    label = get_label(html)
    paragraphs = get_paragraphs(html)
    return label, paragraphs

In [63]:
# Class count -- count only entries that have easily identifiable content and label
from collections import Counter
from multiprocessing import Pool

classes = Counter()
subclasses = Counter()

i,j = (0,0)
word_count = 0
paragraph_count = 0

def analyze_item_one(item):
    global i,j,word_count,paragraph_count,classes,subclasses
    i+=1
    label,paragraphs = item
    if label != 'NOLABEL' and paragraphs != 'NOCONTENT':
        j+=1
        class_ = label.split('/')[0]
        subclass_ = label
        
        paragraph_count += len(paragraphs)
        for paragraph in paragraphs:
            word_count += len(paragraph.split(' '))

        classes.update([class_])
        subclasses.update([subclass_])
        
        print("[%i/%i] Word Count: %i Paragraph Count: %i Top-10 classes: %s" % (j,i,word_count,paragraph_count,str(classes.most_common(10))),end='\r')

def analyze_item_many(data):
    list(map(analyze_item_one,data))

data = map(preprocess,data_from_many(datafiles))
analyze_item_many(data)        
with open('../summaries/wsj.txt','w') as file:
    file.write('WSJ\n')
    file.write(50*"#")
    file.write('\nWORD COUNT: %i\n' % word_count)
    file.write('PARAGRAPH COUNT: %i\n' % paragraph_count)
    file.write('CLASSES: \n')
    for item in list(classes.items()):
        file.write('%s\t%i\n' % item)
    file.write('\n\n')
    file.write('SUBCLASSES: \n')
    for item in list(subclasses.items()):
        file.write('%s\t%i\n' % item)
    file.write('\n\n')

[45346/46726] Word Count: 22914862 Paragraph Count: 777932 Top-10 classes: [('Business', 4532), ('Politics and Policy', 4156), ('Technology', 3390), ('Markets Main', 3275), ('Heard on the Street', 2169), ('World News', 2096), ('U.S. News', 2074), ('Middle East News', 1267), ('Asia News', 1196), ('Europe News', 1169)]