In [2]:
import os
import sys
from bs4 import BeautifulSoup
import pymongo
import datetime
import urllib2
import json

In [3]:
def get_full_text(url):
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(url)
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        story_contet = soup.find_all('p', { "class" : "story-content" })
        return ' '.join([s.string for s in story_contet if s.string])
    except:
        return None

In [4]:
def article_to_doc(article):
    doc = {}
    doc['_id'] = article['_id']
    full_text = get_full_text(article['web_url'])
    if not full_text:
        print article['web_url']
        return None
    doc['full_text'] = full_text
    if article['abstract'] is not None:
        doc['abstract'] = article['abstract'].encode("utf8")
    doc['headline'] = article['headline']['main'].encode("utf8")
    doc['desk'] = article['news_desk']
    doc['date'] = article['pub_date'] # cutting time of day.
    try:
        date_object = datetime.datetime.strptime(article['pub_date'], '%Y-%m-%dT%H:%M:%SZ')
        doc["ISODate"] = date_object
    except ValueError as e:
        try:
            date_object = datetime.datetime.strptime(article['pub_date'], '%Y-%m-%dT%H:%M:%S+0000')
            doc["ISODate"] = date_object
        except:
            print "no ISO date id: %d"%doc["_id"]
    doc['section'] = article['section_name']
    doc['subsection'] = article['subsection_name']
    if article['snippet'] is not None:
        doc['snippet'] = article['snippet'].encode("utf8")
    doc['source'] = article['source']
    doc['types_of_material'] = article['type_of_material']
    doc['url'] = article['web_url']
    doc['word_count'] = article['word_count']
    # locations
    locations = []
    for x in range(0,len(article['keywords'])):
        if 'glocations' in article['keywords'][x]['name']:
            locations.append(article['keywords'][x]['value'])
    doc['locations'] = locations
    # subject
    subjects = []
    for x in range(0,len(article['keywords'])):
        if 'subject' in article['keywords'][x]['name']:
            subjects.append(article['keywords'][x]['value'])
    doc['descriptors'] = subjects   
    return doc

In [5]:
docs = []

In [None]:
articles = []
with open ('articles.json', 'r') as f:
    articles = json.load(f)

In [9]:
client = pymongo.MongoClient('mongodb://um.media.mit.edu:27017/super-glue')
db = client.nyt_corpus
articles_collection = db.test_articles

In [8]:
added = False
for i, article in enumerate(articles):
    if articles_collection.find({"_id": article["_id"]}).count()==0:
        if not added:
            print "starting to add!"
            added = True
        doc = article_to_doc(article)
        if doc:
            articles_collection.insert_one(doc)
            docs.append(doc)
        else:
            print i
        

starting to add!
http://www.nytimes.com/1999/01/03/travel/deep-in-africa-a-most-civilized-discovery.html
2306
http://www.nytimes.com/1995/04/30/nyregion/where-training-awaits-students-in-need.html
2308
http://www.nytimes.com/1993/08/29/nyregion/when-wooded-trail-and-stream-beckon.html
2309
http://www.nytimes.com/1993/05/23/nyregion/yale-student-immortalizes-his-friends-from-the-soup-kitchen.html
2310
http://www.nytimes.com/1993/04/18/nyregion/when-baby-makes-a-lot-more-than-3.html
2311
http://www.nytimes.com/1992/09/27/nyregion/young-artists-draw-on-their-resources.html
2312
http://www.nytimes.com/1991/10/20/nyregion/welcoming-japanese-companies-east-meets-west-at-bottom-line.html
2313
http://www.nytimes.com/1995/08/06/nyregion/view-voluntown-forest-clear-water-camping-pachaug-hopeville-pond.html
2314
http://www.nytimes.com/1994/05/01/nyregion/view-new-haven-public-access-tv-it-s-storer-s-money-but-independent-talent.html
2315
http://www.nytimes.com/1993/07/04/nyregion/view-new-haven-c

In [8]:
docs = []
db_articles = articles_collection.find({})
for doc in db_articles:
    docs.append(doc)

In [10]:
sorted_by_date = sorted(docs, key=lambda k: k['ISODate'])

In [11]:
sorted_by_date[2100]

{u'ISODate': datetime.datetime(2016, 12, 16, 11, 0, 1),
 u'_id': u'5853c93b7c459f2525d12393',
 u'date': u'2016-12-16T11:00:01+0000',
 u'descriptors': [u'Gifts',
  u'Families and Family Life',
  u'Christmas',
  u'Children and Childhood'],
 u'desk': u'Well',
 u'full_text': u'I\u2019ve buried my phone at the bottom of my purse so I can try to ignore the calls, emails and texts from well-meaning relatives, all of whom are asking: Where is the Christmas list? They want suggestions for what to give the kids, the more specific the better. If I say, \u201cMaybe a Lego set?\u201d they\u2019ll ask, \u201cWhich one?\u201d I think it\u2019s about longing: to be taken care of, to let someone else do at least part of the planning. I may be an adult, but some part of me still has a child\u2019s desire to wake up, starry-eyed, and find that gifts have materialized under the tree \u2014 surprises chosen with love and obtained in secret, waiting to be opened in wonder. We don\u2019t outgrow that. Then t

In [12]:
subjects = set([sub for story in docs for sub in story['descriptors']])

In [13]:
descriptors = []
desc = {}
with open('../descriptors.json') as data_file:
    desc = json.load(data_file)
    descriptors = [x['word'] for x in desc]

In [None]:
descriptors.remove("")

In [15]:
missing = [sub.lower() for sub in list(subjects) if sub.lower() not in descriptors]

In [25]:
real_missing = []
count= 0
for word in missing:
    found = False
    for desc in descriptors:
        a = word.lower().split()
        b = desc.split()
        if set(a) < set(b) or set(b) < set(a):
            print "%s  --  %s"%(word, desc)
            found=True
#     print found
    if found:
        count+=1
    else:
        real_missing.append(word)

teenagers and adolescence  --  teenagers
israeli settlements  --  israeli settlements (occupied territories)
israeli settlements  --  israeli settlements (occupiedterritories)
basketball (college)  --  basketball
world trade center (manhattan, ny)  --  world trade center
labor and jobs  --  labor
history (academic subject)  --  history
united nations framework convention on climate change  --  climate
united nations framework convention on climate change  --  united nations
memorabilia and souvenirs  --  memorabilia
memorabilia and souvenirs  --  souvenirs
greenhouse gas emissions  --  gas
citizenship and naturalization  --  citizenship
same-sex marriage, civil unions and domestic partnerships  --  partnerships
middle east and africa migrant crisis  --  middle east
bioenergy and biofuels  --  biofuels
nuclear weapons  --  rocky flats nuclear weapons plant (colo)
nuclear weapons  --  weapons
shopping centers and malls  --  shopping centers
anthropology  --  archaeology and anthropology


In [26]:
real_missing

[u'afghanistan war (2001-14)',
 u'segregation and desegregation',
 u'renminbi (currency)',
 u'birdwatching',
 u'deflation (economics)',
 u'parent-teacher associations',
 u'e-mail',
 u'smartphones',
 u'uefa europa league',
 u'living wage',
 u'no child left behind act',
 u'second avenue subway (nyc)',
 u'carjacking',
 u'shipwrecks (historic)',
 u'cycling, mountain bike',
 u'arms trade',
 u'frequent flier programs',
 u'social networking (internet)',
 u'parenting',
 u'roommates',
 u'red burgundy (wine)',
 u'offshore drilling and exploration',
 u'deferred action for childhood arrivals',
 u'typhoon haiyan (2013)',
 u'whites',
 u'paris attacks (november 2015)',
 u'sustainable living',
 u'vis-comments',
 u'organ donation',
 u'co-working',
 u'denisova hominid',
 u'medical devices',
 u'panama papers',
 u'graphic novels',
 u'mortgage-backed securities',
 u'synthetic cannabinoids',
 u'3-d devices and effects',
 u'classified information and state secrets',
 u'rohingya (ethnic group)',
 u'british po

In [28]:
with open('missing_labels.json' ,'w') as out_file:
    json.dump(real_missing, out_file)

In [27]:
print "%d / %d"%(count,len(missing))

209 / 394


In [51]:
long_desc = [d for d in descriptors if len(d)>90]

In [72]:
import numpy as np
true_1291 = np.zeros(len(descriptors))

In [74]:
for d in docs[1291]['descriptors']:
    if d.lower() in descriptors:
        true_1291[descriptors.index(d.lower())] = 1

3.0

In [77]:
with open('articles.json', 'w') as f:
    json.dump(docs, f)

TypeError: datetime.datetime(2016, 12, 31, 23, 0, 24) is not JSON serializable

In [53]:
long_desc

[u'amilies and family life?( food and agriculture organization, united nations (fao)0 far east, south and southeast asia and pa',
 u'ews and news media newsdealers and newsstands newspaper guild, the (tng)d 9\ufffd national football league (nfl)s5 new h',
 u'nited states economy united states foreign service i united states international relations c- united states merchan',
 u'olitics and government5 political and economic integration poliomyelitis polisario front (front for liberation of saguia',
 u'uspensions, dismissals and resignations sutton, percy e suzuki, zenko (prime min) 9 svoboda, ludvik (1895-1979) 080a',
 u'customs (tariff),treaties,treaties,international relations,international relations,buddhism,treaties,international relations,buddhism',
 u'politics and government,budgets and budgeting,united states politics and government,budgets and budgeting,united states politics and government,finances',
 u'teachers and school employees,united states politics and government,city co