In [1]:
import json
import pprint
import re
import string
from urllib.parse import quote_plus

import cnfg
import nltk
import numpy as np
import pandas as pd
import requests
from IPython.display import clear_output
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from pymongo import MongoClient

In [2]:
# Read the MongoDB credentials from a local cnfg file so they are not hardcoded here.
config = cnfg.load(".katie_mongo.cnfg")
mongouser, mongopwd = config['user'], config['pwd']

In [3]:
### MONGO initialization
# Percent-escape the credentials before embedding them in the URI: reserved
# characters such as ':', '/', '@' or '%' in the user name or password would
# otherwise be parsed as URI delimiters.
# NOTE(review): assumes the values in the cnfg file are stored unescaped — confirm.
client = MongoClient(
    "mongodb://{}:{}@localhost/test".format(quote_plus(mongouser), quote_plus(mongopwd))
)
db = client.test  # database
table = db.nyttest  # "table", also called a collection

In [9]:
def custom_tokenizer(text):
    """Tokenize, lemmatize and stop-word-filter ``text``.

    The text is split with NLTK's ``word_tokenize``, every token is lemmatized
    with ``WordNetLemmatizer``, and lemmas found in NLTK's English stop-word
    list are dropped.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces (one long string rather
        than a list of strings).
    """
    tokens = word_tokenize(text)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    # Use a set for the stop words: membership is checked once per token, and
    # scanning the raw list each time is needlessly slow on long articles.
    stop_words = set(stopwords.words('english'))
    kept = [lemma for lemma in lemmas if lemma not in stop_words]

    return " ".join(kept)

def preprocess_text(text):
    """Strip punctuation and digits from ``text`` and lower-case it.

    Removes every character in ``string.punctuation`` and ``string.digits``
    plus the four curly quotation marks, then trims surrounding whitespace and
    converts the result to lower case. Removed characters are deleted, not
    replaced by spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Curly quotes are not part of string.punctuation, so list all four
    # explicitly (the opening single quote used to be left behind).
    unwanted = string.punctuation + string.digits + '“”‘’'
    cleaned = text.translate(str.maketrans('', '', unwanted))

    # Trim last, so whitespace exposed by removing leading/trailing digits or
    # punctuation is trimmed as well.
    return cleaned.strip().lower()

In [10]:
def preprocess_me_capn():
    """Fill in ``processed_text`` for documents that do not have it yet.

    Selects every document in ``table`` that has an ``original_text`` field but
    no ``processed_text`` field, runs ``preprocess_text`` on the original text
    and stores the result under ``processed_text`` with ``update_one``.

    Processing is best-effort: a document that fails is skipped and the loop
    carries on, and the skipped documents are reported once the loop finishes.
    Returns nothing.
    """
    pending = table.find({'$and':[{'original_text':{'$exists':True}},{'processed_text':{'$exists':False}}]})
    skipped = []
    for item in pending:
        try:
            processed_text = preprocess_text(item['original_text'])
            table.update_one({'_id':item['_id']}, {'$set': {'processed_text': processed_text} }, upsert=False)
        except Exception as err:
            # ``Exception`` instead of a bare ``except`` so that interrupting
            # the kernel still stops the loop, and failures are recorded
            # instead of disappearing silently.
            skipped.append((item.get('_id'), err))

    if skipped:
        print("preprocess_me_capn: skipped {} document(s)".format(len(skipped)))
        # Show only the first few to keep the cell output readable.
        for doc_id, err in skipped[:10]:
            print("  {!r}: {!r}".format(doc_id, err))

In [11]:
%time preprocess_me_capn()

CPU times: user 13.1 s, sys: 0 ns, total: 13.1 s
Wall time: 14.3 s


In [12]:
def tokenize_me_capn():
    """Store tokenized lemmas for documents that have processed text but no tokens yet.

    Every document in ``table`` with a ``processed_text`` field and without a
    ``tokenized_lemmas`` field is passed through ``custom_tokenizer``; the
    resulting string is written to ``tokenized_lemmas``. Returns nothing.
    """
    has_processed_text = {'processed_text': {'$exists': True}}
    lacks_lemmas = {'tokenized_lemmas': {'$exists': False}}

    for doc in table.find({'$and': [has_processed_text, lacks_lemmas]}):
        lemma_string = custom_tokenizer(doc['processed_text'])
        table.update_one(
            {'_id': doc['_id']},
            {'$set': {'tokenized_lemmas': lemma_string}},
            upsert=False,
        )

In [13]:
%time tokenize_me_capn()

CPU times: user 2min 1s, sys: 376 ms, total: 2min 2s
Wall time: 2min 3s


In [14]:
# Legacy step (used only in early stages): join the stemmed token list back into one long string.

In [9]:
# cursor=table.find({'tokenized_snowball':{'$exists':True}})
# for item in cursor:
#     try:
#         original=item['tokenized_snowball']
#         s=" ".join(original)
#         table.update_one({'_id':item['_id']}, {'$set': {'tokenized_snowball': s} }, upsert=False)
#     except:
#         pass

In [54]:
# The same desk appears under several spellings (e.g. both "Science Desk" and "Science"), so merge the variants.

# Canonical news-desk name -> the variant spellings that are rewritten to it.
# No variant is itself a canonical name, so the order of the updates does not matter.
DESK_ALIASES = {
    'Sports': ('Sports Desk',),
    'City Weekly': ('The City Weekly Desk', 'CityWeekly'),
    'Science': ('Science Desk', 'Science Desk;', 'SCI'),
    'Westchester Weekly': ('Weschester Weekly Desk',),  # misspelled in the data
    'Magazine': ('Magazine Desk', 'MAG'),
    'Metro': ('Metropolitan Desk', 'Metropolitan Desk;'),
    'Arts&Leisure': (
        'Arts and Leisure Desk',
        'Arts & Leisure Desk',
        'Movies, Performing Arts/Weekend Desk',
        'The Arts/Cultural Desk',
        'Arts & Ideas/Cultural Desk',
        'Arts',
        'Leisure/Weekend Desk',
        'Museums',
        'Television',
    ),
    'Editorial': ('Editorial Desk',),
    'Home Desk': ('House & Home/Style Desk',),
    'Business': (
        'Financial Desk',
        'Money and Business/Financial Desk',
        'Money & Business',
        'Business/Financial Desk',
        'Personal Investing',
        'Personal Investing Supplement Desk',
    ),
    'Book Review': ('Book Review Desk', 'BookReview'),
    'Style': ('Dining In, Dining Out/Style Desk', 'Style Desk'),
    'Week In Review': ('Week In Review DeskWeek In Review Desk', 'Week in Review Desk'),
    'Society': ('Society Desk',),
    'Culture': ('Culture Desk', 'Cultural Desk'),
    'Travel': ('Travel Desk',),
    'Real Estate': ('Real Estate Desk',),
    'Foreign': ('Foreign Desk',),
    'National': ('National Desk', 'National Desk;', 'NAT', 'National Desk National Desk'),
}


def consolidate_desks(aliases=None):
    """Rewrite variant ``news_desk`` values to one canonical name per desk.

    For every ``(canonical, variants)`` entry, each document whose
    ``news_desk`` equals one of the variants gets ``news_desk`` set to the
    canonical name via ``update_many``. Matching is exact, so values that are
    not listed are left unchanged.

    Parameters
    ----------
    aliases : dict, optional
        Mapping of canonical desk name to an iterable of variant spellings.
        Defaults to ``DESK_ALIASES``.

    Returns
    -------
    None
    """
    if aliases is None:
        aliases = DESK_ALIASES

    for canonical, variants in aliases.items():
        for variant in variants:
            table.update_many({'news_desk': variant}, {'$set': {'news_desk': canonical}})
    
    

In [55]:
%time consolidate_desks()

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 830 ms


In [5]:
# Delete outliers: failed scrapes, broken links, etc

def clean_up():
    """Delete documents that are unusable for the analysis.

    Removes, in order:

    * documents whose ``news_desk`` is ``'Column 1'``, ``'Games'`` or
      ``'Video'`` (noted in the original notebook as broken links);
    * documents with no ``original_text`` field or an empty one;
    * documents whose ``url`` starts with
      ``https://query.nytimes.com/gst/abstract.html`` — old articles that exist
      only as scans of the original next to a short description.

    All deletions run server-side through ``delete_many``. Documents without a
    ``url`` field are simply not matched by the URL filter. Returns nothing.
    """
    unusable_filters = [
        {'news_desk': 'Column 1'},
        {'news_desk': 'Games'},
        {'news_desk': 'Video'},
        {'original_text': {'$exists': False}},
        {'original_text': ''},
        # Anchored with '^' to mirror the start-of-string semantics of re.match.
        # Filtering in the database avoids pulling every full document to the
        # client just to inspect its URL.
        {'url': {'$regex': r'^https://query\.nytimes\.com/gst/abstract\.html'}},
    ]
    for unusable in unusable_filters:
        table.delete_many(unusable)

In [6]:
%time clean_up()

CPU times: user 1.01 s, sys: 560 ms, total: 1.57 s
Wall time: 2.05 s


### Fix Dates

In [7]:
def get_reduced_date(s):
    """Return the date part of ``s``, i.e. its first ten characters.

    Strings shorter than ten characters are returned unchanged.
    """
    date_part = s[0:10]
    return date_part

In [8]:
# Shorten over-long 'date' strings to their first ten characters (the date part).
cursor=table.find({'date':{'$exists':True}})
for item in cursor:
    date=item['date']
    # Only strings can be length-checked and sliced; leave any other value untouched.
    if not isinstance(date, str):
        continue
    # NOTE(review): the threshold is 11 although the slice keeps 10 characters,
    # so 11-character values are left as they are — confirm this is intended.
    if len(date)>11:
        try:
            new_d=get_reduced_date(date)
            table.update_one({'_id':item['_id']}, {'$set': {'date': new_d} }, upsert=False)
        except Exception as err:
            # Stay best-effort, but report the failure instead of hiding it;
            # kernel interrupts are no longer swallowed.
            print("date fix skipped {!r}: {!r}".format(item['_id'], err))

## Planning
Blog articles contain unusual characters that may need special handling.