### process_data_raw_text_to_DB

 Extract the date, title, author and text from each web page scraped by scrape_american_presidency_raw_text.ipynb and insert each resulting record into MongoDB.

In [1]:
import pymongo
from pymongo import MongoClient
import datetime
import pickle

import re
from dateutil import parser


#### connect to database; start a collection to store records with author, title, date and raw text as fields.

In [2]:
# Open a MongoDB client; no host/port is given, so the driver's default
# connection settings are used.
client = MongoClient()

In [3]:
# Database handle for the database named "test_database".
db = client.test_database

In [None]:
# Collection that will hold one record per parsed page
# (fields: author, title, date, text).
presidency_docs = db.presidency_docs

In [7]:
def process_docs(docs):
    """Extract author, title, date and text from scraped web pages.

    Each page is cut at the first 'Online by', has single-line bracketed and
    parenthesised spans (e.g. '[laughter]', '(applause)') replaced by a
    space, and is then split on the citation marker. The part before the
    marker, truncated at the first 'Note:' / 'NOTE:', becomes the text; the
    part after it is expected to look like ``author: "title," date``.

    Args:
        docs: iterable of page texts (strings).

    Returns:
        A tuple ``(mongo_db_records, error_ids)``:
        mongo_db_records is a list of dicts with the keys "author", "title",
        "date" (the value returned by ``parser.parse``) and "text";
        error_ids is a list of the positions in ``docs`` of the pages that
        could not be parsed.
    """
    # Compiled once, outside the loop. '.' does not match newlines here, so
    # only spans that open and close on the same line are removed.
    bracketed = re.compile(r'\[.*?\]')
    parenthesised = re.compile(r'\(.*?\)')

    mongo_db_records = []

    # a very small fraction of pages have a different format:
    # author, title, date are not included in <span class="displaytext">
    # but rather, each is in a separate <span class="ver10">
    # that follows <span class="displaytext">
    # mark those pages and process them differently later
    error_ids = []

    for i, content in enumerate(docs):
        text = content.split('Online by')[0]

        # get rid of '[laughter]', '[applause]', '(applause)', etc.
        text = bracketed.sub(' ', text)
        text = parenthesised.sub(' ', text)

        # the citation marker separates the body from the citation line
        parts = text.split('\n\tCitation:\xa0')
        if len(parts) < 2:
            # no citation line: this page uses the other layout
            error_ids.append(i)
            continue

        # get rid of footnotes
        doc = parts[0]
        for marker in ('Note:', 'NOTE:'):
            doc = doc.split(marker)[0]

        try:
            # citation pieces around the double quotes: author, title, date
            pieces = parts[1].split('"')
            name = pieces[0].strip().replace(':', '')
            title = pieces[1].strip().replace(',', '')
            date = parser.parse(pieces[2].strip().replace('.', ''))
        except Exception:
            # Best effort: any malformed citation or unparsable date marks
            # the page as an error instead of aborting the whole batch.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            error_ids.append(i)
            continue

        mongo_db_records.append(
            {"author": name, "title": title, "date": date, "text": doc}
        )

    return mongo_db_records, error_ids

#### Now process all 130K raw-text pages and save the page ids of the ones that fail

In [22]:
def process_all_raw_text_filenames(data_dir='/home/ubuntu/proj4/data'):
    """Parse every pickled chunk of raw pages and insert the records into MongoDB.

    For each chunk file ``presidency_docs_<first>_to_<last>.pkl`` in
    ``data_dir``, the pickled list of pages is loaded, parsed with
    ``process_docs`` and the resulting records are inserted into the
    module-level ``presidency_docs`` collection. A progress line is printed
    after each chunk.

    Args:
        data_dir: directory that holds the chunk files (no trailing slash).
            Defaults to the location originally hard-coded in this notebook.

    Returns:
        List of page ids (position within the chunk plus the chunk's first
        page id) for the pages that ``process_docs`` could not parse.
    """
    # Half-open [first, end) page-id intervals: 1000-page chunks, then a
    # final shorter chunk.
    # NOTE(review): the regular chunks stop at page 128000 and the last chunk
    # starts at 129001, so a file for pages 128001-129000 is never loaded
    # here -- confirm whether that gap is intentional.
    intervals = [(first, first + 1000) for first in range(1, 128000, 1000)]
    intervals.append((129001, 129484))

    all_err_ids = []

    for first, end in intervals:
        filename = (data_dir + '/presidency_docs_' + str(first)
                    + '_to_' + str(end - 1) + '.pkl')
        # pickle can run arbitrary code while loading; only point data_dir at
        # files you produced yourself.
        with open(filename, 'rb') as fp:
            docs = pickle.load(fp)

        # err_ids are the positions of error pages in the current doc list
        records, err_ids = process_docs(docs)
        # shift by the chunk's first page id to get the ids the web site uses
        all_err_ids.extend(err_id + first for err_id in err_ids)

        # insert_many rejects an empty document list, so skip chunks in which
        # no page could be parsed
        if records:
            presidency_docs.insert_many(records)

        print('inserted docs ', first, ' to ', end - 1)

    return all_err_ids

In [23]:
err_page_ids = process_all_raw_text_filenames()

<pymongo.results.InsertManyResult object at 0x7f6d5c10b3c8>
inserted docs  1  to  1000
<pymongo.results.InsertManyResult object at 0x7f6d54544288>
inserted docs  1001  to  2000
<pymongo.results.InsertManyResult object at 0x7f6d544924c8>
inserted docs  2001  to  3000
<pymongo.results.InsertManyResult object at 0x7f6d5455efc8>
inserted docs  3001  to  4000
<pymongo.results.InsertManyResult object at 0x7f6d54492788>
inserted docs  4001  to  5000
<pymongo.results.InsertManyResult object at 0x7f6d5455efc8>
inserted docs  5001  to  6000
<pymongo.results.InsertManyResult object at 0x7f6d544a8b48>
inserted docs  6001  to  7000
<pymongo.results.InsertManyResult object at 0x7f6d54536288>
inserted docs  7001  to  8000
<pymongo.results.InsertManyResult object at 0x7f6d544dad88>
inserted docs  8001  to  9000
<pymongo.results.InsertManyResult object at 0x7f6d54544808>
inserted docs  9001  to  10000
<pymongo.results.InsertManyResult object at 0x7f6d683c0748>
inserted docs  10001  to  11000
<pymongo.r

<pymongo.results.InsertManyResult object at 0x7f6d545d7dc8>
inserted docs  90001  to  91000
<pymongo.results.InsertManyResult object at 0x7f6d5e01b848>
inserted docs  91001  to  92000
<pymongo.results.InsertManyResult object at 0x7f6d54469548>
inserted docs  92001  to  93000
<pymongo.results.InsertManyResult object at 0x7f6d5c10b988>
inserted docs  93001  to  94000
<pymongo.results.InsertManyResult object at 0x7f6d54536508>
inserted docs  94001  to  95000
<pymongo.results.InsertManyResult object at 0x7f6d5e01b848>
inserted docs  95001  to  96000
<pymongo.results.InsertManyResult object at 0x7f6d54545408>
inserted docs  96001  to  97000
<pymongo.results.InsertManyResult object at 0x7f6d5e01b848>
inserted docs  97001  to  98000
<pymongo.results.InsertManyResult object at 0x7f6d5c794948>
inserted docs  98001  to  99000
<pymongo.results.InsertManyResult object at 0x7f6d6184b208>
inserted docs  99001  to  100000
<pymongo.results.InsertManyResult object at 0x7f6d5452b3c8>
inserted docs  1000