In [1]:
import functools
import pickle as pk
import time

import numpy as np
import pandas as pd
from pymongo import MongoClient

In [2]:
# timing function
def timefunc(f):
    """Decorator that prints how long each call to ``f`` takes.

    Parameters
    ----------
    f : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``f``, prints ``<name> took <elapsed> seconds`` and returns ``f``'s
        result. Nothing is printed when ``f`` raises; the exception simply
        propagates.
    """
    @functools.wraps(f)  # keep f's __name__ / __doc__ on the wrapper
    def f_timer(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        # A single pre-formatted string prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print('%s took %s seconds' % (f.__name__, end - start))
        return result
    return f_timer

## Extracting data from tarfile

In [2]:
import tarfile

In [None]:
# Do not rerun
# NOTE: extractall() trusts the member paths stored inside the archive; only
# run it on archives that come from a trusted source.
with tarfile.open("data/scrapes.tar.gz", 'r:gz') as tfile:
    # the with-block closes the archive handle once extraction is done
    tfile.extractall('data/')

In [14]:
# Do not rerun
# NOTE: extractall() trusts the member paths stored inside the archive; only
# run it on archives that come from a trusted source.
with tarfile.open("data/urls.tar.gz", 'r:gz') as tfile:
    # Extract into the relative data/ directory (this was the absolute
    # '/data'), which is where the cells below read master5085.pkl from.
    # The with-block closes the archive handle once extraction is done.
    tfile.extractall('data/')

## Test reads of data

In [3]:
# Load the master pickle into `urls` (displayed in the next cell).
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('data/master5085.pkl', 'rb') as f:
    urls = pk.load(f)

In [4]:
# Display the loaded mapping: petition id (string) -> change.org petition URL
urls

{u'7524467': u'https://www.change.org/p/derek-rogers-move-workcamp-2017-to-erie-pa',
 u'8267192': u'https://www.change.org/p/in-n-out-burgers-shut-up-and-eat-somewhere-else',
 u'4794534': u'https://www.change.org/p/gladys-baisa-upcountry-rep-maui-county-council-block-cell-tower-placement-at-pukalani-golf-club',
 u'6910928': u'https://www.change.org/p/olympia-entertainment-change-name-of-largely-taxpayer-funded-little-ceasers-arena',
 u'8267195': u'https://www.change.org/p/espn-to-give-jose-fernandez-the-2016-cy-young-award',
 u'7220372': u'https://www.change.org/p/cthss-human-resources-keep-mrs-luster-at-vinal',
 u'6411353': u'https://www.change.org/p/the-us-nuclear-regulatory-commission-do-not-relicense-the-aging-diablo-canyon-nuclear-power-plant',
 u'8542361': u'https://www.change.org/p/bill-o-brien-get-the-houston-texans-coaching-staff-to-start-tom-savage',
 u'6411350': u'https://www.change.org/p/san-jose-city-council-limit-homelessness',
 u'8391371': u'https://www.change.org/p/texa

In [5]:
# Load one batch of scraped petition pages into `urls1000_save` (displayed in the next cell).
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('data/urls1000_save.pkl', 'rb') as f:
    urls1000_save = pk.load(f)

In [6]:
# Display the batch: petition id -> dict of scraped fields (goal, petition_title, petitioner, story, ...)
urls1000_save

{u'1014650': {'goal': u'35000',
  'other_decision_makers': [],
  'petition_title': u'Stop the Lake Okeechobee discharges into the St Lucie River and Estuaries',
  'petitioner': u'River of Light',
  'petitioner_location': '',
  'petitioner_type': 'organization',
  'primary_decision_maker': u'Rick Scott',
  'story': [u'Since we started this petition a lot has happened. So like everything else in this movement that started off to save our Indian River Lagoon and morphed into saving Caloosahatchee River, saving the Everglades, saving the water of south florida , saving the florida bay, saving south florida and stopping the discharges of millions of gallons of fresh water out to tide. ',
   u"Sending water south will not only help us stop the discharges to us and Caloosahatchee River but it is needed to recharge the aquifers for people's drinking water and to help stop salt water intrusion and sea level rise. Plus that who wastes that much water? This is your water being wasted.",
   u'Acco

In [7]:
# Load the comments scraped for the same batch into `urls1000_comments` (displayed in the next cell).
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('data/urls1000_comments.pkl', 'rb') as f:
    urls1000_comments = pk.load(f)

In [8]:
# Display the comments: petition id -> {comment id -> comment fields
# (comment_date, comment_hearts, comment_text, commenter_location, commenter_name)}
urls1000_comments

{'1014650': {'19398223': {'comment_date': u'3 years ago',
   'comment_hearts': u'7',
   'comment_text': u"As an avid fisherman, diver and lifetime Floridian, I've watched the saltwater discharges ruin this valuable estuary once and it should not be allowed to happen again. The destruction of one of South Florida's vital nurseries and economic bolster points stands in the balance. Stop the discharges NOW!!",
   'commenter_location': u'Port St Lucie, FL',
   'commenter_name': u"Daniel O'Shea"},
  '19398827': {'comment_date': u'3 years ago',
   'comment_hearts': u'21',
   'comment_text': u"it should'nt matter how important it is to us... its illegal and should be dealt with properly, this is a dumb optional question.",
   'commenter_location': u'port saint lucie, FL',
   'commenter_name': u'matt garrity'},
  '19420676': {'comment_date': u'3 years ago',
   'comment_hearts': u'5',
   'comment_text': u'really!!! With rising population and record heat water is worth more than gold and we just

## Getting data into MongoDB

###  MongoDB workflow
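For each petition:
* insert it as a document, using the petition id as the `_id` field

For each petition's group of comments:
* check whether a document with that petition id exists
* if it exists, set its `comments` field to the list of comments, each carrying its comment id as `comment_id`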

In [9]:
# Connect to MongoDB; no arguments are passed, so PyMongo's default connection settings apply
client = MongoClient()

In [10]:
# seeing what databases exist
# NOTE(review): database_names() is deprecated in newer PyMongo releases in favor of
# list_database_names() -- confirm against the installed PyMongo version before upgrading
client.database_names()

[u'my_db', u'local', u'admin']

In [11]:
# Handles used by the cells below: the `petitions_db` database and its `petitions` collection
db = client.petitions_db
coll = db.petitions

## Inserting all petition pages into Mongo

In [12]:
def create_entries(fname, collection, victory=False):
    """Upsert every petition stored in a pickle file into a Mongo collection.

    Parameters
    ----------
    fname : str
        Name of the pickle file inside ``data/``. It must unpickle to a dict
        mapping petition id -> dict of petition fields.
    collection : object with an ``update_one(filter, update, upsert=...)`` method
        Target collection; each petition is upserted under ``_id`` = petition id.
    victory : bool, optional
        Stored on every document as ``victory`` = 1 when truthy, else 0.

    Notes
    -----
    ``supporters`` and ``goal`` are converted to ``int`` independently of each
    other; a field that is missing or not convertible is reported and left
    as-is, and the petition is still written. All fields, including
    ``victory``, go to Mongo in a single upsert per petition.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('data/' + fname, 'rb') as f:
        entries = pk.load(f)

    victory_flag = 1 if victory else 0

    # create an entry for each item in pickle
    for i, entry in entries.items():
        fields = dict(entry)  # work on a copy; leave the unpickled data untouched

        # integerize supporters and goals, one field at a time so that a bad
        # value in one does not prevent converting the other
        for key in ('supporters', 'goal'):
            try:
                fields[key] = int(fields[key])
            except (KeyError, TypeError, ValueError):
                print("error with %s (%s)" % (i, key))

        # add victory trigger
        fields['victory'] = victory_flag

        # insert into mongodb using update/upsert (which checks for existence before adding)
        collection.update_one({"_id": i}, {"$set": fields}, upsert=True)

In [13]:
# create all petition entries
@timefunc
def create_all_entries(collection, victory=False):
    """Load every batch of petition pickles into ``collection``.

    With ``victory`` falsy the files ``urls1000_save.pkl`` ...
    ``urls49000_save.pkl`` are loaded; otherwise ``victories1000_save.pkl``
    ... ``victories4000_save.pkl``. Each file is handed to ``create_entries``
    together with the ``victory`` flag.
    """
    # pick the file-name prefix and the exclusive upper bound of batch numbers
    prefix, stop = ('victories', 5) if victory else ('urls', 50)
    for batch in range(1, stop):
        create_entries(prefix + str(batch * 1000) + '_save.pkl', collection, victory)

In [14]:
# returns approximate age of comments
def return_age(date):
    """Convert a relative date such as ``u'3 years ago'`` to an approximate age in days.

    Parameters
    ----------
    date : str
        Relative date text whose first whitespace-separated token is a count
        (or ``a`` / ``an``), followed by a unit containing ``year``,
        ``month``, ``week``, ``day`` or ``hour``.

    Returns
    -------
    int or None
        * years:  ``n * 365 + 180`` (n years plus six 30-day months)
        * months: ``n * 30 + 15``
        * weeks:  ``n * 7 + 4``
        * days:   ``n``
        * hours:  ``1``
        * ``None`` when no unit is recognised or the count cannot be parsed.
    """
    # (unit keyword, days per unit, offset that lands mid-way through the unit)
    # The year row previously multiplied the count by 12*365 + 6*365 = 6570
    # days, i.e. 18 years per year.
    units = (
        ("year", 365, 6 * 30),
        ("month", 30, 15),
        ("week", 7, 4),
        ("day", 1, 0),
    )
    for keyword, days_per_unit, offset in units:
        if keyword in date:
            first = date.split()[0]
            if first.lower() in ("a", "an"):
                # "a year ago" / "an ... ago" style phrasing means one unit
                count = 1
            else:
                try:
                    count = int(first)
                except ValueError:
                    return None
            return count * days_per_unit + offset
    if "hour" in date:
        # anything younger than a day counts as one day
        return 1
    return None

In [15]:
# update entries with comments
def set_comments_to_entries(fname, collection):
    """Attach scraped comments to petitions that already exist in ``collection``.

    Parameters
    ----------
    fname : str
        Name of the pickle file inside ``data/``. It must unpickle to a dict
        mapping petition id -> {comment id -> dict of comment fields}.
    collection : object with an ``update_one(filter, update)`` method
        Target collection. Updates are issued without upsert, so petitions
        that are not in the collection are not created.

    Returns
    -------
    list
        Ids of the petitions whose comments could not be stored.

    Notes
    -----
    Each comment is stored with its id under ``comment_id``, its
    ``comment_hearts`` converted to ``int`` and an extra ``comment_age``
    computed by ``return_age`` from ``comment_date``. A failure for one
    petition is reported, with its cause, and the loop moves on to the next.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('data/' + fname, 'rb') as f:
        comments_group = pk.load(f)

    failures = []
    for i, group in comments_group.items():
        try:
            comment_docs = []
            for j, comment in group.items():
                comment_doc = {"comment_id": j}
                comment_doc.update(comment)

                # integerize hearts
                comment_doc['comment_hearts'] = int(comment['comment_hearts'])

                # create approximate age entry
                comment_doc['comment_age'] = return_age(comment['comment_date'])
                comment_docs.append(comment_doc)
            # upsert = False; do not create entry if petition doesn't exist
            collection.update_one({"_id": i}, {"$set": {"comments": comment_docs}})
        except Exception as err:
            # Best-effort per petition: report why it failed and keep going.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            print("no comments for %s (%s: %s)" % (i, type(err).__name__, err))
            failures.append(i)
    print("added comments for %s" % (fname,))
    return failures

In [16]:
# create all petition comment entries
@timefunc
def create_all_comment_entries(collection, victory=False):
    """Attach every batch of comment pickles to the petitions in ``collection``.

    With ``victory`` falsy the files ``urls1000_comments.pkl`` ...
    ``urls49000_comments.pkl`` are processed; otherwise
    ``victories1000_comments.pkl`` ... ``victories4000_comments.pkl``. Each
    file is handed to ``set_comments_to_entries``.
    """
    # pick the file-name prefix and the exclusive upper bound of batch numbers
    prefix, stop = ('victories', 5) if victory else ('urls', 50)
    for batch in range(1, stop):
        set_comments_to_entries(prefix + str(batch * 1000) + '_comments.pkl', collection)

In [17]:
# update entries with URLs
@timefunc
def set_urls_to_entries(collection, filename): # can use either master5085.pkl or victories_final.pkl
    """Store each petition's URL on its existing document.

    Parameters
    ----------
    collection : object with an ``update_one(filter, update)`` method
        Target collection.
    filename : str
        Path of a pickle that unpickles to a dict mapping petition id -> URL.
    """
    # Pickles are binary data: open in 'rb' like the other loaders in this notebook.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as f:
        urls = pk.load(f)

    for i, url in urls.items():
        # insert into mongodb using update (upsert = false), which will only add urls to entries that exist
        collection.update_one({"_id": i}, {"$set": {"url": url}})

## Updating Mongo Database

In [19]:
# do not need to rerun
# Load the non-victory petition batches (urls*_save.pkl) into the collection
create_all_entries(coll, victory=False)

create_all_entries took 30.7234139442 seconds


In [20]:
# do not need to rerun
# Load the victory petition batches (victories*_save.pkl) into the collection
create_all_entries(coll, victory=True)

create_all_entries took 3.17572689056 seconds


In [21]:
# checking for victory stats: number of documents with victory=0, then with victory=1
# NOTE(review): Cursor.count() is deprecated in newer PyMongo releases (count_documents()
# replaces it) -- confirm against the installed PyMongo version before upgrading
print coll.find({"victory": 0}).count()
print coll.find({"victory": 1}).count()

48768
3624


In [22]:
# Checking for number of entries in database
# (the output below equals the sum of the two victory counts above: 48768 + 3624 = 52392)
coll.count()

52392

In [23]:
# Attach comments from the non-victory batches (urls*_comments.pkl) to the existing petitions
create_all_comment_entries(coll, victory=False)

added comments for urls1000_comments.pkl
no comments for 5596146
added comments for urls2000_comments.pkl
no comments for 8066984
added comments for urls3000_comments.pkl
added comments for urls4000_comments.pkl
added comments for urls5000_comments.pkl
no comments for 5832758
added comments for urls6000_comments.pkl
no comments for 8096570
added comments for urls7000_comments.pkl
added comments for urls8000_comments.pkl
no comments for 5254286
added comments for urls9000_comments.pkl
no comments for 7888418
added comments for urls10000_comments.pkl
added comments for urls11000_comments.pkl
no comments for 6361703
added comments for urls12000_comments.pkl
no comments for 5617082
added comments for urls13000_comments.pkl
added comments for urls14000_comments.pkl
no comments for 6598310
added comments for urls15000_comments.pkl
added comments for urls16000_comments.pkl
added comments for urls17000_comments.pkl
no comments for 6619259
added comments for urls18000_comments.pkl
added comment

In [24]:
# Attach comments from the victory batches (victories*_comments.pkl) to the existing petitions
create_all_comment_entries(coll, victory=True)

added comments for victories1000_comments.pkl
added comments for victories2000_comments.pkl
added comments for victories3000_comments.pkl
added comments for victories4000_comments.pkl
create_all_comment_entries took 2.94592094421 seconds


In [25]:
# Add the petition URL from the master pickle to every petition that exists in the collection
set_urls_to_entries(coll, 'data/master5085.pkl')

set_urls_to_entries took 15.9628379345 seconds


In [26]:
# checking status of db: run the collstats command for the petitions collection
db.command({'collstats': 'petitions'})

{u'avgObjSize': 6600,
 u'count': 52392,
 u'indexSizes': {u'_id_': 2101232},
 u'lastExtentSize': 124993536,
 u'nindexes': 1,
 u'ns': u'petitions_db.petitions',
 u'numExtents': 15,
 u'ok': 1.0,
 u'paddingFactor': 1.0030000000000001,
 u'size': 345822576,
 u'storageSize': 460894208,
 u'systemFlags': 1,
 u'totalIndexSize': 2101232,
 u'userFlags': 1}

In [28]:
# Re-check the document count after adding comments and URLs; those updates do not
# upsert, and the count shown below is the same 52392 as before
coll.count()

52392