In [1]:
## PosterPoster
### Based on forumThreads.json (created by parseFromWikiFactory.ipynb), we present posts from G+ and WikiFactory.

In [2]:
import json
from pathlib import Path

# Load the thread dump (a JSON document) into memory.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
p = Path('forumThreads.json')
t = p.read_text(encoding='utf-8')
threads = json.loads(t)

In [3]:
# Every thread head counts as one post, plus one per comment.
# .get() is used because a few threads carry no 'comments' key at all;
# indexing them directly would raise KeyError on a fresh kernel.
postCounter = sum(1 + len(thread.get('comments', [])) for thread in threads)
print(f'{postCounter} posts in {len(threads)} threads')

76238 posts in 13542 threads


## time utils: timeInt(), timeInt_to_isoformat()
Timestamps in the data come as strings in two forms (with a "T" separator and without), and have an ugly +00:00 at the end.
Since sorting on the string dates stumbles on '0', these utilities convert them to integers (seconds since 1970-01-01) with 1-second resolution, and back to ISO-8601 strings.

In [4]:
import datetime

import arrow
from pandas import DataFrame

def timeInt(s='2015-11-17T20:43:01+00:00'):
    """Convert a forum timestamp string to whole seconds since 1970-01-01 UTC.

    Both spellings found in the data are accepted:
    ``YYYY-MM-DDTHH:MM:SS...`` and ``YYYY-MM-DD HH:MM:SS...``.
    Only the first 19 characters are parsed, so anything after the seconds
    (such as a trailing ``+00:00``) is ignored and the clock time is read
    as UTC.

    Parameters
    ----------
    s : str
        Timestamp string in one of the two forms above.

    Returns
    -------
    int
        Seconds since the Unix epoch.

    Raises
    ------
    ValueError
        If the first 19 characters are not a valid date-time.
    """
    # Collapse the two spellings into one so a single format string suffices.
    stamp = s[:19].replace('T', ' ')
    parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    # Attach UTC explicitly: .timestamp() on a naive datetime would use the
    # machine's local zone.  The standard library is used here so the result
    # is always a plain int, independent of the installed arrow version.
    return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())
        
def timeInt_to_isoformat(timeInt):
    """Render seconds-since-epoch as an ISO-8601 string in UTC.

    Parameters
    ----------
    timeInt : int
        Seconds since 1970-01-01 UTC.

    Returns
    -------
    str
        For example ``'2013-07-10T20:01:06+00:00'``.
    """
    # Convert in UTC rather than the machine's local zone, so the saved
    # strings are the same wherever the notebook is run.
    moment = datetime.datetime.fromtimestamp(timeInt, tz=datetime.timezone.utc)
    return moment.isoformat()

## normalize threads, sort by timeInt.  Takes several minutes

In [5]:
from bs4 import BeautifulSoup as soup
from pandas import DataFrame

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

from pprint import pprint

# Build normalised *copies* of every thread and comment instead of editing
# `threads` in place.  That keeps this cell idempotent: re-running it always
# starts from the raw HTML bodies, so 'body_html' is never overwritten with
# text that has already been stripped.
normThreads = []
for t in threads:
    norm = dict(t)  # shallow copy; the raw thread dict is left untouched
    norm['timeInt'] = timeInt(norm['date_created'])  #DON'T CALL IT 'timestamp' or it will get converted into a pandas timestamp!
    norm['body'] = soup(norm['body']).text

    normComments = []
    for comment in norm.get('comments', []):  # there are two threads with no 'comments' key
        c = dict(comment)
        c['timeInt'] = timeInt(c['date_created'])
        c['body_html'] = c['body']           # keep the original markup
        c['body'] = soup(c['body']).text     # plain-text version
        normComments.append(c)
    norm['comments'] = normComments

    normThreads.append(norm)
    

In [6]:
from pandas import DataFrame
# One row per thread, ordered by the integer timestamp (oldest first).
df = DataFrame(normThreads).sort_values('timeInt')
df

Unnamed: 0,author,body,body_html,category,comments,date_created,members_only,timeInt,title
6785,"{'username': 'jonschull', 'avatar': 'https://w...",Welcome to the companion site to this map[http...,<p>Welcome to the companion site to this map<a...,"Legal, Documents, $$","[{'date_created': '2013-07-10T20:36:01+00:00',...",2013-07-10T20:01:06+00:00,True,1373486466,Welcome to the companion site to this map...
3363,"{'username': 'jonschull', 'avatar': 'https://w...",co-moderators wanted for each of the sub-topic...,<p>co-moderators wanted for each of the sub-to...,Announcements & Admin,[],2013-07-11T04:35:45+00:00,True,1373517345,co-moderators wanted for each of the sub-topic...
6784,"{'username': 'jonschull', 'avatar': 'https://w...",Poster explaining the vision for the e-nable n...,<p>Poster explaining the vision for the e-nabl...,"Legal, Documents, $$","[{'date_created': '2013-07-12T17:50:15+00:00',...",2013-07-11T06:40:51+00:00,True,1373524851,Poster explaining the vision for the e-nable n...
6783,"{'username': 'matanpresberg', 'avatar': 'https...","hi, my name is Matan, I work with Jon","<p>hi, my name is Matan, I work with Jon</p>",Introductions,[],2013-07-12T16:25:49+00:00,True,1373646349,"hi, my name is Matan, I work with Jon"
13511,"{'username': 'robohandsa', 'avatar': 'https://...",Robohand is on board! Thanks Jon,<p>Robohand is on board! Thanks Jon</p>,Introductions,[],2013-07-12T17:47:03+00:00,True,1373651223,Robohand is on board! Thanks Jon
3362,"{'username': 'jonschull', 'avatar': 'https://w...",Welcome! 1) Join Community 2) Create or rev...,<p>Welcome! </p><p>1) Join Community 2) Cre...,,"[{'date_created': '2013-08-18T17:17:28+00:00',...",2013-07-12T19:28:56+00:00,True,1373657336,Welcome! ...
6782,"{'username': 'jaredmistretta', 'avatar': 'http...",Hello! I'm Jared Mistretta and I work with Jon...,<p>Hello! I'm Jared Mistretta and I work with ...,Introductions,"[{'date_created': '2013-07-13T12:05:59+00:00',...",2013-07-12T22:04:27+00:00,True,1373666667,Hello! I'm Jared Mistretta and I work with Jon...
10204,"{'username': 'jonschull', 'avatar': 'https://w...",This would be a good place for people to discu...,<p>This would be a good place for people to di...,General Discussion,[],2013-07-12T22:47:08+00:00,True,1373669228,This would be a good place for people to discu...
10203,"{'username': 'jonschull', 'avatar': 'https://w...",I have the impression that there are two desig...,<p>I have the impression that there are two de...,Research & Development,"[{'date_created': '2013-07-12T23:10:54+00:00',...",2013-07-12T22:49:04+00:00,True,1373669344,I have the impression that there are two desig...
13510,"{'username': 'jonschull', 'avatar': 'https://w...","Ultimately its about health, wellness, social ...","<p>Ultimately its about health, wellness, soci...",General Discussion,"[{'date_created': '2013-07-13T00:01:31+00:00',...",2013-07-12T22:51:12+00:00,True,1373669472,"Ultimately its about health, wellness, social ..."


## Create normPosts; normPosts (saved below as normPosts.json) is our candidate for the Graph

In [7]:
# Flatten the time-sorted threads into one list of posts and give each an ID.
# A thread head gets the integer ID `ti`; its comments get ti + n/1000,
# e.g. 3.025 for the 25th comment of thread 3.
idSlots = 1000  # the n/1000 scheme leaves room for at most 999 comments per thread
normPosts = []

for ti, thread in enumerate(df.to_dict('records')):
    thread['ID'] = ti
    normPosts.append(thread)  # first the thread head ...

    # ... then all of its comments.
    if len(thread['comments']) >= idSlots:
        # With 1000+ comments the fractional part would reach 1.0 and the
        # IDs would collide with those of the next thread.
        raise ValueError(
            f"thread {ti} has {len(thread['comments'])} comments; "
            f"the ID scheme supports at most {idSlots - 1}"
        )
    for num, comment in enumerate(thread['comments'], start=1):
        # round() removes binary floating-point noise so the stored ID
        # equals its 3-decimal literal (3.025, not 3.0250000000000004).
        comment['ID'] = round(ti + num / idSlots, 3)
        normPosts.append(comment)
        

In [8]:
# Attach an ISO-8601 string, derived from the integer timestamp, to every post.
for post in normPosts:
    post['isoformat'] = timeInt_to_isoformat(post['timeInt'])


In [9]:
# From here on `df` holds one row per post (thread heads and comments),
# replacing the per-thread frame built earlier.
df = DataFrame(data=normPosts)

In [10]:
# Sanity check: column names and the total number of rows.
(df.columns, len(df))


(Index(['ID', 'author', 'body', 'body_html', 'category', 'comments',
        'date_created', 'isoformat', 'members_only', 'timeInt', 'title'],
       dtype='object'), 76238)

In [11]:
# Write the flattened posts to disk, then show the row count as the cell output.
df.to_json(path_or_buf='normPosts.json')
len(df)

76238