In [1]:
import json
import en_core_web_sm
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import datetime

In [2]:
# Load the raw submissions. JSON is read as UTF-8 explicitly so the result does
# not depend on the platform's default encoding.
with open('data/top_data.json', encoding='utf-8') as f:
    j = json.load(f)

# Submission ids that are excluded from the dataset.
ids_dropped = {
    '3nkl4l', 'etfdib', '312wku', 'bc0gjg', '95gk6u', '34ey1p', '3wjg2o', '51y9gl', '83n2z2',
    'ex5nam', '3xbynm', '4030de', '514s33', '3mx6p8', '2u6yyd', '9oeth4', '9ccilf', '3x2lnb',
    '52yazt', '558t8b', '1pu47m', 'el5ny5', '56woyb',
}

# (old, new) substring replacements applied to each lower-cased title.
# Order matters: the steps are applied sequentially, top to bottom.
TITLE_REPLACEMENTS = [
    (' | abc7.com', ''),
    (' – wsvn 7news | miami news, weather, sports', ''),
    ('floridaman', 'florida man'),
    ('floridawoman', 'florida woman'),
    ('\n', ' '),
    ("[x-post /r/funny]", ''),
    ('my result for the florida man challenge: ', ''),
    ("floridamen", 'florida men'),
    ('x-post from r/nottheonion :', ''),
    ('(x-post r/funny)', ''),
    ('(x-post from /r/thesquadonpoint)', ''),
    (' [this should be our new sidebar picture]', ''),
]

new_j = []
for d in j:
    # Normalise the title: lower-case, apply the replacement table, trim.
    cleaned = d['title'].lower()
    for old, new in TITLE_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    d['title'] = cleaned.strip()

    # Convert the numeric timestamp into a string such as "March 05, 2019".
    # NOTE(review): fromtimestamp() uses the machine's local timezone, so dates
    # near midnight depend on where the notebook runs - confirm whether UTC is
    # intended.
    d['time'] = datetime.datetime.fromtimestamp(d['time']).date().strftime('%B %d, %Y')

    # Keep everything except hand-dropped ids and self posts.
    if d['id'] not in ids_dropped and d['domain'] != 'self.FloridaMan':
        new_j.append(d)
j = new_j

titles = [d['title'] for d in j]

# Titles contain non-ASCII characters, so write them as UTF-8 explicitly.
with open('data/titles.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(titles)))

In [3]:
# parse all verbs
# Load the en_core_web_sm pipeline once; get_verb() reads `nlp` as a global.
nlp = en_core_web_sm.load()
def get_verb(s):
    """Return the lemmatized verbs that head the noun chunks of ``s``.

    For every noun chunk produced by the global ``nlp`` pipeline, the head of
    the chunk's root token is kept when its part-of-speech tag is ``'VERB'``.
    Each kept token text is lemmatized as a verb, the fragments ``'d'``,
    ``’re``, ``’m`` and ``’s`` are discarded, and duplicates are removed.

    Args:
        s: The text to analyse.

    Returns:
        A list of unique verb lemmas in sorted order (so the result is the
        same on every run), or ``None`` when no verb remains.
    """
    # One lemmatizer per call rather than one per word.
    lemmatizer = WordNetLemmatizer()
    ignored = {'d', "’re", "’m", "’s"}

    lemmas = {
        lemmatizer.lemmatize(chunk.root.head.text, 'v')
        for chunk in nlp(s).noun_chunks
        if chunk.root.head.pos_ == 'VERB'
    }

    # Sorting replaces the arbitrary ordering of list(set(...)).
    verbs = sorted(lemmas - ignored)
    return verbs if verbs else None

# Attach the extracted verbs to a shallow copy of every article. `j` is left
# untouched (no in-place mutation, no `del`), so this cell can be re-run
# without restarting the kernel.
verb_json = [{**d, 'verbs': get_verb(d['title'])} for d in j]

In [4]:
# Make manual mapper
# Each non-blank line of the text file has the form "<verb> => <title>".
# Blank lines are skipped so a stray empty line does not break the unpacking.
with open('mappers/verb_mapper_manual.txt', encoding='utf-8') as f:
    mapper_lines = [line.strip() for line in f if line.strip()]

# title -> single-element list holding the manually chosen verb.
manual_mapper = {}
for line in mapper_lines:
    # Split on the first separator only, so a title that itself contains
    # " => " is kept whole.
    verb, title = line.split(' => ', 1)
    manual_mapper[title] = [verb]

# Persist the same mapping as JSON next to the hand-edited text file.
with open('mappers/verb_mapper_manual.json', 'w', encoding='utf-8') as f:
    json.dump(manual_mapper, f, indent=4)

def apply_mapper(x):
    """Return the manual verb list for ``x['title']`` if one exists, else ``x['verbs']``."""
    title = x['title']
    fallback = x['verbs']
    if title in manual_mapper:
        return manual_mapper[title]
    return fallback

# Manual overrides take precedence over the automatically extracted verbs.
for article in verb_json:
    article.update({'verbs': apply_mapper(article)})

In [5]:
# separate into man/woman
# An article can land in both lists when its title mentions both.
fm, fw = [], []
for article in verb_json:
    headline = article['title']
    if 'florida man' in headline or 'florida men' in headline:
        fm.append(article)
    if 'florida woman' in headline or 'florida women' in headline:
        fw.append(article)

In [6]:
# build final jsons
# For each group, write <name>.json holding the article count plus, per verb,
# the matching articles (highest score first) and how many there are.
for articles, name in zip([fm, fw], ['fm', 'fw']):
    # Group the articles by verb in a single pass instead of rescanning the
    # whole list once per verb.
    articles_by_verb = {}
    for article in articles:
        # `verbs` is None when no verb was extracted and no manual mapping
        # exists; treat that as "no verbs" rather than raising TypeError.
        # dict.fromkeys de-duplicates while keeping order, so an article is
        # listed at most once under any verb.
        for verb in dict.fromkeys(article['verbs'] or []):
            articles_by_verb.setdefault(verb, []).append(article)

    summary = {'count': len(articles), 'verbs': {}}

    # Sorted verb order keeps the output file identical from run to run
    # (iterating a set of strings is not reproducible across interpreters).
    for verb in sorted(articles_by_verb):
        ranked = sorted(articles_by_verb[verb], key=lambda a: -a['score'])
        summary['verbs'][verb] = {'articles': ranked, 'count': len(ranked)}

    with open(f'{name}.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=4)