In [1]:
import pandas as pd
import pickle
import random
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy
import re

In [8]:
# Load the spaCy pipeline 'en_core_web_lg'. `nlp` is what ngram_to_lemmata calls
# to parse an n-gram when no preprocessed doc can be used for it.
nlp = spacy.load('en_core_web_lg')

In [11]:
# POS labels (compared against token.pos_) whose lemmata are kept by doc_to_lemmata;
# tokens with any other label are dropped.
postags = ["PROPN", "NOUN", "VERB", "ADJ"]

def clean_token(token):
    """Strip non-word characters from ``token`` and lowercase all but its first character.

    The first remaining character keeps its original case (so a capitalised
    lemma such as "Mexico" stays capitalised); everything after it is lowercased.

    Args:
        token: String to normalise.

    Returns:
        The cleaned string, or "" when nothing is left after stripping
        (e.g. the input was empty or consisted only of punctuation).
    """
    # Raw string: "\W" inside a plain literal is an invalid escape sequence.
    token = re.sub(r"\W", "", token)
    if not token:
        # Nothing left to index; token[0] would raise IndexError here.
        return ""
    return token[0] + token[1:].lower()

def doc_to_lemmata(doc, postags):
    """Return the cleaned lemmata of the selected tokens as one sorted, space-joined string.

    Args:
        doc: Iterable of tokens exposing ``lemma_`` and ``pos_`` attributes.
        postags: Collection of POS labels; only tokens whose ``pos_`` is in it
            contribute a lemma.

    Returns:
        The cleaned lemmata sorted and joined with single spaces; "" when no
        token qualifies.
    """
    cleaned_lemmata = []
    for tok in doc:
        if tok.pos_ not in postags:
            continue
        lemma = tok.lemma_
        # A lemma without any word character cleans down to an empty string;
        # skip it so it can neither break the cleaning step nor leave an empty
        # entry (and a stray space) in the joined result.
        if re.search(r"\w", lemma) is None:
            continue
        cleaned_lemmata.append(clean_token(lemma))
    return " ".join(sorted(cleaned_lemmata))

def ngram_to_lemmata(ngram, ngram_nlp_dict):
    """Turn an n-gram string into its sorted lemmata string, best-effort.

    The preprocessed doc stored under ``ngram_nlp_dict[ngram]["doc"]`` is tried
    first. If it is missing or cannot be processed, the n-gram is parsed on
    the fly with ``nlp``. If that fails as well, an empty string is returned.

    Args:
        ngram: The n-gram text.
        ngram_nlp_dict: Mapping from n-gram text to a dict holding a
            preprocessed doc under the key "doc".

    Returns:
        The string produced by ``doc_to_lemmata`` (using the module-level
        ``postags``), or "" when both attempts fail.
    """
    try:
        return doc_to_lemmata(ngram_nlp_dict[ngram]["doc"], postags)
    except Exception:
        # Deliberately broad: any lookup/processing failure falls through to
        # parsing from scratch. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt/SystemExit propagate so long runs can be stopped.
        pass
    try:
        return doc_to_lemmata(nlp(ngram), postags)
    except Exception:
        return ""

def article_data_to_lemmata(ngrams_dict, ngram_nlp_dict):
    """Aggregate n-gram counts by their lemmata string.

    Each key of ``ngrams_dict`` is converted with ``ngram_to_lemmata``; the
    counts of n-grams that map to the same lemmata string are summed.
    Lemmata strings of length 0 or 1 are discarded.

    Args:
        ngrams_dict: Mapping from n-gram text to its count.
        ngram_nlp_dict: Lookup passed through to ``ngram_to_lemmata``.

    Returns:
        A ``Counter`` mapping lemmata string to summed count.
    """
    totals = Counter()
    for ngram_text, occurrences in ngrams_dict.items():
        lemmata_key = ngram_to_lemmata(ngram_text, ngram_nlp_dict)
        if len(lemmata_key) <= 1:
            # Empty or single-character results are dropped.
            continue
        totals[lemmata_key] += occurrences
    return totals

In [5]:
# Which n-gram size to work with; it selects both pickle files below.
ngram_type = "unigram"

# NOTE(security): pickle.load can execute arbitrary code while unpickling —
# only load files that were produced by this project.
# The `with` blocks close the file handles as soon as loading finishes.
with open("../data/large_files/{0}Count_dict.pickle".format(ngram_type), "rb") as count_file:
    ngramCount_dict = pickle.load(count_file)
with open("../data/large_files/data_{0}s_nlp_dict.pickle".format(ngram_type), "rb") as nlp_file:
    ngram_nlp_dict = pickle.load(nlp_file)

In [13]:
%%time
test_ngrams_dict = (list(ngramCount_dict.items())[5000][1])
test_output = article_data_to_lemmata(test_ngrams_dict, ngram_nlp_dict)
sorted(test_output.items(), key=lambda pair: pair[1], reverse=True)[:100]

CPU times: user 148 ms, sys: 35.1 ms, total: 184 ms
Wall time: 191 ms


[('child', 30),
 ('handicap', 18),
 ('street', 16),
 ('have', 11),
 ('city', 11),
 ('family', 10),
 ('life', 10),
 ('church', 9),
 ('Mexico', 9),
 ('work', 8),
 ('Christ', 8),
 ('feel', 8),
 ('syndrome', 7),
 ('education', 7),
 ('live', 6),
 ('home', 6),
 ('time', 6),
 ('physical', 6),
 ('ministry', 6),
 ('mexican', 6),
 ('God', 5),
 ('give', 5),
 ('parent', 5),
 ('social', 5),
 ('love', 4),
 ('need', 4),
 ('christian', 4),
 ('holy', 4),
 ('process', 4),
 ('gospel', 4),
 ('other', 4),
 ('people', 4),
 ('result', 3),
 ('society', 3),
 ('preach', 3),
 ('great', 3),
 ('programme', 3),
 ('centre', 3),
 ('government', 3),
 ('do', 3),
 ('day', 3),
 ('permit', 3),
 ('drug', 3),
 ('understand', 3),
 ('educational', 3),
 ('transformation', 3),
 ('man', 3),
 ('study', 3),
 ('serious', 3),
 ('career', 3),
 ('second', 3),
 ('most', 3),
 ('go', 3),
 ('scripture', 3),
 ('Jesus', 3),
 ('write', 3),
 ('reality', 3),
 ('environment', 3),
 ('situation', 3),
 ('faith', 3),
 ('involve', 3),
 ('attention',

In [19]:
%%time
n = 10
cleanedNgrams_tups = []
for k, v in list(ngramCount_dict.items())[:n]:
    cleanedNgrams_tups.append((k, article_data_to_lemmata(v, ngram_nlp_dict)))

CPU times: user 12 s, sys: 75.9 ms, total: 12.1 s
Wall time: 12.2 s


In [22]:
# Back-of-the-envelope estimate of a full run, in minutes: presumably ~1 s per
# entry (the 10-entry run above took ~12 s) times ~14000 entries, divided by 60.
# TODO confirm that 14000 is the total number of entries.
(1 * 14000) / 60

233.33333333333334