In [1]:
import pandas as pd
import pickle
import random
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# open a session against sciencedata.dk through the sddk helper library
s = sddk.cloudSession("sciencedata.dk")
# establish connection with Google Sheets...
# the service-account key is read from sciencedata.dk and passed on as a dict
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
# authorise a gspread client with the Sheets feed and Drive scopes
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# handle to the project spreadsheet, opened by its URL
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# Load the per-id unigram counts; the context manager closes the file handle
# as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open("../data/large_files/unigramCount_dict.pickle", "rb") as f:
    unigramCount_dict = pickle.load(f)

In [4]:
# Load the per-id bigram counts; the context manager closes the file handle
# as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [3]:
# Load the per-id trigram counts; the context manager closes the file handle
# as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open("../data/large_files/trigramCount_dict.pickle", "rb") as f:
    trigramCount_dict = pickle.load(f)

# Generating most frequent ngrams for preprocessing

In [4]:
def merge_data_from_ids(ids, datadict):
    """Sum the count data of several ids into a single Counter.

    Parameters
    ----------
    ids : iterable
        Keys to look up in ``datadict``.
    datadict : mapping
        Maps each id to anything ``Counter.update`` accepts
        (e.g. a mapping of ngram -> count).

    Returns
    -------
    collections.Counter
        Counts accumulated over all requested ids; empty if ``ids`` is empty.

    A missing id propagates the lookup error of ``datadict``
    (``KeyError`` for a plain dict).
    """
    merged = Counter()
    for doc_id in ids:
        merged.update(datadict[doc_id])
    return merged


In [8]:
# draw a reproducible random sample of 5000 ids (keys of trigramCount_dict);
# the most frequent ngrams are preprocessed on this sample first
random.seed(0)
ids = random.sample(list(trigramCount_dict), 5000)
len(ids)

5000

In [6]:
# Merge the per-id counts of the sampled ids into one Counter per ngram order.
# data_unigrams and data_bigrams are consumed by the unigram and bigram
# sections further down, so all three merges have to run on a fresh kernel.
# NOTE(review): assumes unigramCount_dict and bigramCount_dict contain every
# sampled id (ids are drawn from trigramCount_dict) - confirm.
data_unigrams = merge_data_from_ids(ids, unigramCount_dict)
data_bigrams = merge_data_from_ids(ids, bigramCount_dict)
data_trigrams = merge_data_from_ids(ids, trigramCount_dict)

In [7]:
# ids that were not drawn into the first sample, kept in original key order;
# membership is tested against a set so the scan stays linear instead of
# comparing every key against the 5000-element list
ids_sampled = set(ids)
ids2 = [k for k in trigramCount_dict if k not in ids_sampled]
len(ids2)

9103

In [9]:
# merge the trigram counts of all ids outside the first sample (second round)
data_trigrams2 = merge_data_from_ids(ids2, trigramCount_dict)

In [17]:
# drop the per-id trigram counts to free memory; the cells shown below only
# use the two merged counters (data_trigrams, data_trigrams2)
del trigramCount_dict

In [10]:
# load the large English spaCy pipeline; `nlp` is used below to turn ngram strings into Doc objects
nlp = spacy.load('en_core_web_lg')
# spaCy's default English stop words, used below to check which frequent unigrams are stopwords
stop_words = nlp.Defaults.stop_words

# Most frequent unigrams to nlp docs

In [10]:
# check what the raw unigrams look like, most frequent first
data_unigrams_sorted = sorted(data_unigrams.items(), key=lambda item: item[1], reverse=True)
data_unigrams_sorted[:100]

[('the', 3333595),
 ('of', 2261372),
 ('and', 1459413),
 ('to', 1228717),
 ('in', 1151878),
 ('a', 783061),
 ('is', 639492),
 ('that', 618083),
 ('as', 422447),
 ('for', 345389),
 ('The', 316494),
 ('with', 299495),
 ('his', 276897),
 ('by', 273503),
 ('not', 273209),
 ('was', 261923),
 ('on', 251693),
 ('be', 248574),
 ('this', 235586),
 ('from', 223616),
 ('it', 221608),
 ('he', 216982),
 ('are', 195085),
 ('which', 184861),
 ('or', 166677),
 ('have', 158059),
 ('an', 157658),
 ('but', 147118),
 ('who', 145350),
 ('at', 143826),
 ('their', 143725),
 ('I', 124501),
 ('they', 122254),
 ('one', 115975),
 ('In', 115523),
 ('has', 113383),
 ('we', 111595),
 ('were', 108817),
 ('all', 105330),
 ('God', 102838),
 ('had', 102805),
 ('.', 99676),
 ('also', 96125),
 ('its', 87907),
 ('will', 84576),
 ('been', 80307),
 ('Christian', 78791),
 ('more', 78720),
 ('Paul', 76343),
 ('would', 73443),
 ('what', 73327),
 ('This', 73203),
 ('these', 70165),
 ('other', 69944),
 ('only', 68254),
 ('about'

In [11]:
# how many of the 100 most frequent unigrams are actually not stopwords?
# (compared case-insensitively against spaCy's stop-word list)
sum(1 for token, _count in data_unigrams_sorted[:100] if token.lower() not in stop_words)

9

In [12]:
# how many unigrams appear more than 10 times?
sum(1 for _token, count in data_unigrams_sorted if count > 10)

158342

In [14]:
%%time
# use spacy to generate "Doc" object for ngram
# (use carefully, takes some time)
data_unigrams_sorted_nlp = [(el[0], {"doc" : nlp(el[0]), "count" : el[1]}) for el in data_unigrams_sorted if el[1] > 10]

CPU times: user 7min 42s, sys: 8.41 s, total: 7min 50s
Wall time: 8min 3s


In [15]:
data_unigrams_sorted_nlp[:5]

[('the', {'doc': the, 'count': 3333595}),
 ('of', {'doc': of, 'count': 2261372}),
 ('and', {'doc': and, 'count': 1459413}),
 ('to', {'doc': to, 'count': 1228717}),
 ('in', {'doc': in, 'count': 1151878})]

In [16]:
data_unigrams_nlp_dict = dict(data_unigrams_sorted_nlp)

In [17]:
# save for future usage
with open("../data/large_files/data_unigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_unigrams_nlp_dict, f)

In [18]:
# remove intermediate data to free memory...
# (the Doc objects themselves stay alive through data_unigrams_nlp_dict;
# only the raw counter and the two sorted lists are released)
del data_unigrams
del data_unigrams_sorted
del data_unigrams_sorted_nlp

# Bigrams to nlp docs

In [19]:
# rank bigrams by frequency, then count those seen more than 10 times
data_bigrams_sorted = sorted(data_bigrams.items(), key=lambda item: item[1], reverse=True)
sum(1 for _ngram, count in data_bigrams_sorted if count > 10)

467586

In [20]:
data_bigrams_sorted[:10]

[('of the', 632448),
 ('in the', 313627),
 ('to the', 210980),
 ('and the', 142843),
 ('on the', 93142),
 ('that the', 92278),
 ('to be', 81009),
 ('for the', 78802),
 ('with the', 77489),
 ('from the', 73152)]

In [21]:
%%time
data_bigrams_sorted_nlp = [(el[0], {"doc" : nlp(el[0]), "count" : el[1]}) for el in data_bigrams_sorted if el[1] > 10]

CPU times: user 20min 56s, sys: 22.5 s, total: 21min 19s
Wall time: 21min 49s


In [22]:
data_bigrams_nlp_dict = dict(data_bigrams_sorted_nlp)

In [23]:
with open("../data/large_files/data_bigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_bigrams_nlp_dict, f)

In [24]:
# remove intermediate bigram data to free memory
# NOTE(review): data_bigrams_sorted is left alive here, whereas the unigram
# cleanup also deletes its sorted list - confirm whether that is intended
del data_bigrams
del data_bigrams_sorted_nlp

# Trigrams to nlp docs

In [11]:
# rank first-round trigrams by frequency, then count those seen more than 10 times
data_trigrams_sorted = sorted(data_trigrams.items(), key=lambda item: item[1], reverse=True)
sum(1 for _ngram, count in data_trigrams_sorted if count > 10)

458444

In [26]:
%%time
data_trigrams_sorted_nlp = [(el[0], {"doc" : nlp(el[0]), "count" : el[1]}) for el in data_trigrams_sorted if el[1] > 10]

CPU times: user 21min 37s, sys: 33.8 s, total: 22min 11s
Wall time: 23min 14s


In [16]:
del data_trigrams_sorted

In [27]:
data_trigrams_nlp_dict = dict(data_trigrams_sorted_nlp)

In [28]:
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams_nlp_dict, f)

# Second round of merged trigrams nlp preprocessing

In [12]:
len(data_trigrams2)

46627339

In [42]:
# convert the merged Counter into a plain dict (same keys and counts);
# note that a plain dict raises KeyError on a missing key where a Counter returns 0
data_trigrams2 = dict(data_trigrams2)

In [13]:
# first-round trigrams that occur more than 10 times
keys1 = [ngram for ngram, count in data_trigrams.items() if count > 10]
len(keys1)

458444

In [22]:
del data_trigrams

In [14]:
# second-round trigrams that occur more than 10 times
keys2 = [ngram for ngram, count in data_trigrams2.items() if count > 10]
len(keys2)

774574

In [15]:
# keep only the frequent second-round trigrams that were not already covered
# by the first round, i.e. the plain set difference keys2 - keys1
# (the order of the resulting list is arbitrary, as it was before)
keys2_filtered = list(set(keys2).difference(keys1))
len(keys2_filtered)

342693

In [31]:
del nlp

In [32]:
print("hello")

hello


In [39]:
# checkpoint the second-round inputs; the context managers make sure each
# file is flushed and closed once its dump has finished
with open("../data/large_files/keys2_filtered.pickle", "wb") as f:
    pickle.dump(keys2_filtered, f)
with open("../data/large_files/data_trigrams2.pickle", "wb") as f:
    pickle.dump(data_trigrams2, f)

In [40]:
len(data_trigrams2)

46627339

In [43]:
%%time
data_trigrams2_filtered = []
n = 0
for el in keys2_filtered:
    if n in range(0, len(keys2_filtered), 5000):
        print(n)
    n += 1
    data_trigrams2_filtered.append((el, data_trigrams2[el]))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
CPU times: user 285 ms, sys: 286 ms, total: 572 ms
Wall time: 839 ms


In [44]:
len(data_trigrams2_filtered)

342693

In [46]:
# checkpoint the (trigram, count) pairs; the context manager makes sure the
# file is flushed and closed once the dump has finished
with open("../data/large_files/data_trigrams2_filtered.pickle", "wb") as f:
    pickle.dump(data_trigrams2_filtered, f)

In [47]:
nlp = spacy.load('en_core_web_lg')

In [48]:
%%time
data_trigrams2_nlp_tups = []
n = 0
for el in data_trigrams2_filtered:
    if n in range(0, len(data_trigrams2_filtered), 5000):
        print(n)
    n += 1
    data_trigrams2_nlp_tups.append((el[0], {"doc" : nlp(el[0]), "count" : el[1]}))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
CPU times: user 14min 25s, sys: 14.3 s, total: 14min 39s
Wall time: 14min 44s


In [49]:
# save the second-round trigram docs for future usage; the dict is built
# before the file is opened so a failure there cannot leave a truncated file,
# and the context manager closes the handle once the dump has finished
data_trigrams2_nlp_dict = dict(data_trigrams2_nlp_tups)
with open("../data/large_files/data_trigrams2_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams2_nlp_dict, f)