# Getting Sentences to Vectorize

I'm creating my own `word2vec` model, so I need tokenized sentences. Here I'm using a large chunk of the Project Gutenberg corpus as well as some [Amazon review data](https://snap.stanford.edu/data/web-Amazon.html).   

I attach the part-of-speech tag to the token so that I can disambiguate different word senses after training.

*This notebook is adapted from [an earlier notebook of mine](https://github.com/kbooten/stylevise/blob/master/getting_syntax_ngrams.ipynb).*
***

In [1]:
import os
# os.listdir() returns entries in arbitrary order. Sorting makes the order in
# which books are processed (and hence the order of the sentences written out
# and fed to the seeded shuffle later in this notebook) repeatable.
gb_files = sorted(f for f in os.listdir("/Users/kyle/Documents/downloading_gutenberg/data/") if f.startswith('gb_'))

In [2]:
# Number of 'gb_'-prefixed files found in the data directory.
len(gb_files)

19507

In [3]:
from tqdm import tqdm
import json

In [4]:
import spacy
# Load the spaCy pipeline once; `nlp` is called on every book below.
# NOTE(review): "en" is a shortcut model name. Which concrete model it resolves
# to depends on the local spaCy install, and newer spaCy releases may not accept
# shortcut names at all -- TODO confirm and consider pinning an explicit model.
nlp = spacy.load("en")

### Gutenberg data

In [5]:
from collections import defaultdict

In [6]:
from gutenberg.cleanup import strip_headers

GUTENBERG_DIR = "/Users/kyle/Documents/downloading_gutenberg/data/"
MAX_CHARS_PER_BOOK = 100000  # only the first n characters of each book are parsed
LANGUAGE_MARKER = "Language: English"  # must appear within the first 1000 characters
SENTENCE_INITIALS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
SENTENCE_FINALS = ".?!"


def _looks_complete(text):
    """Return True if `text` starts with an ASCII capital and ends with '.', '?' or '!'.

    Empty text is rejected instead of raising IndexError on `text[0]`.
    """
    return bool(text) and text[0] in SENTENCE_INITIALS and text[-1] in SENTENCE_FINALS


def _tag_tokens(sent):
    """Return the list of `token_TAG` strings for one sentence.

    `sent` is iterated for tokens exposing `.text` and `.tag_`. The token text is
    lowercased, the tag is appended after an underscore, and tokens whose text
    contains a newline are dropped.
    """
    return [
        tok.text.lower() + "_" + tok.tag_
        for tok in sent
        if "\n" not in tok.text
    ]


# One list of `token_TAG` strings per accepted sentence, across all books.
tokenized_sentences = []

for fy in tqdm(gb_files):
    # Read the whole book and close the file before the slow parse starts.
    # The encoding is explicit so the result does not depend on the platform default.
    with open(GUTENBERG_DIR + fy, 'r', encoding='utf-8') as f:
        tempdata = f.read()

    if LANGUAGE_MARKER not in tempdata[:1000]:  # keep English books only
        continue

    tempdata = strip_headers(tempdata)
    tempspacy = nlp(tempdata[:MAX_CHARS_PER_BOOK])
    for sent in tempspacy.sents:
        if _looks_complete(sent.text):
            tokenized_sentences.append(_tag_tokens(sent))


100%|██████████| 19507/19507 [11:31:57<00:00,  2.13s/it]  


In [7]:
# Total number of sentences kept from the Gutenberg books.
len(tokenized_sentences)

4850043

In [8]:
# Peek at the first 100 tagged sentences to sanity-check the token_TAG format.
tokenized_sentences[:100]

[['mr_NNP', '._.'],
 ['in_IN',
  'which_WDT',
  'keziah_NNP',
  'hears_VBZ',
  'two_CD',
  'proposals_NNS',
  'and_CC',
  'the_DT',
  'beginning_NN',
  'of_IN',
  'a_DT',
  'third_NNP',
  'trumet_NNP',
  'in_IN',
  'a_DT',
  'fog_NN',
  ';_:',
  'a_DT',
  'fog_NN',
  'blown_VBN',
  'in_RB',
  'during_IN',
  'the_DT',
  'night_NN',
  'by_IN',
  'the_DT',
  'wind_NN',
  'from_IN',
  'the_DT',
  'wide_JJ',
  'atlantic_NNP',
  '._.'],
 ['the_DT',
  'pole_NN',
  'on_IN',
  'cannon_NNP',
  'hill_NNP',
  ',_,',
  'where_WRB',
  'the_DT',
  'beacon_NN',
  'was_VBD',
  'hoisted_VBN',
  'when_WRB',
  'the_DT',
  'packet_NN',
  'from_IN',
  'boston_NNP',
  'dropped_VBD',
  'anchor_NN',
  'in_IN',
  'the_DT',
  'bay_NN',
  ',_,',
  'was_VBD',
  'shiny_JJ',
  'and_CC',
  'slippery_JJ',
  '._.'],
 ['the_DT',
  'new_JJ',
  'weathervane_NN',
  ',_,',
  'a_DT',
  'gilded_VBN',
  'whale_NN',
  ',_,',
  'presented_VBN',
  'to_IN',
  'the_DT',
  '"_``',
  'regular_JJ',
  '"_\'\'',
  'church_NN',
  'by_IN'

In [9]:
 with open('gutenberg_sentences_with_tags.txt', 'w') as out_file:
    # One sentence per line, tokens joined by single spaces; empty token lists are skipped.
    out_file.writelines(" ".join(tokens) + "\n" for tokens in tokenized_sentences if len(tokens) > 0)

### Amazon data

In [1]:
import random
# Fixed seed so the random.choice() pick of one sentence per review is repeatable.
random.seed(3010010)

In [2]:
import spacy
# Load the spaCy pipeline used to parse each review text below.
# NOTE(review): "en" is a shortcut model name whose target depends on the local
# spaCy install; newer spaCy releases may reject shortcut names -- TODO confirm.
nlp = spacy.load("en")

In [3]:
from tqdm import tqdm
import json

In [4]:
# Despite the name, these are local filesystem paths to gzip-compressed review
# dumps (one per product category); each is opened with gzip by parse() below.
urls = [
    '/Volumes/extra_data/reviews_Beauty_5.json.gz',
    '/Volumes/extra_data/reviews_Home_and_Kitchen_5.json.gz',
    '/Volumes/extra_data/reviews_Grocery_and_Gourmet_Food_5.json.gz',
    '/Volumes/extra_data/reviews_Sports_and_Outdoors_5.json.gz',
]

In [5]:
import gzip
def parse(path):
    """Lazily yield one parsed record per line of the gzip file at `path`.

    Args:
        path: filesystem path of a gzip-compressed file holding one Python/JSON
            literal per line.

    Yields:
        The object obtained by evaluating each line, in file order.
    """
    # The `with` block closes the gzip handle when the generator is exhausted,
    # explicitly closed, or garbage-collected; previously it was never closed.
    with gzip.open(path, 'r') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes whatever expression the line
            # contains. Only use this on trusted dumps; if the data is plain
            # literals, ast.literal_eval (or json.loads for strict JSON) would be
            # a safer parser -- confirm against the real files before switching.
            yield eval(l)

In [6]:
from collections import defaultdict

In [8]:
SENTENCE_INITIALS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
SENTENCE_FINALS = ".?!"


def _looks_complete(text):
    """Return True if `text` starts with an ASCII capital and ends with '.', '?' or '!'.

    Empty text is rejected instead of raising IndexError on `text[0]`.
    """
    return bool(text) and text[0] in SENTENCE_INITIALS and text[-1] in SENTENCE_FINALS


def _tag_tokens(sent):
    """Return the list of `token_TAG` strings for one sentence.

    `sent` is iterated for tokens exposing `.text` and `.tag_`. The token text is
    lowercased, the tag is appended after an underscore, and tokens whose text
    contains a newline are dropped.
    """
    return [
        tok.text.lower() + "_" + tok.tag_
        for tok in sent
        if "\n" not in tok.text
    ]


# One randomly chosen, tagged sentence per review that has a usable sentence.
tokenized_sentences_amazon = []
# NOTE(review): never updated in this cell; retained so the name stays defined.
item2count = defaultdict(int)
# Why reviews were skipped, so skips are visible instead of silently swallowed.
skipped_reviews = {"no_usable_sentence": 0, "error": 0}

for url in tqdm(urls):
    for record in parse(url):
        review = nlp(record['reviewText'])
        try:
            sents_ok = [sent for sent in review.sents if _looks_complete(sent.text)]
            if not sents_ok:
                # Nothing to choose from; random.choice() is not called, so the
                # seeded random sequence is the same as before.
                skipped_reviews["no_usable_sentence"] += 1
                continue
            sent = random.choice(sents_ok)
            tokens = _tag_tokens(sent)
        except Exception:
            # Stay best-effort for a single bad review, but do not catch
            # KeyboardInterrupt/SystemExit as the old bare `except:` did.
            skipped_reviews["error"] += 1
            continue
        tokenized_sentences_amazon.append(tokens)

100%|██████████| 4/4 [6:06:13<00:00, 5493.38s/it]  


In [9]:
# Number of Amazon sentences kept (at most one per review).
len(tokenized_sentences_amazon)

1087605

In [10]:
# Peek at the first 10 tagged Amazon sentences.
tokenized_sentences_amazon[:10]

[['very_RB', 'oily_JJ', 'and_CC', 'creamy_JJ', '._.'],
 ['this_DT',
  'palette_NN',
  'conceals_NNS',
  'decently_RB',
  ',_,',
  'however_RB',
  ',_,',
  'it_PRP',
  'does_VBZ',
  'somewhat_RB',
  'cake_VB',
  'up_RP',
  'and_CC',
  'crease_VB',
  '._.'],
 ['and_CC',
  'so_RB',
  'far_RB',
  'i_PRP',
  'tried_VBD',
  'twice_RB',
  'but_CC',
  'it_PRP',
  'does_VBZ',
  'nt_RB',
  'really_RB',
  'show_VB',
  'any_DT',
  'color_NN',
  'on_IN',
  'my_PRP$',
  'face_NN',
  '._.'],
 ['i_PRP',
  'think_VBP',
  'it_PRP',
  'does_VBZ',
  'great_JJ',
  'coverage_NN',
  'for_IN',
  'the_DT',
  'price_NN',
  'i_PRP',
  'paid_VBD',
  '._.'],
 ['i_PRP',
  'also_RB',
  'use_VBP',
  'a_DT',
  'silicone_NN',
  'based_VBN',
  'primer_NNP',
  ',_,',
  'which_WDT',
  'works_VBZ',
  'well_RB',
  'for_IN',
  'my_PRP$',
  'skin_NN',
  'type_NN',
  'to_TO',
  'give_VB',
  'me_PRP',
  'the_DT',
  'fresh_JJ',
  ',_,',
  'dewy_JJ',
  'look_NN',
  'i_PRP',
  'love_VBP',
  '._.'],
 ['do_VB', 'it_PRP', '!_.'],
 ['

In [11]:
 with open('amazon_sentences_with_tags.txt', 'w') as out_file:
    # One sentence per line, tokens joined by single spaces; empty token lists are skipped.
    out_file.writelines(" ".join(tokens) + "\n" for tokens in tokenized_sentences_amazon if len(tokens) > 0)

## Combine

### Combine tokenized sents and shuffle

In [1]:
import random
# Fixed seed so the random.shuffle() of the combined corpus below is repeatable.
random.seed(2091848572)

In [2]:
# Read both tagged corpora back in as lists of lines (one sentence per line,
# each line keeping its trailing newline).
with open('amazon_sentences_with_tags.txt','r') as amazon_file:
    amz = list(amazon_file)

with open('gutenberg_sentences_with_tags.txt','r') as gutenberg_file:
    gtb = list(gutenberg_file)

In [3]:
# Number of Amazon sentence lines read back.
len(amz)

1087605

In [4]:
# Number of Gutenberg sentence lines read back.
len(gtb)

4850043

In [5]:
# New list: all Amazon lines followed by all Gutenberg lines (shuffled in a later cell).
combined = amz + gtb

In [6]:
# In-place shuffle; repeatable only for the same seed and the same input order.
random.shuffle(combined)

In [7]:
with open('gutenberg_and_amazon_sents_with_tags_tokenized_shuffled.txt','w') as out_file:
    # Lines are written back verbatim (no newline is added here); empty strings are skipped.
    out_file.writelines(line for line in combined if len(line) > 0)

A small version (the first 2,000 shuffled sentences) for convenience when debugging the model.

In [10]:
with open('gutenberg_and_amazon_sents_with_tags_tokenized_shuffled_2000_sents.txt','w') as out_file:
    # Same format as the full file, restricted to the first 2000 shuffled lines.
    out_file.writelines(line for line in combined[:2000] if len(line) > 0)

In [11]:
# Peek at the first 10 shuffled lines.
combined[:10]

['something_NN had_VBD obliged_VBN her_PRP to_TO tell_VB her_PRP ._.\n',
 "he_PRP had_VBD struck_VBN at_IN me_PRP with_IN his_PRP$ automatic_NN ,_, which_WDT i_PRP think_VBP he_PRP must_MD have_VB dropped_VBN ,_, though_RB i_PRP 'm_VBP not_RB sure_JJ of_IN that_DT ._.\n",
 'being_VBG a_DT type_NN 2_CD diabetic_NN ,_, i_PRP am_VBP always_RB looking_VBG sweet_JJ tasting_NN treats_NNS that_WDT will_MD not_RB affect_VB my_PRP$ blood_NN sugar_NN and_CC this_DT sugarfree_JJ gum_NN is_VBZ now_RB very_RB high_JJ on_IN my_PRP$ list_NN favorite_JJ things_NNS ._.\n',
 'it_PRP happened_VBD thus_RB ._.\n',
 'secondly_RB ,_, jim_NNP had_VBD gone_VBN to_IN heaps_NNS of_IN trouble_NN gathering_VBG all_PDT the_DT breeding_NN -_HYPH stock_NN of_IN holy_NNP cross_NNP ,_, for_IN a_DT party_NN named_VBN jabez_NNP y._NNP stone_NNP to_TO steal_VB them_PRP convenient_RB ._.\n',
 'he_PRP had_VBD been_VBN taunted_VBN about_IN it_PRP --_: by_IN boys_NNS ._.\n',
 'he_PRP is_VBZ exchanged_VBN at_IN length_NN ,_, b

## Count Tokens

In [39]:
from collections import defaultdict

# Frequency of every whitespace-separated token in the combined corpus.
# .lower() is applied to the whole "word_TAG" string, so the tag suffix is
# lowercased in the keys too (a token ending in "_NN" is counted under "_nn").
word2count = defaultdict(int)

for key in (token.lower() for line in combined for token in line.split()):
    word2count[key] += 1

In [38]:
# Use .get(): indexing a defaultdict with a missing key would silently insert
# that key with a count of 0 into the table.
word2count.get('sugarfree_jj', 0)

0

In [40]:
# Number of distinct (lowercased) tokens counted so far.
len(word2count)

750552

In [42]:
# Drop every token seen fewer than 5 times (the min count of the word2vec model).
# Keys are collected first because a dict cannot change size while it is iterated.
to_delete = [token for token, count in word2count.items() if count < 5]

for token in to_delete:
    del word2count[token]

In [43]:
# Use .get(): indexing a defaultdict with a pruned/missing key would re-insert
# it with a count of 0, and that bogus entry would then be counted and saved.
word2count.get('sugarfree_jj', 0)

0

In [44]:
# Number of distinct tokens left after pruning.
len(word2count)

180394

In [45]:
import json
# Persist the token -> count table as a single JSON object.
with open('gutenberg_and_amazon_token2count.json','w') as out_file:
    out_file.write(json.dumps(word2count))

***