# Getting Sentences to Vectorize

I'm creating my own `word2vec` model, so I need tokenized sentences. Here I'm using a large chunk of the Project Gutenberg corpus as well as some [Amazon review data](https://snap.stanford.edu/data/web-Amazon.html).   

I also collect the respective part-of-speech tags for all the tokens, which I'll use later.

*This notebook is repurposed from this one [here](https://github.com/kbooten/stylevise/blob/master/getting_syntax_ngrams.ipynb).*
***

In [1]:
import os
gb_files = [f for f in os.listdir("/Users/kyle/Documents/downloading_gutenberg/data/") if f.startswith('gb_')]

In [2]:
len(gb_files)

19507

In [3]:
from tqdm import tqdm
import json

In [4]:
import spacy
nlp = spacy.load("en")

### Gutenberg data

In [5]:
from collections import defaultdict

In [6]:
tokenized_sentences = []
from gutenberg.cleanup import strip_headers
#used_ids = []
token2pos_tag_counts = defaultdict(lambda: defaultdict(int))

#for fy in tqdm(random.sample(gb_files,2)):
for fy in tqdm(gb_files):
    with open("/Users/kyle/Documents/downloading_gutenberg/data/"+fy,'r') as f:
        tempdata = f.read()
        filenumber = fy.lstrip("gb_").rstrip(".txt")
        if "Language: English" in tempdata[:1000]:  ## make sure english 
            tempdata = strip_headers(tempdata)
            tempspacy = nlp(tempdata[:100000])### limit to first n chars
            sents = list(tempspacy.sents)
            sents_ok = [sent for sent in sents if (sent.text[0] in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' and sent.text[-1] in ".?!")]
            for sent in sents_ok:
                tagged = [(i.text.lower(),i.tag_) for i in sent]
                tagged_clean = [(token,tag) for token,tag in tagged if "\n" not in token]
                tokens = [token for token,tag in tagged_clean]
                tokenized_sentences.append(tokens)
                for token,tag in tagged_clean:
                    token2pos_tag_counts[token][tag]+=1
            ###
            #used_ids.append(filenumber)


100%|██████████| 19507/19507 [11:17:23<00:00,  2.08s/it]  


In [7]:
token2pos_tag_counts_jsonable = {}

for key in token2pos_tag_counts:
    token2pos_tag_counts_jsonable[key] = dict(token2pos_tag_counts[key])
    
token2pos_tag_counts_jsonable

{'mr': {'NNP': 8088},
 '.': {'.': 4414672,
  'NNP': 33734,
  'FW': 2127,
  'NN': 9991,
  'NNS': 34,
  'CD': 66,
  'SYM': 19,
  ',': 18,
  'JJ': 1,
  'IN': 5,
  'NNPS': 5,
  'RB': 6},
 'in': {'IN': 1695413,
  'RB': 22302,
  'RP': 36306,
  'NNP': 249,
  'NN': 215,
  'FW': 2,
  'NNS': 2,
  'XX': 1,
  'JJ': 2},
 'which': {'WDT': 456704, 'NNP': 10, 'NN': 2},
 'keziah': {'NNP': 44},
 'hears': {'VBZ': 821, 'NNS': 84, 'NNPS': 4, 'NNP': 12},
 'two': {'CD': 136008, 'NNP': 492, 'NN': 4, 'PRP': 1},
 'proposals': {'NNS': 498, 'NNP': 1, 'NNPS': 9},
 'and': {'CC': 2907362, 'JJ': 10, 'NNP': 2, 'RB': 1},
 'the': {'DT': 6312067, 'NN': 16, 'NNP': 1208, 'RB': 26},
 'beginning': {'NN': 7511, 'VBG': 5024, 'NNP': 13, 'JJ': 1},
 'of': {'IN': 3404445,
  'RB': 10542,
  'JJ': 50,
  'RP': 10,
  'NNP': 10,
  'NN': 17,
  'CC': 1,
  'XX': 1},
 'a': {'DT': 2022544,
  'FW': 300,
  'NN': 4008,
  'RB': 352,
  'NNP': 2846,
  'LS': 346,
  'XX': 11,
  'JJ': 2,
  'UH': 4,
  'NNS': 1,
  'VB': 1,
  'VBP': 1},
 'third': {'NNP'

In [8]:
len(tokenized_sentences)

4850043

In [9]:
tokenized_sentences[:100]

[['mr', '.'],
 ['in',
  'which',
  'keziah',
  'hears',
  'two',
  'proposals',
  'and',
  'the',
  'beginning',
  'of',
  'a',
  'third',
  'trumet',
  'in',
  'a',
  'fog',
  ';',
  'a',
  'fog',
  'blown',
  'in',
  'during',
  'the',
  'night',
  'by',
  'the',
  'wind',
  'from',
  'the',
  'wide',
  'atlantic',
  '.'],
 ['the',
  'pole',
  'on',
  'cannon',
  'hill',
  ',',
  'where',
  'the',
  'beacon',
  'was',
  'hoisted',
  'when',
  'the',
  'packet',
  'from',
  'boston',
  'dropped',
  'anchor',
  'in',
  'the',
  'bay',
  ',',
  'was',
  'shiny',
  'and',
  'slippery',
  '.'],
 ['the',
  'new',
  'weathervane',
  ',',
  'a',
  'gilded',
  'whale',
  ',',
  'presented',
  'to',
  'the',
  '"',
  'regular',
  '"',
  'church',
  'by',
  'captain',
  'zebedee',
  'mayo',
  ',',
  'retired',
  'whaler',
  ',',
  'swam',
  'in',
  'a',
  'sea',
  'of',
  'cloud',
  '.'],
 ['the',
  'lichened',
  'eaves',
  'of',
  'the',
  'little',
  '"',
  'come',
  '-',
  'outer',
  '"',
  

In [10]:
with open('gutenberg_token2pos.json','w') as f:
    json.dump(token2pos_tag_counts_jsonable,f)

In [11]:
 with open('gutenberg_sentences.txt', 'a') as f:
    for sent in tokenized_sentences:
        if len(sent)>0:
            f.write(" ".join(sent)+"\n" )

### Amazon data

In [1]:
import random
random.seed('3')

In [2]:
import spacy
nlp = spacy.load("en")

In [3]:
from tqdm import tqdm
import json

In [4]:
urls = [
    '/Volumes/extra_data/reviews_Beauty_5.json.gz',
    '/Volumes/extra_data/reviews_Home_and_Kitchen_5.json.gz',
    '/Volumes/extra_data/reviews_Grocery_and_Gourmet_Food_5.json.gz',
    '/Volumes/extra_data/reviews_Sports_and_Outdoors_5.json.gz',
]

In [5]:
import gzip
def parse(path): 
    g = gzip.open(path, 'r') 
    for l in g: 
        yield eval(l)

In [6]:
from collections import defaultdict

In [7]:
tokenized_sentences_amazon = []
item2count = defaultdict(int)
token2pos_tag_counts_amazon = defaultdict(lambda: defaultdict(int))

for url in tqdm(urls):
    c=0
    x = parse(url)
    #for i in tqdm(x):
    for i in x:
#         item = i['asin']
#         if item2count[item]>7: ## max per item
#             pass
#         else:
        review = nlp(i['reviewText'])
        try:
            sents = list(review.sents)
            sents_ok = [sent for sent in sents if (sent.text[0] in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' and sent.text[-1] in ".?!")]
            sent = random.choice(sents_ok)
            tagged = [(i.text.lower(),i.tag_) for i in sent]
            tagged_clean = [(token,tag) for token,tag in tagged if "\n" not in token]
            tokens = [token for token,tag in tagged_clean]
            tokenized_sentences_amazon.append(tokens)
            for token,tag in tagged_clean:
                token2pos_tag_counts_amazon[token][tag]+=1
            c+=1
            i#tem2count[item]+=1
        except:
            pass
#         if c>50: ## max number
#             print("break!")
#             break

100%|██████████| 4/4 [6:14:43<00:00, 5620.86s/it]  


In [8]:
len(tokenized_sentences_amazon)

1087605

In [9]:
token2pos_tag_counts_amazon_jsonable = {}

for key in token2pos_tag_counts_amazon:
    token2pos_tag_counts_amazon_jsonable[key] = dict(token2pos_tag_counts_amazon[key])
    
token2pos_tag_counts_amazon_jsonable

{'plus': {'CC': 2980, 'NN': 1295, 'NNP': 170, 'JJ': 208, 'IN': 33, 'RB': 3},
 ',': {',': 516569, '.': 16},
 'took': {'VBD': 4314,
  'VBZ': 5,
  'NNP': 8,
  'VBN': 11,
  'VB': 1,
  'NN': 2,
  'VBP': 1},
 'forever': {'NNP': 36, 'RB': 1231, 'NNS': 1, 'NNPS': 1, 'NN': 1},
 'to': {'TO': 284052,
  'IN': 87263,
  'NN': 136,
  'RB': 3,
  'NNP': 3,
  'JJ': 1,
  'RP': 1},
 'arrive': {'VB': 280, 'VBP': 132, 'VBZ': 1, 'JJ': 1, 'NN': 1},
 '.': {'.': 943321,
  'FW': 223,
  'NNP': 409,
  'NN': 81,
  'JJ': 1,
  'RB': 1,
  'CD': 1},
 'this': {'DT': 248430,
  'NNP': 165,
  'NN': 57,
  'RB': 7,
  'XX': 1,
  'VBZ': 2,
  'PRP': 6,
  'JJ': 4,
  'NNS': 8,
  'VBP': 2,
  'WDT': 1,
  'PDT': 1,
  'VB': 1},
 'palette': {'NN': 448, 'NNP': 19, 'VBP': 1, 'VB': 2},
 'conceals': {'NNS': 25, 'VBZ': 4},
 'decently': {'RB': 165},
 'however': {'RB': 9885, 'NNP': 13, 'WRB': 25, 'NN': 1},
 'it': {'PRP': 428158, 'NN': 379, 'NNP': 193, 'VBZ': 5, 'UH': 4},
 'does': {'VBZ': 40447, 'MD': 28, 'NNS': 2, 'NNP': 6},
 'somewhat': {'R

In [10]:
with open('amazon_token2pos.json','w') as f:
    json.dump(token2pos_tag_counts_amazon_jsonable,f)

In [11]:
 with open('amazon_sentences.txt', 'w') as f:
    for sent in tokenized_sentences_amazon:
        if len(sent)>0:
            f.write(" ".join(sent)+"\n" )

## Combine

### Combine tokenized sents and shuffle

In [1]:
import random
random.seed(2091848572)

In [2]:
with open('amazon_sentences.txt','r') as f:
    amz = f.readlines()
    
with open('gutenberg_sentences.txt','r') as f:
    gtb = f.readlines()

In [3]:
len(amz)

1087605

In [4]:
len(gtb)

7923248

In [5]:
combined = amz + gtb

In [6]:
random.shuffle(combined)

In [7]:
with open('gutenberg_and_amazon_sents_tokenized_shuffled.txt','w') as f:
    for sent in combined:
        if len(sent)>0:
            f.write(sent)

### Combine `token2pos` files

In [1]:
import json
with open('amazon_token2pos.json','r') as f:
    amz_token2pos = json.load(f)
    
with open('gutenberg_token2pos.json','r') as f:
    gtb_token2pos = json.load(f)

In [2]:
[t for t in amz_token2pos if t in gtb_token2pos][:10]

['plus',
 ',',
 'took',
 'forever',
 'to',
 'arrive',
 '.',
 'this',
 'palette',
 'conceals']

In [3]:
[t for t in amz_token2pos if t not in gtb_token2pos][:10]

['silicone',
 'xtreme',
 'perfumy',
 'picky',
 'parfume',
 'aveda',
 'ajurvedic',
 'luffy',
 'chamberone',
 'a+']

In [4]:
gtb_token2pos['plus']

{'CC': 480, 'JJ': 12, 'NN': 12, 'NNP': 21, 'IN': 7, 'FW': 1}

Combine the dictionaries (within dictionaries).

In [5]:
for token_ in amz_token2pos:
    if token_ not in gtb_token2pos:
        gtb_token2pos[token_] = amz_token2pos[token_]
    else:
        for postag in amz_token2pos[token_]: ## iterate through pos tags
            ## add if not present, otherwise increment
            if postag not in gtb_token2pos[token_]:
                gtb_token2pos[token_][postag] = amz_token2pos[token_][postag]
            else:
                gtb_token2pos[token_][postag] += amz_token2pos[token_][postag]

In [6]:
gtb_token2pos['silicone']

{'NN': 1553, 'VB': 2, 'NNP': 107, 'JJ': 1}

In [7]:
gtb_token2pos['plus']

{'CC': 3460, 'JJ': 220, 'NN': 1307, 'NNP': 191, 'IN': 40, 'FW': 1, 'RB': 3}

In [10]:
with open('gutenberg_and_amazon_token2pos.json','w') as f:
    json.dump(gtb_token2pos,f)

***