# Extract Ngram Frequency 

### Imports

In [1]:
import requests 
import urllib 
import inflect
import time
import csv
from requests import JSONDecodeError
import numpy as np

### Functions

In [2]:

def getSub(cue, phrase, p):
    """Append number-swapped variants of the word that precedes ``cue``.

    ``phrase`` is a comma-separated list of terms. For every term that
    contains ``cue`` as a whole space-delimited word, the word immediately
    before the first occurrence of ``cue`` is switched between singular and
    plural, and the resulting term is appended to the list.

    Args:
        cue: Marker word to look for (the caller passes 'in', 'and', "'s",
            'of' or 'for').
        phrase: Comma-separated terms to expand.
        p: Object exposing ``singular_noun(word)`` (which returns ``False``
            when it cannot singularize ``word``) and ``plural(word)``; the
            caller passes an ``inflect`` engine.

    Returns:
        ``phrase`` followed by one additional comma-separated variant for
        each term that could be expanded. Terms without a usable word in
        front of ``cue`` contribute nothing.
    """
    variants = []
    for term in phrase.split(','):
        words = term.split(' ')
        # A variant can lack the cue (e.g. the cue was the final word and was
        # already pluralized by the caller); skip it instead of letting
        # list.index raise ValueError.
        if cue not in words:
            continue
        idx = words.index(cue) - 1
        # When the cue is the first word nothing precedes it; index -1 would
        # silently wrap around to the last word and emit duplicate variants.
        if idx < 0:
            continue
        word = words[idx]
        singular = p.singular_noun(word)
        words[idx] = p.plural(word) if singular is False else singular
        variants.append(" ".join(words))
    return ",".join([phrase] + variants)
    

def getPlurals(grams):
    """Expand each term into a comma-separated list of number variants.

    For every term the final word is switched between singular and plural
    and the result is appended after the original term. If the term contains
    one of the cue words, the word in front of that cue is additionally
    varied via ``getSub``.

    Args:
        grams: Iterable of space-separated terms.

    Returns:
        A list with one comma-separated string per input term, in the same
        order; the original term is always the first entry of its string.
    """
    p = inflect.engine()
    # Cue words in priority order; only the first one found in a term is used.
    cues = ('in', 'and', "'s", 'of', 'for')
    pluralGrams = []
    for term in grams:
        words = term.split(' ')
        finalWord = words[-1]
        singular = p.singular_noun(finalWord)
        # singular_noun returning False means the word was not singularized,
        # so the plural form is used as the alternative instead.
        replace = p.plural(finalWord) if singular is False else singular

        # Join the full word list so a single-word term does not end up with
        # a leading space (" man" instead of "man").
        term2 = " ".join(words[:-1] + [replace])
        phrase = term + "," + term2
        for cue in cues:
            if cue in words:
                phrase = getSub(cue, phrase, p)
                break
        pluralGrams.append(phrase)
    return pluralGrams

def getNgram(query, startYear, endYear, corpus='en-2019', smoothing=3):
    """Fetch and sum Google Books Ngram time series for ``query``.

    The query is sent to the ``books.google.com/ngrams/json`` endpoint with
    ``case_insensitive=true``. The ``timeseries`` of every returned entry
    whose ``ngram`` label does not contain ``'(All)'`` is added up.

    Args:
        query: Term, or comma-separated terms, to look up (unquoted).
        startYear: First year of the requested range.
        endYear: Last year of the requested range (inclusive).
        corpus: Corpus identifier passed through to the service.
        smoothing: Smoothing value passed through to the service.

    Returns:
        A numpy array of length ``endYear - startYear + 1`` with the summed
        frequencies; ``None`` when the service returns an empty list; or
        ``False`` when the response body is not valid JSON (the response is
        printed in that case; recorded runs show HTTP 429 here).
    """
    # Local import: a plain ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    import urllib.parse

    query = urllib.parse.quote(query)
    # Adjacent literals instead of a backslash continuation: the continuation
    # embedded the next line's indentation (spaces) in the middle of the URL.
    url = (
        'https://books.google.com/ngrams/json'
        f'?content={query}&year_start={startYear}&year_end={endYear}'
        f'&corpus={corpus}&smoothing={smoothing}&case_insensitive=true'
    )
    response = requests.get(url)

    try:
        output = response.json()
    except JSONDecodeError:
        print(response)
        return False

    if len(output) == 0:
        return None

    frequency = np.zeros(endYear - startYear + 1)
    for entry in output:
        # Entries labelled '(All)' are skipped; presumably they are the
        # aggregate of the case variants and would double count - confirm.
        if '(All)' not in entry['ngram']:
            frequency += np.array(entry['timeseries'])
    return frequency


In [6]:
# Base names without extension: the terms are read from '<in_filename>.txt'
# and the frequencies are written to '<out_filename>.csv'.
in_filename, out_filename = 'mGrams', 'mGrams'

In [7]:
# Parallel lists filled from the input file: term text, the year parsed from
# the third column, and the identifier from the first column.
lcshGrams = []
yrs = []
indices = []
with open(f'{in_filename}.txt') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip blank lines (e.g. a trailing empty line); they would otherwise
        # break the three-way unpacking below with a ValueError.
        if not line.strip():
            continue
        # Each data line must hold exactly three tab-separated fields.
        idx, gram, year = line.split('\t')
        lcshGrams.append(gram)
        yrs.append(int(year))
        indices.append(idx)

# One comma-separated string of singular/plural variants per term.
gramsPlural = getPlurals(lcshGrams)
print(len(gramsPlural))

371


In [8]:
# Year range requested from the Ngram service; it also defines the per-year
# CSV columns, so the two can no longer drift apart.
start_year, end_year = 1970, 2019

i = 0
term = None  # keeps the final status line valid when there is nothing to process
# 'Index' labels the indices[i] value that every row starts with; without it
# the header was one column shorter than the data rows.
# NOTE(review): readers that relied on the short header (e.g. pandas inferring
# the first column as the index) must now select this column explicitly.
headers = ['Index', 'Ngram', 'Year_Added'] + list(range(start_year, end_year + 1))

with open(f'{out_filename}.csv', 'w', newline='') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(headers)
    print(f'Processing Ngrams for {out_filename}')
    while i < len(gramsPlural):
        gram = gramsPlural[i]
        # The first comma-separated entry is the original, unmodified term.
        term = gram.split(',')[0]
        if i % 10 == 1:
            print(f'{i} file(s) already processed. Currently processing {term}.')
        freq = getNgram(gram, start_year, end_year)
        if freq is False:
            # getNgram got a non-JSON reply (recorded runs show HTTP 429):
            # wait, then retry the same index.
            print(f'pausing at index {i}, term {term}')
            time.sleep(20)
            continue
        if freq is not None:
            csvwriter.writerow([indices[i], term, yrs[i]] + freq.tolist())
        # A None result (empty reply) is skipped without writing a row.
        i += 1

print(f'Stopped at index {i}, term {term}')

Processing Ngrams for mGrams
1 file(s) already processed. Currently processing eating disorders in men.
11 file(s) already processed. Currently processing depression in men.
<Response [429]>
pausing at index 17, term latter day saint men
21 file(s) already processed. Currently processing male language.
31 file(s) already processed. Currently processing african american bisexual men.
41 file(s) already processed. Currently processing male preschool teachers.
<Response [429]>
pausing at index 47, term grief in men
51 file(s) already processed. Currently processing african american male college students.
61 file(s) already processed. Currently processing teenage boys on television.
71 file(s) already processed. Currently processing male homosexuality in motion pictures.
<Response [429]>
pausing at index 77, term latter day saint boys
81 file(s) already processed. Currently processing fathers of prime ministers.
91 file(s) already processed. Currently processing male artists.
101 file(s) a

### Extraction

In [11]:
# Base names without extension: the terms are read from '<in_filename>.txt'
# and the frequencies are written to '<out_filename>.csv'. The recorded run
# of this section printed "Processing Ngrams for wGrams"; the saved empty
# strings would make a fresh run try to open '.txt' and write '.csv'.
in_filename = 'wGrams'  # NOTE(review): assumed to match out_filename, as in the mGrams run - confirm
out_filename = 'wGrams'

In [4]:
# Parallel lists filled from the input file: term text, the year parsed from
# the third column, and the identifier from the first column.
lcshGrams = []
yrs = []
indices = []
with open(f'{in_filename}.txt') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip blank lines (e.g. a trailing empty line); they would otherwise
        # break the three-way unpacking below with a ValueError.
        if not line.strip():
            continue
        # Each data line must hold exactly three tab-separated fields.
        idx, gram, year = line.split('\t')
        lcshGrams.append(gram)
        yrs.append(int(year))
        indices.append(idx)

# One comma-separated string of singular/plural variants per term.
gramsPlural = getPlurals(lcshGrams)
print(len(gramsPlural))

1873


In [5]:
# Year range requested from the Ngram service; it also defines the per-year
# CSV columns, so the two can no longer drift apart.
start_year, end_year = 1970, 2019

i = 0
term = None  # keeps the final status line valid when there is nothing to process
# 'Index' labels the indices[i] value that every row starts with; without it
# the header was one column shorter than the data rows.
# NOTE(review): readers that relied on the short header (e.g. pandas inferring
# the first column as the index) must now select this column explicitly.
headers = ['Index', 'Ngram', 'Year_Added'] + list(range(start_year, end_year + 1))

with open(f'{out_filename}.csv', 'w', newline='') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(headers)
    print(f'Processing Ngrams for {out_filename}')
    while i < len(gramsPlural):
        gram = gramsPlural[i]
        # The first comma-separated entry is the original, unmodified term.
        term = gram.split(',')[0]
        if i % 10 == 1:
            print(f'{i} file(s) already processed. Currently processing {term}.')
        freq = getNgram(gram, start_year, end_year)
        if freq is False:
            # getNgram got a non-JSON reply (recorded runs show HTTP 429):
            # wait, then retry the same index.
            print(f'pausing at index {i}, term {term}')
            time.sleep(20)
            continue
        if freq is not None:
            csvwriter.writerow([indices[i], term, yrs[i]] + freq.tolist())
        # A None result (empty reply) is skipped without writing a row.
        i += 1

print(f'Stopped at index {i}, term {term}')

Processing Ngrams for wGrams
1 file(s) already processed. Currently processing overweight women in art.
11 file(s) already processed. Currently processing boxing for women.
21 file(s) already processed. Currently processing women tap dancers.
<Response [429]>
pausing at index 30, term women wood pulp industry workers
31 file(s) already processed. Currently processing javanese women.
41 file(s) already processed. Currently processing women transport workers.
51 file(s) already processed. Currently processing women curlers.
<Response [429]>
pausing at index 60, term jewish religious education of women
61 file(s) already processed. Currently processing women military cadets.
71 file(s) already processed. Currently processing women violists.
81 file(s) already processed. Currently processing adnyamathanha women.
<Response [429]>
pausing at index 90, term respiratory diseases in women
91 file(s) already processed. Currently processing internet and women.
101 file(s) already processed. Curre