In [100]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import json
import pandas as pd
stop_words = set(stopwords.words("english"))

def tokenizer(description, stop_words, normalization):
    
    if normalization == 'lemmatize':
        # tokenize and lemmatize text
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(w) for w in word_tokenize(description)]
        
    elif normalization == 'stem':
        # tokenize and stem text
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(w) for w in word_tokenize(description)]
    
   # remove tokens length of 2 or below and make all lowercase and remove stop words
    tokens = [w.lower() for w in tokens if (w.lower() not in stop_words) and (len(w) > 2) and(w.isalpha())]
    
    return tokens    
    
def process_query(query, normalization):
    
    stop_words = set(stopwords.words("english"))
    
    return tokenizer(query, stop_words, normalization)

In [53]:
def retrieve_n_rank_docs(inverted_index, query, normalization, max_docs=-1):
    ret_docs = {}
    
    counts = {}
    query = process_query(query, normalization
                         )
    for word in query:
        try:
            docs = inverted_index.get(word)
            for k, v in docs.items():
                if k in counts:
                    counts[k] += v
                else:
                    counts[k] = v

        except Exception as E:
            pass
        break
    counts = sorted(counts.items(), key=lambda x: (x[1], -int(x[0][1:])), reverse=True)
    if max_docs > -1:
        ret_docs[' '.join(query)] = [x[0] for x in counts][:max_docs]
    else:
        ret_docs[' '.join(query)] = [x[0] for x in counts]
        
    return ret_docs

In [38]:
# IMPORT INVERTED_INDEXES
with open(r'assets/inverted_index_stem.json') as f:
    inverted_index_stem = json.load(f)
    
with open(r'assets/inverted_index_lem.json') as f:
    inverted_index_lem = json.load(f)

In [72]:
naics_titles = pd.read_excel('assets/6-digit_2017_Codes.xlsx')
naics_titles['naics'] = naics_titles['naics'].astype(str)

## Stemmed Word Count Model

In [87]:
stem_df = pd.DataFrame(retrieve_n_rank_docs(inverted_index_lem, 'Home improvement store', 'stem'))
stem_df.columns.values[0] = 'naics'
stem_df = stem_df.merge(naics_titles, on='naics', how='outer')
stem_df[['naics', 'title']].head(10)

Unnamed: 0,naics,title
0,321999,All Other Miscellaneous Wood Product Manufactu...
1,453998,All Other Miscellaneous Store Retailers (excep...
2,423220,Home Furnishing Merchant Wholesalers
3,454390,Other Direct Selling Establishments
4,333111,Farm Machinery and Equipment Manufacturing
5,321920,Wood Container and Pallet Manufacturing
6,423390,Other Construction Material Merchant Wholesalers
7,236117,New Housing For-Sale Builders
8,442299,All Other Home Furnishings Stores
9,623990,Other Residential Care Facilities


In [89]:
stem_df = pd.DataFrame(retrieve_n_rank_docs(inverted_index_lem, 'musical instrument store', 'stem'))
stem_df.columns.values[0] = 'naics'
stem_df = stem_df.merge(naics_titles, on='naics', how='outer')
stem_df[['naics', 'title']].head(10)

Unnamed: 0,naics,title
0,453998,All Other Miscellaneous Store Retailers (excep...
1,511199,All Other Publishers
2,611610,Fine Arts Schools
3,512230,Music Publishers
4,511120,Periodical Publishers
5,511130,Book Publishers
6,621340,"Offices of Physical, Occupational and Speech T..."
7,453310,Used Merchandise Stores
8,512290,Other Sound Recording Industries
9,511140,Directory and Mailing List Publishers


## Lemmatized Word Count Model

In [91]:
lem_df = pd.DataFrame(retrieve_n_rank_docs(inverted_index_lem, 'Home improvement store', 'lemmatize'))
lem_df.columns.values[0] = 'naics'
lem_df = lem_df.merge(naics_titles, on='naics', how='outer')
lem_df[['naics', 'title']].head(10)

Unnamed: 0,naics,title
0,321999,All Other Miscellaneous Wood Product Manufactu...
1,453998,All Other Miscellaneous Store Retailers (excep...
2,423220,Home Furnishing Merchant Wholesalers
3,454390,Other Direct Selling Establishments
4,333111,Farm Machinery and Equipment Manufacturing
5,321920,Wood Container and Pallet Manufacturing
6,423390,Other Construction Material Merchant Wholesalers
7,236117,New Housing For-Sale Builders
8,442299,All Other Home Furnishings Stores
9,623990,Other Residential Care Facilities


In [98]:
lem_df = pd.DataFrame(retrieve_n_rank_docs(inverted_index_lem, 'musical instrument store', 'lemmatize'))
lem_df.columns.values[0] = 'naics'
lem_df = lem_df.merge(naics_titles, on='naics', how='outer')
lem_df[['naics', 'title']].head(10)

Unnamed: 0,naics,title
0,711510,"Independent Artists, Writers, and Performers"
1,711130,Musical Groups and Artists
2,711310,"Promoters of Performing Arts, Sports, and Simi..."
3,711320,"Promoters of Performing Arts, Sports, and Simi..."
4,711110,Theater Companies and Dinner Theaters
5,711219,Other Spectator Sports
6,339992,Musical Instrument Manufacturing
7,711410,"Agents and Managers for Artists, Athletes, Ent..."
8,711211,Sports Teams and Clubs
9,339999,All Other Miscellaneous Manufacturing
