The final version of the recommender system demonstrates how the system improves after receiving user feedback. It uses the LSI model from before, but takes the relevance judgments as feedback to improve the next iteration. The only drawback in this scenario is that Rocchio feedback only works for queries that have been seen before: while the system improves greatly on those queries, it does nothing to improve the results of the LSI model for a query that has not been seen before.

In [39]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from sklearn.decomposition import TruncatedSVD
import json

In [40]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Every match of the patterns below is replaced by a space in clean_text().
PATTERN_S = re.compile(r"'s")  # the suffix `'s` (possessive / contraction)
# Any run of carriage returns and/or line feeds.  The previous pattern
# (`\r\n\b`) only matched a CRLF that was directly followed by a word
# character, so lone `\n` / `\r` were never matched.
PATTERN_RN = re.compile(r"[\r\n]+")
# Anything that is neither a word character (letters, digits, `_`) nor whitespace.
PATTERN_PUNC = re.compile(r"[^\w\s]")


def clean_text(text):
    """
    Normalize raw text before it is tokenized.
        text (str): input text
    return (str): the lower-cased text in which every match of PATTERN_S,
        PATTERN_RN and PATTERN_PUNC (applied in that order) has been
        replaced by a single space. With PATTERN_PUNC as defined in this
        file, digits and underscores are kept.
    """
    cleaned = text.lower()
    for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
        cleaned = pattern.sub(' ', cleaned)
    return cleaned

def tokenizer(description, stop_words, normalization):
    """
    Tokenize a text, normalize every token and drop uninformative tokens.
        description (str): text to tokenize
        stop_words (collection of str): lower-case words to discard
        normalization (str): 'lemmatize' (WordNetLemmatizer) or 'stem' (PorterStemmer)
    return (list of str): lower-cased, normalized tokens that are purely
        alphabetic, longer than 2 characters and not in stop_words
    raises ValueError: if normalization is neither 'lemmatize' nor 'stem'
    """
    # Validate up front: an unknown mode previously left `tokens` unbound and
    # surfaced as a confusing UnboundLocalError further down.
    if normalization == 'lemmatize':
        normalize = WordNetLemmatizer().lemmatize
    elif normalization == 'stem':
        normalize = PorterStemmer().stem
    else:
        raise ValueError(
            f"normalization must be 'lemmatize' or 'stem', got {normalization!r}"
        )

    tokens = [normalize(w) for w in word_tokenize(description)]

    # The filters run on the normalized token: stop words are matched on the
    # lower-cased form, and only alphabetic tokens of length 3+ are kept.
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered not in stop_words and len(token) > 2 and token.isalpha():
            kept.append(lowered)

    return kept
    
def process_query(query, normalization):
    """
    Turn a free-text query into normalized tokens.
        query (str): raw query text
        normalization (str): normalization mode forwarded to tokenizer()
    return (list of str): tokens of the cleaned query with English stop words removed
    """
    english_stop_words = set(stopwords.words("english"))
    cleaned_query = clean_text(query)
    return tokenizer(cleaned_query, english_stop_words, normalization)

In [41]:
# code example taken from https://towardsdatascience.com/build-a-text-recommendation-system-with-python-e8b95d9f251c
def retrieve_top_n(m, max_docs):
    """
    Return the indices of the best-scoring documents, best first.
        m (np.ndarray): similarity scores, either 1-D (one score per document)
            or 2-D, in which case the scores are averaged over axis 0 first
        max_docs (int): maximum number of indices to return
    return (np.ndarray): document indices ordered by descending score.
        Documents whose score is exactly 0 are left out, so fewer than
        max_docs indices (possibly none) may be returned.
    """
    cos_sim = np.mean(m, axis=0) if len(m.shape) > 1 else m
    index = np.argsort(cos_sim)[::-1]  # highest score first
    # Drop documents with zero similarity.  The previous code OR-ed this
    # condition with an all-ones mask, which always evaluated to True and
    # therefore never filtered anything.
    nonzero = cos_sim[index] != 0
    return index[nonzero][:max_docs]

In [42]:
class RocchioRecommenderSystem:
    """
    LSI recommender whose query vectors are refined with Rocchio feedback.

    Documents are vectorized with TF-IDF and projected with a truncated SVD;
    queries are projected the same way and ranked by cosine similarity.
    Query vectors are cached per normalized query string, so feedback given
    for a query only influences later retrievals of that same query.
    """

    def __init__(self, docs, num_concepts=350, alpha=1.0, beta=0.75, gamma=0.15):
        """
            docs (dict): maps a document id to its tokens (joined with spaces
                before vectorization); row i of self.doc_vecs belongs to the
                i-th entry of the dict in iteration order
            num_concepts (int): number of SVD components
            alpha (float): weight of the current query vector in the update
            beta (float): weight of each relevant document vector
            gamma (float): weight of each non-relevant document vector
        """
        self.alpha, self.beta, self.gamma = alpha, beta, gamma

        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer()
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.svd = TruncatedSVD(n_components=num_concepts, random_state=42)

        self.doc_vecs = self.svd.fit_transform(doc_term_mat)  # document vectors in a matrix
        self.q_vecs = {}  # normalized query string -> query vector

    def _ensure_query_vector(self, query, normalization):
        """Normalize query once, cache its LSI vector if new, and return the cache key."""
        key = ' '.join(process_query(query, normalization))
        if key not in self.q_vecs:
            self.q_vecs[key] = self.svd.transform(self.vec.transform([key]))
        return key

    def retrieve_docs(self, query, max_docs=10, normalization='lemmatize'):
        """
            query (str): raw query text; it is normalized here, so callers
                should not pre-process it
            max_docs (int): maximum number of documents to return
            normalization (str): 'lemmatize' or 'stem'
        return: row indices into self.doc_vecs as produced by retrieve_top_n
        """
        key = self._ensure_query_vector(query, normalization)
        similarities = cosine_similarity(self.q_vecs[key], self.doc_vecs)
        return retrieve_top_n(similarities, max_docs=max_docs)

    def gather_feedback(self, query, max_docs=10, feedback=None, normalization='lemmatize'):
        """
        Apply one Rocchio update to the cached vector of query.

            query (str): raw query text (normalized here)
            max_docs (int): unused; kept so existing callers keep working
            feedback (iterable or None): (label, doc_index) pairs where label
                1 marks a relevant document, any other label a non-relevant
                one, and doc_index is a row of self.doc_vecs.  If a doc_index
                occurs several times the last label wins.  None is treated as
                "no feedback".
            normalization (str): 'lemmatize' or 'stem'

        The update is q = alpha*q + beta*sum(relevant) - gamma*sum(non-relevant),
        after which negative components are set to 0.
        """
        # The query is normalized exactly once, with the requested mode.  The
        # previous code normalized it here and then again (always with
        # 'lemmatize') inside retrieve_docs, so the two cache keys could
        # differ and the update below could raise KeyError.
        key = self._ensure_query_vector(query, normalization)

        if feedback is None:
            feedback = []

        # map document index to a binary relevance label
        judged = {}
        for feed in feedback:
            judged[feed[1]] = 1 if feed[0] == 1 else 0

        q_vec = np.dot(self.alpha, self.q_vecs[key])
        for doc_idx, relevant in judged.items():
            if relevant == 1:
                q_vec += np.dot(self.beta, self.doc_vecs[doc_idx])
            else:
                q_vec -= np.dot(self.gamma, self.doc_vecs[doc_idx])
        # NOTE(review): SVD components can be legitimately negative, so this
        # clipping may discard signal -- confirm it is intended for LSI vectors.
        q_vec[q_vec < 0] = 0
        self.q_vecs[key] = q_vec

In [43]:
# Build the recommender on the lemmatized corpus and show a sample query.
df = pd.read_pickle(r'assets/processed_df.pkl')
docs = dict(zip(df['naics'], df['lemmatized']))
model = RocchioRecommenderSystem(docs, beta=1.2, gamma=.05)
best_index = model.retrieve_docs('lawyer office')

# Attach the human-readable NAICS titles.
naics_titles = pd.read_excel('assets/6-digit_2017_Codes.xlsx')
naics_titles['naics'] = naics_titles['naics'].astype(str)
# The model returns row positions of the corpus it was built from, and every
# later cell resolves them with df.iloc.  A left join keeps the rows of df in
# their original order; the previous outer join sorts by key and can add rows
# for codes that only exist in the title sheet, which would shift positions.
# NOTE(review): this still assumes one title row per NAICS code -- verify.
df = df.merge(naics_titles, on='naics', how='left')
df.iloc[best_index][['naics', 'title']]

Unnamed: 0,naics,title
127,238220,"Plumbing, Heating, and Air-Conditioning Contra..."
126,238210,Electrical Contractors and Other Wiring Instal...
136,238990,All Other Specialty Trade Contractors
129,238310,Drywall and Insulation Contractors
118,238110,Poured Concrete Foundation and Structure Contr...
135,238910,Site Preparation Contractors
120,238130,Framing Contractors
130,238320,Painting and Wall Covering Contractors
121,238140,Masonry Contractors
122,238150,Glass and Glazing Contractors


In [44]:
# Benchmark queries with their relevance judgments; the cells below read the
# columns `query` and `relevant_naics` (a list of NAICS codes per query).
relevant_naics = pd.read_pickle('assets/relevant_naics_df.pkl')

In [45]:
relevant_naics

Unnamed: 0,query,relevant_naics
0,Home improvement store,"[444110, 444120, 444130, 444190]"
1,Diesel fuel supplier,"[424710, 424720]"
2,Church,[813110]
3,Farm,"[115116, 115111, 115112, 115113, 115114, 115115]"
4,Seed supplier,"[424910, 424920, 424930, 424940, 424950, 424990]"
...,...,...
1149,State department agricultural development,"[926140, 926110, 926120, 926130, 926150]"
1150,Military base,"[928110, 928120]"
1151,Embassy,"[928120, 928110]"
1152,Window supplier,"[321911, 321912, 321918, 321920, 321991, 32199..."


In [46]:
# Number of documents requested per query in the feedback and evaluation cells.
max_docs = 10

In [47]:
# Simulate one round of relevance feedback for every benchmark query.
# NOTE(review): feedback is generated for the *judged-relevant* codes only; a
# relevant code that was not retrieved is labelled 0 and is therefore
# subtracted from the query vector. Confirm this labelling is intended.
for _, row in relevant_naics.iterrows():
    best_index = model.retrieve_docs(row['query'], max_docs=max_docs, normalization='lemmatize')
    retrieved_codes = set(df.iloc[best_index]['naics'])
    feedback = []
    for rel_code in row['relevant_naics']:
        code = str(rel_code)  # df['naics'] holds strings
        matches = np.where(df['naics'].values == code)[0]
        if matches.size == 0:
            # Previously a bare `except` printed an undefined/stale `idx` and
            # then reused it, attaching feedback to the wrong document.
            print(f"NAICS code {code} not found in df; skipping")
            continue
        idx = matches[0]
        feedback.append((1 if code in retrieved_codes else 0, idx))
    model.gather_feedback(row['query'], feedback=feedback, max_docs=max_docs, normalization='lemmatize')

In [48]:
df.iloc[model.retrieve_docs('air conditioning contractor', normalization='lemmatize')][['naics', 'title']]

Unnamed: 0,naics,title
127,238220,"Plumbing, Heating, and Air-Conditioning Contra..."
126,238210,Electrical Contractors and Other Wiring Instal...
128,238290,Other Building Equipment Contractors
134,238390,Other Building Finishing Contractors
123,238160,Roofing Contractors
132,238340,Tile and Terrazzo Contractors
124,238170,Siding Contractors
129,238310,Drywall and Insulation Contractors
125,238190,"Other Foundation, Structure, and Building Exte..."
130,238320,Painting and Wall Covering Contractors


In [11]:
def calc_pre_rec_at_n(ret_docs, reljudges, n=-1):
    """
    Calculate precision and recall at n for each query in ret_docs.
        ret_docs (dict): query -> ranked list of retrieved document ids
        reljudges (dict): query -> collection of relevant document ids
        n (int): rank cutoff; -1 (default) evaluates the whole ranking
    return (tuple of dict): (precision per query, recall per query), rounded
        to 3 decimals. Precision is 0.0 when nothing was retrieved and recall
        is 0.0 when a query has no relevant documents.
    """
    pre_at_n, rec_at_n = {}, {}

    for k, v in ret_docs.items():
        # The cutoff applies to this query's ranking.  It used to be compared
        # against len(ret_docs), i.e. the number of queries, so it was ignored
        # whenever n exceeded the number of queries.
        retrieved = set(v[:n]) if n > -1 else set(v)
        relevant = set(reljudges[k])
        n_hits = len(retrieved & relevant)

        precision = n_hits / len(retrieved) if retrieved else 0.0
        recall = n_hits / len(relevant) if relevant else 0.0
        pre_at_n[k] = round(precision, 3)
        rec_at_n[k] = round(recall, 3)
    return pre_at_n, rec_at_n

In [12]:
def calc_avg_pre(ret_docs, reljudges, cutoff=-1):
    """
    Calculate (mean) average precision for each query in ret_docs.
        ret_docs (dict): query -> ranked list of retrieved document ids
        reljudges (dict): query -> collection of relevant document ids
        cutoff (int): only ranks <= cutoff can contribute; -1 (default)
            evaluates the whole ranking
    return (tuple): (dict of average precision per query, mean average
        precision), both rounded to 3 decimals. A query without relevant
        documents scores 0.0, and the mean is 0.0 when there are no queries.
    """
    avg_pre = {}
    for k, v in ret_docs.items():
        relevant = set(reljudges[k])
        # Documents past the cutoff never contribute, so they can be skipped.
        ranked = v if cutoff == -1 else v[:max(cutoff, 0)]

        hits = 0
        precision_sum = 0.0
        for rank, doc in enumerate(ranked, start=1):
            if doc in relevant:
                hits += 1
                precision_sum += hits / rank  # precision at this relevant rank

        # Normalize by the number of relevant documents, so relevant
        # documents that were never retrieved count as precision 0.
        avg_pre[k] = round(precision_sum / len(relevant), 3) if relevant else 0.0

    mean_avg_pre = round(sum(avg_pre.values()) / len(avg_pre), 3) if avg_pre else 0.0

    return avg_pre, mean_avg_pre

In [13]:
import math

def _dcg(gains, base):
    """Discounted cumulative gain of gains; ranks <= base are not discounted."""
    total = 0
    for i, gain in enumerate(gains):
        total += gain if i < base else gain / math.log(i + 1, base)
    return total


def calc_NDCG_at_n(ret_docs, reljudges, n=-1, base=2):
    """
    Calculate NDCG at n for each query in ret_docs.
        ret_docs (dict): query -> ranked list of retrieved document ids
        reljudges (dict): query -> ordered list of relevant document ids
        n (int): rank cutoff; -1 (default) evaluates the whole ranking
        base (int): logarithm base of the rank discount
    return (dict): query -> NDCG rounded to 3 decimals (0.0 when the ideal
        DCG is 0, e.g. for an empty ranking)

    Gains are graded by position in reljudges: with R relevant documents the
    first one listed has gain R + 1 and the last one gain 2.  Every other
    document has gain 1 (not 0), so a ranking without any relevant document
    still scores above 0.
    """
    ndcg = {}

    for k, v in ret_docs.items():
        relevant = reljudges[k]
        n_rel = len(relevant)
        gains = {doc: n_rel + 1 - i for i, doc in enumerate(relevant)}

        # Ideal ranking: relevant documents in judged order, padded with
        # gain-1 fillers up to the length of the system ranking.  The fillers
        # are kept in a plain list; they used to be stored in the same dict
        # under integer keys and overwrote the gains of integer document ids.
        ideal_gains = list(gains.values()) + [1] * max(0, len(v) - n_rel)

        # System ranking: one gain per distinct retrieved document, in order
        # of first appearance.
        system_gains = list({doc: gains.get(doc, 1) for doc in v}.values())

        if n != -1:
            limit = max(n, 0)
            ideal_gains = ideal_gains[:limit]
            system_gains = system_gains[:limit]

        # Neither ranking is scored deeper than the system ranking is long.
        ideal_dcg = _dcg(ideal_gains[:len(v)], base)
        system_dcg = _dcg(system_gains[:len(v)], base)

        ndcg[k] = round(system_dcg / ideal_dcg, 3) if ideal_dcg else 0.0

    return ndcg

In [14]:
# Collect, per normalized query, the retrieved NAICS codes and the judged ones.
ret_docs_dic = {}
queries_dic = {}

for _, row in relevant_naics.iterrows():
    raw_query = row['query']
    query_name = ' '.join(process_query(raw_query, 'lemmatize'))
    # retrieve_docs normalizes the query itself, so it gets the raw text.
    # Passing the already-normalized string normalized it a second time, and
    # the resulting cache key could differ from the one that received feedback.
    ret_docs_index = model.retrieve_docs(raw_query, max_docs=max_docs, normalization='lemmatize')

    ret_docs_dic[query_name] = df.iloc[ret_docs_index]['naics'].tolist()
    queries_dic[query_name] = row['relevant_naics']

In [15]:
ret_docs_dic

{'home improvement store': ['442299',
  '442291',
  '453930',
  '453998',
  '453310',
  '453991',
  '453910',
  '453920',
  '442110',
  '442210'],
 'diesel fuel supplier': ['447190',
  '447110',
  '454310',
  '454390',
  '336310',
  '333618',
  '445120',
  '336214',
  '332912',
  '332919'],
 'church': ['813110',
  '813930',
  '813410',
  '813910',
  '813219',
  '813990',
  '813311',
  '813940',
  '813312',
  '813212'],
 'farm': ['115115',
  '111310',
  '115113',
  '111130',
  '111140',
  '115112',
  '115116',
  '115114',
  '111110',
  '111920'],
 'seed supplier': ['311919',
  '311911',
  '111191',
  '111219',
  '111199',
  '111211',
  '111120',
  '111160',
  '111150',
  '111110'],
 'accountant': ['541211',
  '541219',
  '541213',
  '541214',
  '541120',
  '541110',
  '541199',
  '541191',
  '541420',
  '541930'],
 'trucking company': ['484110',
  '484122',
  '484121',
  '484220',
  '484230',
  '484210',
  '493110',
  '488510',
  '493120',
  '493190'],
 'exporter': ['522220',
  '522292'

In [16]:
queries_dic

{'home improvement store': ['444110', '444120', '444130', '444190'],
 'diesel fuel supplier': ['424710', '424720'],
 'church': ['813110'],
 'farm': ['115116', '115111', '115112', '115113', '115114', '115115'],
 'seed supplier': ['424910', '424920', '424930', '424940', '424950', '424990'],
 'accountant': ['541219', '541211', '541213', '541214'],
 'trucking company': ['484110', '484121', '484122'],
 'exporter': ['522293',
  '522291',
  '522292',
  '522294',
  '522298',
  '522210',
  '522220'],
 'grain elevator': ['493130', '493110', '493120', '493190'],
 'popcorn store': ['445299', '445291', '445292', '445210', '445220', '445230'],
 'agricultural service': ['115116',
  '115111',
  '115112',
  '115113',
  '115114',
  '115115'],
 'warehouse': ['493110', '493120', '493130', '493190'],
 'agricultural production': ['111998',
  '111991',
  '111992',
  '111910',
  '111920',
  '111930',
  '111940'],
 'ranch': ['112130', '112111', '112112', '112120'],
 'holding company': ['551112', '551111', '551

In [17]:
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[0]

{'home improvement store': 0.0,
 'diesel fuel supplier': 0.0,
 'church': 0.1,
 'farm': 0.5,
 'seed supplier': 0.0,
 'accountant': 0.4,
 'trucking company': 0.3,
 'exporter': 0.6,
 'grain elevator': 0.0,
 'popcorn store': 0.6,
 'agricultural service': 0.0,
 'warehouse': 0.4,
 'agricultural production': 0.0,
 'ranch': 0.0,
 'holding company': 0.3,
 'farm equipment supplier': 0.0,
 'store': 0.0,
 'grocery store': 0.2,
 'rice mill': 0.7,
 'food product supplier': 0.0,
 'accounting firm': 0.4,
 'produce market': 0.6,
 'pet supply store': 0.5,
 'wholesaler': 0.0,
 'produce wholesaler': 0.0,
 'distribution service': 0.0,
 'crop grower': 0.4,
 'addiction treatment center': 0.0,
 'natural good store': 0.0,
 'orchard': 0.4,
 'lumber store': 0.0,
 'mine': 0.5,
 'transportation service': 0.2,
 'investment company': 0.0,
 'fruit wholesaler': 0.4,
 'real estate agency': 0.1,
 'event venue': 0.0,
 'frozen dessert supplier': 0.5,
 'wine wholesaler importer': 0.2,
 'winery': 0.6,
 'aerospace company': 

In [18]:
# Export the per-query precision and recall scores as JSON.
# Both come from one evaluation pass instead of recomputing it per file.
precision_by_query, recall_by_query = calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)

with open(r'assets/rocchio-pre.json', 'w') as f:
    json.dump(precision_by_query, f)

with open(r'assets/rocchio-re.json', 'w') as f:
    json.dump(recall_by_query, f)

In [19]:
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[1]

{'home improvement store': 0.0,
 'diesel fuel supplier': 0.0,
 'church': 1.0,
 'farm': 0.833,
 'seed supplier': 0.0,
 'accountant': 1.0,
 'trucking company': 1.0,
 'exporter': 0.857,
 'grain elevator': 0.0,
 'popcorn store': 1.0,
 'agricultural service': 0.0,
 'warehouse': 1.0,
 'agricultural production': 0.0,
 'ranch': 0.0,
 'holding company': 1.0,
 'farm equipment supplier': 0.0,
 'store': 0.0,
 'grocery store': 1.0,
 'rice mill': 1.0,
 'food product supplier': 0.0,
 'accounting firm': 1.0,
 'produce market': 1.0,
 'pet supply store': 1.0,
 'wholesaler': 0.0,
 'produce wholesaler': 0.0,
 'distribution service': 0.0,
 'crop grower': 0.571,
 'addiction treatment center': 0.0,
 'natural good store': 0.0,
 'orchard': 0.444,
 'lumber store': 0.0,
 'mine': 0.833,
 'transportation service': 1.0,
 'investment company': 0.0,
 'fruit wholesaler': 0.444,
 'real estate agency': 1.0,
 'event venue': 0.0,
 'frozen dessert supplier': 1.0,
 'wine wholesaler importer': 1.0,
 'winery': 1.0,
 'aerospac

In [20]:
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[0]

{'home improvement store': 0.0,
 'diesel fuel supplier': 0.0,
 'church': 1.0,
 'farm': 0.561,
 'seed supplier': 0.0,
 'accountant': 1.0,
 'trucking company': 1.0,
 'exporter': 0.837,
 'grain elevator': 0.0,
 'popcorn store': 0.958,
 'agricultural service': 0.0,
 'warehouse': 1.0,
 'agricultural production': 0.0,
 'ranch': 0.0,
 'holding company': 1.0,
 'farm equipment supplier': 0.0,
 'store': 0.0,
 'grocery store': 1.0,
 'rice mill': 0.982,
 'food product supplier': 0.0,
 'accounting firm': 1.0,
 'produce market': 0.855,
 'pet supply store': 1.0,
 'wholesaler': 0.0,
 'produce wholesaler': 0.0,
 'distribution service': 0.0,
 'crop grower': 0.571,
 'addiction treatment center': 0.0,
 'natural good store': 0.0,
 'orchard': 0.422,
 'lumber store': 0.0,
 'mine': 0.833,
 'transportation service': 1.0,
 'investment company': 0.0,
 'fruit wholesaler': 0.341,
 'real estate agency': 1.0,
 'event venue': 0.0,
 'frozen dessert supplier': 1.0,
 'wine wholesaler importer': 1.0,
 'winery': 1.0,
 'ae

In [21]:
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[1]

0.6

In [22]:
# Mean average precision at every rank cutoff from 1 to 10.
rocchio_maps = [
    calc_avg_pre(ret_docs_dic, queries_dic, cutoff=cutoff)[1]
    for cutoff in range(1, 11)
]

In [23]:
rocchio_maps

[0.211, 0.354, 0.444, 0.512, 0.551, 0.574, 0.585, 0.592, 0.597, 0.6]

In [24]:
calc_NDCG_at_n(ret_docs_dic, queries_dic, n=-1, base=2)

{'home improvement store': 0.375,
 'diesel fuel supplier': 0.637,
 'church': 1.0,
 'farm': 0.581,
 'seed supplier': 0.244,
 'accountant': 1.0,
 'trucking company': 0.966,
 'exporter': 0.769,
 'grain elevator': 0.375,
 'popcorn store': 0.948,
 'agricultural service': 0.244,
 'warehouse': 0.919,
 'agricultural production': 0.203,
 'ranch': 0.375,
 'holding company': 1.0,
 'farm equipment supplier': 0.244,
 'store': 0.637,
 'grocery store': 1.0,
 'rice mill': 0.986,
 'food product supplier': 0.148,
 'accounting firm': 1.0,
 'produce market': 0.956,
 'pet supply store': 0.964,
 'wholesaler': 0.637,
 'produce wholesaler': 0.148,
 'distribution service': 0.203,
 'crop grower': 0.514,
 'addiction treatment center': 0.637,
 'natural good store': 0.299,
 'orchard': 0.681,
 'lumber store': 0.375,
 'mine': 0.664,
 'transportation service': 1.0,
 'investment company': 0.375,
 'fruit wholesaler': 0.513,
 'real estate agency': 1.0,
 'event venue': 0.483,
 'frozen dessert supplier': 0.912,
 'wine who

In [25]:
# Export the per-query NDCG scores as JSON.
# (This cell used to dump the recall scores into the NDCG file.)
with open(r'assets/rocchio-ndcg.json', 'w') as f:
    json.dump(calc_NDCG_at_n(ret_docs_dic, queries_dic, n=-1, base=2), f)

# Stemming

In [26]:
# Rebuild the corpus from the stemmed token column (NAICS code -> tokens).
docs = dict(zip(df['naics'], df['stemmed']))

In [27]:
# New model on the stemmed corpus with the same beta/gamma as the lemmatized
# run; a new instance starts with an empty query-vector cache.
model = RocchioRecommenderSystem(docs, beta=1.2, gamma=.05)

In [266]:
# Simulate one round of relevance feedback for every benchmark query (stemmed model).
# NOTE(review): feedback is generated for the *judged-relevant* codes only; a
# relevant code that was not retrieved is labelled 0 and is therefore
# subtracted from the query vector. Confirm this labelling is intended.
for _, row in relevant_naics.iterrows():
    best_index = model.retrieve_docs(row['query'], max_docs=max_docs, normalization='stem')
    retrieved_codes = set(df.iloc[best_index]['naics'])
    feedback = []
    for rel_code in row['relevant_naics']:
        code = str(rel_code)  # df['naics'] holds strings
        matches = np.where(df['naics'].values == code)[0]
        if matches.size == 0:
            # A judged code that is missing from df used to abort the whole
            # loop with an IndexError; skip it instead.
            print(f"NAICS code {code} not found in df; skipping")
            continue
        idx = matches[0]
        feedback.append((1 if code in retrieved_codes else 0, idx))
    model.gather_feedback(row['query'], feedback=feedback, max_docs=max_docs, normalization='stem')

In [267]:
# Collect, per stemmed query, the retrieved NAICS codes and the judged ones.
ret_docs_dic = {}
queries_dic = {}

for _, row in relevant_naics.iterrows():
    raw_query = row['query']
    query_name = ' '.join(process_query(raw_query, 'stem'))
    # retrieve_docs stems the query itself, so it gets the raw text.  Stemming
    # an already-stemmed string can change it again, which made the lookup
    # miss the cached vector that received feedback.
    ret_docs_index = model.retrieve_docs(raw_query, max_docs=max_docs, normalization='stem')

    ret_docs_dic[query_name] = df.iloc[ret_docs_index]['naics'].tolist()
    queries_dic[query_name] = row['relevant_naics']

In [49]:
df.iloc[model.retrieve_docs('air conditioning contractor', normalization='stem')][['naics', 'title']]

Unnamed: 0,naics,title
127,238220,"Plumbing, Heating, and Air-Conditioning Contra..."
126,238210,Electrical Contractors and Other Wiring Instal...
129,238310,Drywall and Insulation Contractors
136,238990,All Other Specialty Trade Contractors
118,238110,Poured Concrete Foundation and Structure Contr...
135,238910,Site Preparation Contractors
120,238130,Framing Contractors
130,238320,Painting and Wall Covering Contractors
121,238140,Masonry Contractors
122,238150,Glass and Glazing Contractors


In [268]:
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[0]

{'home improv store': 0.0,
 'diesel fuel supplier': 0.0,
 'church': 0.1,
 'farm': 0.0,
 'seed supplier': 0.0,
 'account': 0.4,
 'truck compani': 0.3,
 'export': 0.0,
 'grain elev': 0.0,
 'popcorn store': 0.4,
 'agricultur servic': 0.6,
 'warehous': 0.4,
 'agricultur product': 0.0,
 'ranch': 0.4,
 'hold compani': 0.3,
 'farm equip supplier': 0.0,
 'store': 0.0,
 'groceri store': 0.2,
 'rice mill': 0.7,
 'food product supplier': 0.0,
 'account firm': 0.4,
 'produc market': 0.6,
 'pet suppli store': 0.5,
 'wholesal': 0.0,
 'produc wholesal': 0.1,
 'distribut servic': 0.0,
 'crop grower': 0.6,
 'addict treatment center': 0.0,
 'natur good store': 0.0,
 'orchard': 0.6,
 'lumber store': 0.0,
 'mine': 0.5,
 'transport servic': 0.2,
 'invest compani': 0.0,
 'fruit wholesal': 0.1,
 'real estat agenc': 0.1,
 'event venu': 0.0,
 'frozen dessert supplier': 0.5,
 'wine wholesal import': 0.2,
 'wineri': 0.6,
 'aerospac compani': 0.6,
 'cold storag facil': 0.4,
 'employ agenc': 0.4,
 'plant nurseri':

In [269]:
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[1]

{'home improv store': 0.0,
 'diesel fuel supplier': 0.0,
 'church': 1.0,
 'farm': 0.0,
 'seed supplier': 0.0,
 'account': 1.0,
 'truck compani': 1.0,
 'export': 0.0,
 'grain elev': 0.0,
 'popcorn store': 0.667,
 'agricultur servic': 1.0,
 'warehous': 1.0,
 'agricultur product': 0.0,
 'ranch': 1.0,
 'hold compani': 1.0,
 'farm equip supplier': 0.0,
 'store': 0.0,
 'groceri store': 1.0,
 'rice mill': 1.0,
 'food product supplier': 0.0,
 'account firm': 1.0,
 'produc market': 1.0,
 'pet suppli store': 1.0,
 'wholesal': 0.0,
 'produc wholesal': 0.111,
 'distribut servic': 0.0,
 'crop grower': 0.857,
 'addict treatment center': 0.0,
 'natur good store': 0.0,
 'orchard': 0.667,
 'lumber store': 0.0,
 'mine': 0.833,
 'transport servic': 1.0,
 'invest compani': 0.0,
 'fruit wholesal': 0.111,
 'real estat agenc': 1.0,
 'event venu': 0.0,
 'frozen dessert supplier': 1.0,
 'wine wholesal import': 1.0,
 'wineri': 1.0,
 'aerospac compani': 1.0,
 'cold storag facil': 1.0,
 'employ agenc': 1.0,
 'pla

In [270]:
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[0]

{'home improv store': 0.0,
 'diesel fuel supplier': 0.0,
 'church': 1.0,
 'farm': 0.0,
 'seed supplier': 0.0,
 'account': 1.0,
 'truck compani': 1.0,
 'export': 0.0,
 'grain elev': 0.0,
 'popcorn store': 0.633,
 'agricultur servic': 1.0,
 'warehous': 1.0,
 'agricultur product': 0.0,
 'ranch': 1.0,
 'hold compani': 1.0,
 'farm equip supplier': 0.0,
 'store': 0.0,
 'groceri store': 1.0,
 'rice mill': 1.0,
 'food product supplier': 0.0,
 'account firm': 1.0,
 'produc market': 0.877,
 'pet suppli store': 1.0,
 'wholesal': 0.0,
 'produc wholesal': 0.012,
 'distribut servic': 0.0,
 'crop grower': 0.857,
 'addict treatment center': 0.0,
 'natur good store': 0.0,
 'orchard': 0.63,
 'lumber store': 0.0,
 'mine': 0.759,
 'transport servic': 1.0,
 'invest compani': 0.0,
 'fruit wholesal': 0.011,
 'real estat agenc': 1.0,
 'event venu': 0.0,
 'frozen dessert supplier': 1.0,
 'wine wholesal import': 0.583,
 'wineri': 1.0,
 'aerospac compani': 1.0,
 'cold storag facil': 1.0,
 'employ agenc': 1.0,
 '

In [271]:
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[1]

0.549

In [272]:
calc_NDCG_at_n(ret_docs_dic, queries_dic, n=-1, base=2)

{'home improv store': 0.375,
 'diesel fuel supplier': 0.637,
 'church': 1.0,
 'farm': 0.244,
 'seed supplier': 0.244,
 'account': 1.0,
 'truck compani': 0.966,
 'export': 0.203,
 'grain elev': 0.375,
 'popcorn store': 0.892,
 'agricultur servic': 0.829,
 'warehous': 0.876,
 'agricultur product': 0.203,
 'ranch': 0.938,
 'hold compani': 1.0,
 'farm equip supplier': 0.244,
 'store': 0.637,
 'groceri store': 1.0,
 'rice mill': 0.968,
 'food product supplier': 0.148,
 'account firm': 1.0,
 'produc market': 0.96,
 'pet suppli store': 0.979,
 'wholesal': 0.637,
 'produc wholesal': 0.193,
 'distribut servic': 0.203,
 'crop grower': 0.687,
 'addict treatment center': 0.637,
 'natur good store': 0.299,
 'orchard': 0.789,
 'lumber store': 0.375,
 'mine': 0.678,
 'transport servic': 1.0,
 'invest compani': 0.375,
 'fruit wholesal': 0.225,
 'real estat agenc': 1.0,
 'event venu': 0.483,
 'frozen dessert supplier': 0.924,
 'wine wholesal import': 0.955,
 'wineri': 0.874,
 'aerospac compani': 0.798,

In [274]:
best_index = model.retrieve_docs('lawyer office', normalization='stem')
df.iloc[best_index][['naics', 'title']]

Unnamed: 0,naics,title
787,541110,Offices of Lawyers
790,541199,All Other Legal Services
789,541191,Title Abstract and Settlement Offices
788,541120,Offices of Notaries
908,621399,Offices of All Other Miscellaneous Health Prac...
907,621391,Offices of Podiatrists
622,453210,Office Supplies and Stationery Stores
906,621340,"Offices of Physical, Occupational and Speech T..."
1036,922130,Legal Counsel and Prosecution
489,339940,Office Supplies (except Paper) Manufacturing
