In [157]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

In [164]:
# English stop words from the NLTK corpus, kept in a set for fast membership tests.
stop_words = {*stopwords.words("english")}

PATTERN_S = re.compile(r"'s")  # matches the `'s` suffix (possessives / contractions)
# Matches any run of carriage returns and/or line feeds. The previous pattern
# (`\r\n\b`) only matched a CRLF pair that was directly followed by a word
# character, so lone `\n`/`\r` and trailing line breaks were never matched.
PATTERN_RN = re.compile(r"[\r\n]+")
# Matches every character that is neither a word character (letters, digits,
# underscore) nor whitespace, i.e. punctuation and other symbols.
PATTERN_PUNC = re.compile(r"[^\w\s]")


def clean_text(text):
    """
    Normalise raw text ahead of tokenization.

    The text is lower-cased, then every match of PATTERN_S, PATTERN_RN and
    PATTERN_PUNC (applied in that order) is replaced by a single space.
        text (str): input text
    return (str): the lower-cased text with all matched spans blanked out
    """
    cleaned = text.lower()
    # Order matters: `'s` has to be handled before the punctuation pass strips the apostrophe.
    for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
        cleaned = pattern.sub(' ', cleaned)
    return cleaned

def tokenizer(description, stop_words, normalization):
    """
    Tokenize `description`, normalise every token and drop the noise tokens.

        description (str): text to tokenize
        stop_words (collection of str): lower-case words to discard
        normalization (str): 'lemmatize' (WordNetLemmatizer) or 'stem' (PorterStemmer)
    return (list of str): lower-cased tokens that are purely alphabetic, longer
        than two characters and not contained in `stop_words`
    raises ValueError: if `normalization` is not one of the two supported modes
    """
    # Pick the normaliser first. An unknown mode used to skip both branches and
    # fail later with an UnboundLocalError on `tokens`; it is now rejected
    # explicitly before any tokenization work is done.
    if normalization == 'lemmatize':
        normalize = WordNetLemmatizer().lemmatize
    elif normalization == 'stem':
        normalize = PorterStemmer().stem
    else:
        raise ValueError(
            "unsupported normalization {!r}; expected 'lemmatize' or 'stem'".format(normalization)
        )

    # NOTE(review): stop words are filtered after normalization, so a stop word
    # whose form is altered by the stemmer/lemmatizer may slip through. Confirm
    # how the corpus was pre-processed before changing that order.
    kept = []
    for token in map(normalize, word_tokenize(description)):
        lowered = token.lower()
        # keep only alphabetic tokens of length 3+ that are not stop words
        if lowered not in stop_words and len(token) > 2 and token.isalpha():
            kept.append(lowered)

    return kept
    
def process_query(query, normalization):
    """
    Turn a raw query string into a list of normalised tokens.

        query (str): free-text query
        normalization (str): normalization mode forwarded to `tokenizer`
    return (list of str): tokens produced by `tokenizer` from the cleaned query,
        filtered with the module-level `stop_words`
    """
    cleaned_query = clean_text(query)
    return tokenizer(cleaned_query, stop_words, normalization)

In [165]:
# code example taken from https://towardsdatascience.com/build-a-text-recommendation-system-with-python-e8b95d9f251c
def retrieve_top_n(m, max_docs):
    """
    Rank documents by similarity score and return the indices of the best ones.

        m (array-like): similarity scores, either one score per document (1-D)
            or one row of scores per query vector (2-D); rows are averaged
            into a single score per document
        max_docs (int): maximum number of indices to return
    return (np.ndarray): document indices ordered from the highest to the lowest
        score; documents whose score is exactly 0 are left out, so fewer than
        `max_docs` indices may come back
    """
    scores = np.asarray(m)
    if scores.ndim > 1:
        scores = scores.mean(axis=0)

    ranked = np.argsort(scores)[::-1]  # best score first

    # Drop documents without any similarity to the query. The previous mask was
    # OR-ed with an all-ones array, which made it always true and silently kept
    # zero-score documents in the result.
    has_similarity = scores[ranked] != 0
    return ranked[has_similarity][:max_docs]

In [166]:
class TfidfRecommenderSystem:
    """
    TF-IDF document retrieval with an interactive relevance-feedback step.

    The document collection is vectorised once with a TfidfVectorizer. Queries
    are lemmatized, projected into the same vector space, cached in `q_vecs`
    and ranked against the documents by cosine similarity. `gather_feedback`
    rewrites the cached query vector from the user's relevance judgements, so
    later calls to `retrieve_docs` with the same query use the adjusted vector.
    """

    def __init__(self, docs, num_concepts=100, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        """
            docs (dict): document id -> iterable of string tokens (joined with spaces)
            num_concepts (int): not used by this implementation; kept so existing calls keep working
            min_df (int or float): minimum document frequency forwarded to TfidfVectorizer
            alpha (float): weight of the current query vector in the feedback update
            beta (float): weight added for each document judged relevant
            gamma (float): weight subtracted for each document not judged relevant
        """
        self.alpha, self.beta, self.gamma = alpha, beta, gamma

        # create a doc-term matrix out of our doc collection
        # (`min_df` used to be accepted but ignored; the default of 1 matches
        # the vectorizer's own default, so default behaviour is unchanged)
        self.vec = TfidfVectorizer(min_df=min_df)
        self.doc_vecs = self.vec.fit_transform([" ".join(tokens) for tokens in docs.values()])

        # cache: normalised query string -> query vector (updated by feedback)
        self.q_vecs = {}

    @staticmethod
    def _query_key(query):
        # Canonical cache key: the lemmatized query tokens joined by single spaces.
        return ' '.join(process_query(query, 'lemmatize'))

    @staticmethod
    def _feedback_at(feedback, rank):
        # Judgement for the document shown at `rank`; a missing entry (or no feedback at all) counts as 0.
        try:
            return feedback[rank]
        except (TypeError, IndexError, KeyError):
            return 0

    def _query_vector(self, key):
        # Return the cached vector for `key`, vectorising the key on first use.
        if key not in self.q_vecs:
            self.q_vecs[key] = self.vec.transform([key])
        return self.q_vecs[key]

    def _rank(self, key, max_docs):
        # Rank all documents against the (possibly feedback-adjusted) vector stored for `key`.
        similarities = cosine_similarity(self._query_vector(key), self.doc_vecs)
        return retrieve_top_n(similarities, max_docs=max_docs)

    def retrieve_docs(self, query, max_docs=15):
        """
        Return the row indices (into the document matrix) of the best matches.

            query (str): free-text query
            max_docs (int): maximum number of documents to return
        return: indices as produced by `retrieve_top_n`, best match first
        """
        return self._rank(self._query_key(query), max_docs)

    def gather_feedback(self, query, max_docs=15, feedback=None):
        """
        This function models the interactive relevance feedback loop

            query (str): free-text query
            max_docs (int): number of documents retrieved and judged
            feedback (sequence or mapping, optional): judgement per rank of the
                retrieved documents; a value equal to 1 marks the document as
                relevant, anything else (or a missing entry) as not relevant
        return: None; the cached vector for this query is replaced in place
        """
        # Step 1: Normalise the query exactly once. The query used to be
        # normalised here and again inside retrieve_docs, so the vector was
        # cached under the twice-processed key and looked up under the
        # once-processed one.
        key = self._query_key(query)

        # Step 2: Retrieve the required number of docs in response to the query
        ret_docs = self._rank(key, max_docs)

        # display docs to user
        # receive feedback from user

        # Step 3: Re-weight the query vector from the user's judgement at each rank
        updated = self.q_vecs[key] * self.alpha
        for rank, doc in enumerate(ret_docs):
            if self._feedback_at(feedback, rank) == 1:
                updated = updated + self.doc_vecs[doc] * self.beta
            else:
                updated = updated - self.doc_vecs[doc] * self.gamma

        # negative term weights are clipped to zero
        updated[updated < 0] = 0
        self.q_vecs[key] = updated

In [167]:
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle(r'assets/processed_df.pkl')
# Map each value of the 'naics' column to the matching 'lemmatized' entry
# (on duplicate codes the later row wins).
docs = {naics_code: tokens for naics_code, tokens in zip(df['naics'], df['lemmatized'])}

In [168]:
# Fit the TF-IDF model on the whole document collection (default weights).
tfidf_model = TfidfRecommenderSystem(docs=docs)

In [169]:
# Rank the documents against an example query; the result holds row indices, best match first.
best_index = tfidf_model.retrieve_docs(query='Home improvement store')

In [170]:
# Load the NAICS title table and cast the code column to str
# (presumably so it matches the dtype of df['naics'] for the merge — TODO confirm).
naics_titles = (
    pd.read_excel('assets/6-digit_2017_Codes.xlsx')
    .assign(naics=lambda frame: frame['naics'].astype(str))
)

In [171]:
# Attach the title columns to the processed frame, keeping codes that appear in either table.
# NOTE(review): this rebinds `df`, so re-running the cell merges the titles a second time.
df = pd.merge(df, naics_titles, on='naics', how='outer')

In [172]:
# Show the code and description of the best-matching rows.
# NOTE(review): `best_index` holds row positions of the matrix built from `docs`; this
# positional lookup assumes `df` still has that row order after the outer merge — confirm.
df[['naics', 'description']].iloc[best_index]

Unnamed: 0,naics,description
578,442299,Bath shops All Other Home Furnishings Stores ...
629,453998,Architectural supply stores All Other Miscella...
628,453991,Cigar stores Tobacco Stores This U S industr...
577,442291,Curtain and drapery stores packaged Window Tr...
627,453930,Manufactured mobile home dealers Manufacture...
625,453910,Feed stores pet Pet and Pet Supplies Stores ...
626,453920,Art auctions Art Dealers This industry compri...
624,453310,Antique dealers except motor vehicles Used M...
594,445299,Coffee and tea i e packaged stores All Oth...
581,444110,Home centers building materials Home Centers ...
