In [79]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from sklearn.decomposition import TruncatedSVD

In [80]:
# English stop words from NLTK's stopwords corpus, held as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Pre-compiled patterns used by clean_text(); every match is replaced with a space.
PATTERN_S = re.compile("\'s")  # matches the two-character sequence `'s`
PATTERN_RN = re.compile("\\r\\n\\b")  # matches a CR+LF pair immediately followed by a word boundary (not a lone `\r` or `\n`)
PATTERN_PUNC = re.compile(r"[^\w\s]")  # matches any single character that is neither a word character (\w) nor whitespace


def clean_text(text):
    """
    Normalize raw text before tokenization.

    The input is lower-cased, then every match of PATTERN_S, PATTERN_RN and
    PATTERN_PUNC (applied in that order) is replaced by a single space.
    Digits are word characters and are therefore left in place.
        text (str): input text
    return (str): the lower-cased text with the matched spans blanked out
    """
    cleaned = text.lower()
    # Order matters: `'s` must be blanked before the apostrophe is treated as punctuation.
    for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
        cleaned = pattern.sub(' ', cleaned)
    return cleaned

def tokenizer(description, stop_words, normalization):
    """
    Tokenize a text, normalize each token, and filter the result.

        description (str): text to tokenize (passed to word_tokenize)
        stop_words (collection of str): lower-case words to drop
        normalization (str): 'lemmatize' (WordNetLemmatizer) or 'stem' (PorterStemmer)
    return (list of str): lower-cased normalized tokens that are purely
        alphabetic, longer than 2 characters and not in stop_words
    raises ValueError: if normalization is not one of the two supported values
    """
    # Validate the mode first so an unsupported value fails with a clear message
    # instead of an UnboundLocalError further down.
    if normalization == 'lemmatize':
        normalize = WordNetLemmatizer().lemmatize
    elif normalization == 'stem':
        normalize = PorterStemmer().stem
    else:
        raise ValueError(
            "normalization must be 'lemmatize' or 'stem', got {!r}".format(normalization)
        )

    tokens = []
    for word in word_tokenize(description):
        token = normalize(word)
        lowered = token.lower()
        # NOTE(review): stop words are checked AFTER normalization, so a stop word
        # whose stem/lemma differs from the word itself is not filtered here.
        if lowered not in stop_words and len(token) > 2 and token.isalpha():
            tokens.append(lowered)

    return tokens
    
def process_query(query, normalization):
    """
    Clean and tokenize a free-text query.

        query (str): raw query text
        normalization (str): 'lemmatize' or 'stem', forwarded to tokenizer()
    return (list of str): tokens produced by tokenizer(clean_text(query), ...)
    """
    # Reuse the module-level English stop-word set instead of re-reading the
    # NLTK corpus on every call; the contents are the same.
    return tokenizer(clean_text(query), stop_words, normalization)

In [81]:
# code example taken from https://towardsdatascience.com/build-a-text-recommendation-system-with-python-e8b95d9f251c
def retrieve_top_n(m, max_docs):
    """
    Return the indices of the best-scoring documents, best first.

        m (np.ndarray): similarity scores, either 1-D (one score per document)
            or 2-D (one row per query vector, one column per document); 2-D
            input is averaged over rows
        max_docs (int): maximum number of indices to return
    return (np.ndarray): document indices sorted by descending score, with
        zero-score documents removed; may hold fewer than max_docs entries
    """
    # Collapse a 2-D score matrix to one mean score per document.
    scores = np.mean(m, axis=0) if len(m.shape) > 1 else m

    order = np.argsort(scores)[::-1]  # highest score first

    # Drop documents whose score is exactly 0. The previous code OR-ed this
    # test with an all-ones mask, which made the filter a no-op.
    nonzero = scores[order] != 0
    return order[nonzero][:max_docs]

In [82]:
class LsiTfidfRecommenderSystem:
    """
    Document retriever based on TF-IDF followed by truncated SVD (LSI).

    Documents are vectorized with TfidfVectorizer, projected to `num_concepts`
    dimensions with TruncatedSVD, and ranked against a query by cosine
    similarity in that reduced space.
    """

    def __init__(self, docs, num_concepts=100, alpha=1.0, beta=0.75, gamma=0.15):
        """
            docs (dict): maps a document id to its tokens; the tokens of each
                document are joined with spaces before vectorization
                (assumes each value is an iterable of str — TODO confirm)
            num_concepts (int): number of SVD components to keep
            alpha, beta, gamma (float): stored on the instance; not used by
                any method of this class
        """
        self.alpha, self.beta, self.gamma = alpha, beta, gamma

        # Row i of doc_vecs belongs to doc_ids[i]; keep the ids so callers can
        # map the positions returned by retrieve_docs() back to documents.
        self.doc_ids = list(docs)

        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer()
        doc_term_mat = self.vec.fit_transform(
            [" ".join(docs[doc_id]) for doc_id in self.doc_ids]
        )
        self.svd = TruncatedSVD(n_components=num_concepts, random_state=42)

        # document vectors in a matrix, one row per document
        self.doc_vecs = self.svd.fit_transform(doc_term_mat)

        # cache: normalized query text -> its vector in the reduced space
        self.q_vecs = {}

    def retrieve_docs(self, query, max_docs=10, normalization='lemmatize'):
        """
        Rank the documents against a raw query.

            query (str): raw query text; it is normalized here via process_query()
            max_docs (int): maximum number of results
            normalization (str): 'lemmatize' or 'stem', forwarded to process_query()
        return: positions of the best-matching documents in the order `docs`
            was iterated at construction time, as returned by retrieve_top_n()
        """
        query = ' '.join(process_query(query, normalization))

        if query not in self.q_vecs:
            q_vec = self.vec.transform([query])
            self.q_vecs[query] = self.svd.transform(q_vec)

        mat = cosine_similarity(self.q_vecs[query], self.doc_vecs)
        return retrieve_top_n(mat, max_docs=max_docs)

In [83]:
# Load the pre-processed corpus and build the {naics: tokens} mapping the model is fit on.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
# presumably 'lemmatized' holds one list of tokens per row — TODO confirm
df = pd.read_pickle(r'assets/processed_df.pkl')
docs = dict(zip(df['naics'], df['lemmatized']))  # rows sharing a 'naics' value collapse to the last one

In [84]:
# Fit the TF-IDF + SVD model on the lemmatized documents (default num_concepts=100).
lsi_model = LsiTfidfRecommenderSystem(docs)

In [85]:
# Example query: positions of up to 10 best-matching documents (max_docs defaults to 10).
best_index = lsi_model.retrieve_docs('Home improvement store', normalization='lemmatize')

In [86]:
# NAICS code -> title lookup table.
naics_titles = pd.read_excel('assets/6-digit_2017_Codes.xlsx')
# Cast the codes to str — presumably to match the dtype of df['naics'] for the merge; TODO confirm
naics_titles['naics'] = naics_titles['naics'].astype(str)

In [87]:
# Attach titles to the corpus frame.
# NOTE(review): this rebinds `df`, so re-running the cell merges again. An outer
# merge can also add rows (codes present only in naics_titles) and may reorder
# them relative to the frame the model was fit on — confirm that the positional
# lookups (df.iloc[...]) used later still line up with the model's rows.
df = df.merge(naics_titles, on='naics', how='outer')

In [88]:
# Show code and title for the retrieved positions.
# NOTE(review): positions come from the model's document order; verify they match df's row order.
df.iloc[best_index][['naics', 'title']]

Unnamed: 0,naics,title
578,442299,All Other Home Furnishings Stores
577,442291,Window Treatment Stores
576,442210,Floor Covering Stores
629,453998,All Other Miscellaneous Store Retailers (excep...
628,453991,Tobacco Stores
627,453930,Manufactured (Mobile) Home Dealers
625,453910,Pet and Pet Supplies Stores
626,453920,Art Dealers
624,453310,Used Merchandise Stores
623,453220,"Gift, Novelty, and Souvenir Stores"


In [89]:
# Relevance judgments: one row per query with the list of NAICS codes judged relevant.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
relevant_naics = pd.read_pickle('assets/relevant_naics_df.pkl')

In [90]:
# Peek at the first few judgments.
relevant_naics.head()

Unnamed: 0,query,relevant_naics
0,Home improvement store,"[444110, 444120, 444130, 444190, 444210, 44422..."
1,Diesel fuel supplier,"[424710, 424720, 424110, 424120, 424130, 42421..."
2,Church,"[813110, 813211, 813212, 813219, 813311, 81331..."
3,Farm,"[115116, 115111, 115112, 115113, 115114, 11511..."
4,Seed supplier,"[424910, 424920, 424930, 424940, 424950, 42499..."


In [91]:
# Number of documents requested per query in the evaluation cells below.
max_docs = 10

In [92]:
def calc_pre_rec_at_n(ret_docs, reljudges, n=-1):
    """
    Calculate precision and recall at n for each query in ret_docs.

        ret_docs (dict): query -> ranked list of retrieved document ids
        reljudges (dict): query -> collection of relevant document ids
        n (int): rank cutoff applied to every ranking; a negative value
            (default -1) evaluates the whole ranking
    return (tuple of dict): (precision per query, recall per query), each
        rounded to 3 decimals. Precision is 0.0 when nothing was retrieved
        and recall is 0.0 when a query has no relevant documents.
    """
    pre_at_n, rec_at_n = {}, {}

    for k, v in ret_docs.items():
        # Apply the cutoff to this ranking. The previous code compared n with
        # the number of queries, which silently skipped the cutoff whenever
        # n exceeded the size of ret_docs.
        top = set(v[:n]) if n > -1 else set(v)

        relevant = reljudges[k]
        hits = len(top.intersection(relevant))

        # Guard both denominators so an empty ranking or an empty judgment
        # list yields 0.0 instead of ZeroDivisionError.
        precision = hits / len(top) if top else 0.0
        recall = hits / len(relevant) if len(relevant) else 0.0

        pre_at_n[k] = round(precision, 3)
        rec_at_n[k] = round(recall, 3)
    return pre_at_n, rec_at_n

In [93]:
def calc_avg_pre(ret_docs, reljudges, cutoff=-1):
    """
    Calculate (mean) average precision for each query in ret_docs.

        ret_docs (dict): query -> ranked list of retrieved document ids
        reljudges (dict): query -> collection of relevant document ids
        cutoff (int): only relevant documents at rank <= cutoff contribute;
            -1 (default) means no cutoff
    return (tuple): (dict of average precision per query, mean of those
        values), all rounded to 3 decimals. A query with no relevant
        documents scores 0.0; the mean is 0.0 when ret_docs is empty.
    """
    avg_pre = {}
    for k, v in ret_docs.items():
        relevant = reljudges[k]
        hits = 0
        precision_sum = 0
        for rank, doc in enumerate(v, start=1):
            # Non-relevant documents and relevant ones past the cutoff add nothing.
            if doc in relevant and (cutoff == -1 or rank <= cutoff):
                hits += 1
                precision_sum += hits / rank

        # Normalize by the number of relevant documents for the query.
        avg_pre[k] = round(precision_sum / len(relevant), 3) if len(relevant) else 0.0

    # The mean is taken over the already-rounded per-query values.
    mean_avg_pre = round(sum(avg_pre.values()) / len(avg_pre), 3) if avg_pre else 0.0

    return avg_pre, mean_avg_pre

In [94]:
import math

def _dcg(gains, limit, base):
    """Discounted cumulative gain over at most `limit` leading values of `gains`."""
    total = 0
    for pos, gain in enumerate(gains):
        if pos >= limit:
            break
        # The first `base` positions are undiscounted; later ones are divided
        # by log_base(rank), where rank = pos + 1.
        total += gain if pos < base else gain / math.log(pos + 1, base)
    return total


def calc_NDCG_at_n(ret_docs, reljudges, n=-1, base=2):
    """
    Calculate NDCG at n for each query in ret_docs.

    Gains are derived from the ORDER of each relevance list: with R relevant
    documents the first one gets gain R+1, the next R, ... down to 2 for the
    last. Every other document gets gain 1.

        ret_docs (dict): query -> ranked list of retrieved document ids
        reljudges (dict): query -> sequence of relevant document ids
        n (int): number of leading positions to score; -1 (default) scores all
        base (int): logarithm base of the rank discount
    return (dict): query -> NDCG rounded to 3 decimals; 0.0 when the ideal
        DCG is 0 (e.g. nothing was retrieved)
    """
    ndcg = {}

    for k, v in ret_docs.items():
        relevant = reljudges[k]
        num_rel = len(relevant)

        # Ideal ranking: the relevant documents in their given order.
        ideals = {doc: num_rel + 1 - pos for pos, doc in enumerate(relevant)}
        # If more documents were retrieved than are relevant, pad the ideal
        # ranking with gain-1 placeholders (keyed by small integers).
        for pad in range(len(v) - num_rel):
            ideals[pad] = 1

        # Gains of the system ranking; unknown documents default to gain 1.
        systems = {doc: ideals.get(doc, 1) for doc in v}

        if n != -1:
            ideal_gains = [g for i, g in enumerate(ideals.values()) if i < n]
            system_gains = [g for i, g in enumerate(systems.values()) if i < n]
        else:
            ideal_gains = list(ideals.values())
            system_gains = list(systems.values())

        ideal_dcg = _dcg(ideal_gains, len(v), base)
        system_dcg = _dcg(system_gains, len(v), base)

        # Avoid ZeroDivisionError when there is nothing to score.
        ndcg[k] = round(system_dcg / ideal_dcg, 3) if ideal_dcg else 0.0

    return ndcg

In [None]:
ret_docs_dic = {}
queries_dic = {}

# Row i of the model's document matrix corresponds to the i-th key of `docs`
# (the dict the model was fit on). Map hits through that ordering instead of
# df.iloc, because `df` has been rebound by an outer merge since the model was built.
doc_ids = list(docs)

for _, row in relevant_naics.iterrows():
    raw_query = row['query']
    # The normalized query text is only used as the dictionary key.
    query_name = ' '.join(process_query(raw_query, 'lemmatize'))
    # retrieve_docs() normalizes its input itself, so pass the raw text;
    # passing query_name would normalize the query twice.
    ret_docs_index = lsi_model.retrieve_docs(raw_query, max_docs=max_docs, normalization='lemmatize')
    ret_docs = [doc_ids[i] for i in ret_docs_index]

    query_docs = row['relevant_naics']
    ret_docs_dic[query_name] = ret_docs
    queries_dic[query_name] = query_docs

In [None]:
# Retrieved NAICS codes per normalized query.
ret_docs_dic

In [None]:
# Relevant NAICS codes per normalized query.
queries_dic

In [None]:
# Precision per query over the full retrieved list (element 0 of the returned pair).
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[0]

In [None]:
# Recall per query over the full retrieved list (element 1 of the returned pair).
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[1]

In [None]:
# Average precision per query, no rank cutoff.
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[0]

In [None]:
# Lemmatizing performs a bit better in the concept space, perhaps because full words group into concepts a little differently?

In [None]:
# Mean average precision across all queries, no rank cutoff.
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[1]

In [None]:
# Mean average precision at each rank cutoff from 1 through 10.
lsi_maps = [
    calc_avg_pre(ret_docs_dic, queries_dic, cutoff=i)[1]
    for i in range(1, 11)
]

In [None]:
# MAP values for cutoffs 1..10, in order.
lsi_maps

In [None]:
# NDCG per query over the full retrieved list, log base 2.
calc_NDCG_at_n(ret_docs_dic, queries_dic, n=-1, base=2)

# Stemming

In [None]:
# Rebuild the {naics: tokens} mapping from the stemmed column.
# NOTE(review): when cells run top to bottom, `df` here is the frame produced by
# the outer merge with naics_titles, not the freshly loaded one — confirm that
# every row has a usable 'stemmed' value.
docs = dict(zip(df['naics'], df['stemmed']))

In [None]:
# Refit the model on the stemmed documents; this replaces the lemmatized model bound to `lsi_model`.
lsi_model = LsiTfidfRecommenderSystem(docs)

In [None]:
ret_docs_dic = {}
queries_dic = {}

# Row i of the model's document matrix corresponds to the i-th key of `docs`
# (the dict the model was fit on); map hits through that ordering rather than
# through df.iloc, which depends on df's current row order.
doc_ids = list(docs)

for _, row in relevant_naics.iterrows():
    raw_query = row['query']
    # The normalized query text is only used as the dictionary key.
    query_name = ' '.join(process_query(raw_query, 'stem'))
    # retrieve_docs() stems its input itself, so pass the raw text;
    # passing query_name would stem the query twice.
    ret_docs_index = lsi_model.retrieve_docs(raw_query, max_docs=max_docs, normalization='stem')
    ret_docs = [doc_ids[i] for i in ret_docs_index]

    query_docs = row['relevant_naics']
    ret_docs_dic[query_name] = ret_docs
    queries_dic[query_name] = query_docs

In [None]:
# Stemmed run: precision per query over the full retrieved list.
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[0]

In [None]:
# Stemmed run: recall per query over the full retrieved list.
calc_pre_rec_at_n(ret_docs_dic, queries_dic, n=-1)[1]

In [None]:
# Stemmed run: average precision per query, no rank cutoff.
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[0]

In [None]:
# Stemmed run: mean average precision across all queries.
calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[1]

In [None]:
# Stemmed run: NDCG per query over the full retrieved list, log base 2.
calc_NDCG_at_n(ret_docs_dic, queries_dic, n=-1, base=2)

In [None]:
# Reload the corpus from disk (replacing the merged frame bound to `df`) and
# rebuild the lemmatized {naics: tokens} mapping for the concept sweep below.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
df = pd.read_pickle(r'assets/processed_df.pkl')
docs = dict(zip(df['naics'], df['lemmatized']))

In [None]:
# Candidate numbers of SVD components (num_concepts) to compare.
concepts = [5, 10, 20, 50, 100, 150, 200]

In [None]:
# Row i of each model's document matrix corresponds to the i-th key of `docs`;
# map hits through that ordering so results do not depend on df's row order.
doc_ids = list(docs)

for concept in concepts:
    lsi_model = LsiTfidfRecommenderSystem(docs, num_concepts=concept)
    ret_docs_dic = {}
    queries_dic = {}

    for _, row in relevant_naics.iterrows():
        raw_query = row['query']
        # The normalized query text is only used as the dictionary key.
        query_name = ' '.join(process_query(raw_query, 'lemmatize'))
        # retrieve_docs() normalizes its input itself (lemmatize by default),
        # so pass the raw text to avoid normalizing the query twice.
        ret_docs_index = lsi_model.retrieve_docs(raw_query, max_docs=max_docs)
        ret_docs = [doc_ids[i] for i in ret_docs_index]

        query_docs = row['relevant_naics']
        ret_docs_dic[query_name] = ret_docs
        queries_dic[query_name] = query_docs

    # Report mean average precision (no cutoff) for this number of concepts.
    mean_avg_pre = calc_avg_pre(ret_docs_dic, queries_dic, cutoff=-1)[1]
    print(f'Number of concepts: {concept} ' + str(mean_avg_pre))