# Starting Code for Exercise 6

### Import Modules and Download Data

In [1]:
import re
import requests
from io import StringIO
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
url_data = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTxbA16lnYbtH-j6PPrPogc6ft03gp0y5mmo1Nq3l-Pxnb05nP1C-mOxUYvTciA2gq5nkwAqz9Y7Imi/pub?gid=646892609&single=true&output=tsv'

In [3]:
def load_dataset(url):
    r = requests.get(url)
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    return df

In [4]:
df = load_dataset(url_data)

### Inspect the Dataset

In [5]:
df.head(15)

Unnamed: 0,name,description,country,founding_date,relevancy
0,Pandora Car Rental,"Welcome to Pandora Car Rental, Car Hire and Ai...",United Kingdom,2011-04-05,0
1,SurplusMatch,SurplusMatch is an online marketplace for cont...,United Kingdom,2008-01-01,2
2,Gimenez Ganga,Giménez Ganga is a company that has been provi...,Switzerland,1959-01-01,0
3,SMC3,"Freight shippers, motor carriers, logistics se...",United States,1935-01-01,0
4,Much Asphalt,Much Asphalt is southern Africa’s commercial s...,South Africa,1965-01-01,0
5,The Hisey Company,The Hisey Company provides quality arbor care ...,United States,2011-02-19,0
6,"FREIGHTALIA, LTD.",#1 Automatic quoting system ever created for F...,United Kingdom,2015-09-26,0
7,Instant Access Au,Instant Access is a provider of Access equipme...,Australia,1968-01-01,1
8,CANOR International,CANOR International provides project managemen...,Hungary,1993-01-01,0
9,LISUTO,LISUTO is a Multi-language batch exhibition sy...,Japan,2016-11-01,1


### Preprocess the Data

In [6]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
stemmer = PorterStemmer()

def prep_process_tokenize(text):
    #websites, email and any punctuation cleaning
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower() # lower case the text
    text = nltk.word_tokenize(text)
    #removing stopwords
    text = [word for word in text if word not in stopwords]
    #stemming
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1]
    except IndexError:
        pass
    return text


def pre_process(text):
    return " ".join(prep_process_tokenize(text))

### Tf-Idf Based Approach (Vector Space Modeling)

In [8]:
tfidf = TfidfVectorizer(preprocessor=pre_process).fit_transform(df.description)
#convert text into TFIDF feature values (numerical form)
#https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
# https://kanoki.org/2018/12/27/text-matching-cosine-similarity/

# Similarity for Company: Vahanalytics
doc_index_to_compare = df.index[df['name'] == "Vahanalytics"].tolist()[0]
top_k = 5

cosine_similarities = cosine_similarity(tfidf[doc_index_to_compare:doc_index_to_compare + 1], tfidf).flatten()
related_docs_indices = cosine_similarities.argsort()[:-top_k - 1:-1]

tfidf_result_df = df[df.index.isin(related_docs_indices)]
tfidf_result_df

Unnamed: 0,name,description,country,founding_date,relevancy
93,Ship Supplies Direct,We aim to use digital technology to transform ...,Singapore,2018-05-14,1
656,BISAF,BISAF is a technological company for the const...,United Kingdom,2006-05-01,1
695,Vahanalytics,Vahanalytics aims to create better drivers and...,India,2016-01-01,1
1542,GeoSpock,GeoSpock brings together their expertise of bi...,United Kingdom,2013-01-01,1
1982,Axenda,Axenda is a cloud-based software platform for ...,Mexico,2017-01-01,2


In [9]:
# Similarity for Company: Much Asphalt
doc_index_to_compare_company = df.index[df['name'] == "Much Asphalt"].tolist()[0]
top_k_company = 5

cosine_similarities_company = cosine_similarity(tfidf[doc_index_to_compare_company:doc_index_to_compare_company + 1], tfidf).flatten()
related_docs_indices_company = cosine_similarities_company.argsort()[:-top_k_company - 1:-1]

tfidf_result_df_company = df[df.index.isin(related_docs_indices_company)]
tfidf_result_df_company

Unnamed: 0,name,description,country,founding_date,relevancy
4,Much Asphalt,Much Asphalt is southern Africa’s commercial s...,South Africa,1965-01-01,0
57,Sunland Asphalt,"Sunland Asphalt, a commercial asphalt paving c...",United States,1979-01-01,0
618,Central-Allied Enterprises,Central States Construction was founded in 192...,United States,1929-01-01,0
862,FAST FELT,"The patented product FAST FELT®, with its plas...",United States,2007-01-01,0
1443,Saldus Celinieks,Saldus Celinieks is specialising in road const...,Latvia,1991-01-01,1


In [10]:
# # see similarity score of Much Asphalt w.r.t other companies
# similarity_score = cosine_similarity(tfidf[doc_index_to_compare_company:doc_index_to_compare_company + 1], tfidf, 3)
# for i in range(2000):
#   print(i, similarity_score[0,i])

## Topic Modeling Using LDA

In [11]:
from gensim import models, corpora, similarities
from nltk import FreqDist
import numpy as np
from scipy.stats import entropy

#### You can use the steps outlined below to train the LDA-algorithm and solve the coding section of part 2.

1. Apply the pre_process function to the description-column to create a new column called `tokenized`. This is the column we plan to use for training the LDA-algorithm.
2. Using this new column `tokenized`, find the 5000 most common tokens.
3. Remove all tokens that are not in the 5000 most common tokens from the column `tokenized`. 
4. Implement and execute the `train_lda`-function.
5. Use the `show_topic`-method to inspect the resulting topics.
6. Convert the LDA-results to a 2D array to use as a document-matrix.
7. Extract the LDA-results for `Much Asphalt` and `Vahanalytics` and use them as a query vector to extract the 5 most closest matches using `get_top_k_similar_docs`.

In [12]:
# implement steps 1 to step 7

from collections import Counter

tokenized = []
wordcount = {}

# apply pre_process and look for common words
for des in df.description.values:
 sentence = pre_process(des)
 tokenized.append(sentence)
 for word in sentence.split():
   if word not in wordcount:
    wordcount[word] = 1
   else:
    wordcount[word] += 1

# new column tokenized
df['tokenized'] = tokenized

# list of common words
word_counter = Counter(wordcount)
common_word_ls= []
for wor_d, count in word_counter.most_common(5000):
    common_word_ls.append(wor_d)

print('Common Words: ', common_word_ls)    

# remove words which are not common in tokenized column
f = lambda x: ' '.join([item for item in x.split() if item in common_word_ls])
df["tokenized"] = df["tokenized"].apply(f)

df.head()

Common Words:  ['servic', 'compani', 'provid', 'construct', 'manag', 'product', 'custom', 'solut', 'industri', 'logist', 'project', 'offer', 'includ', 'develop', 'busi', 'oper', 'design', 'build', 'technolog', 'platform', 'system', 'transport', 'engin', 'use', 'deliveri', 'also', 'softwar', 'market', 'manufactur', 'work', 'equip', 'base', 'time', 'need', 'client', 'suppli', 'process', 'qualiti', 'home', 'found', 'cost', 'help', 'user', 'year', 'new', 'onlin', 'us', 'contractor', 'deliv', 'mobil', 'group', 'integr', 'creat', 'one', 'commerci', 'special', 'lead', 'distribut', 'materi', 'profession', 'experi', 'price', 'team', 'intern', 'plan', 'make', 'data', 'support', 'world', 'global', 'allow', 'well', 'network', 'locat', 'unit', 'order', 'ship', 'best', 'track', 'model', 'tool', 'inc', 'gener', 'chain', 'enabl', 'innov', 'rang', 'area', 'connect', 'commun', 'freight', 'retail', 'sector', 'improv', 'truck', 'infrastructur', 'vehicl', 'structur', 'effici', 'applic', 'consult', 'inform'

Unnamed: 0,name,description,country,founding_date,relevancy,tokenized
0,Pandora Car Rental,"Welcome to Pandora Car Rental, Car Hire and Ai...",United Kingdom,2011-04-05,0,welcom pandora car rental car hire airport tra...
1,SurplusMatch,SurplusMatch is an online marketplace for cont...,United Kingdom,2008-01-01,2,surplusmatch onlin marketplac contractor merch...
2,Gimenez Ganga,Giménez Ganga is a company that has been provi...,Switzerland,1959-01-01,0,gimnez ganga compani provid solut window sunli...
3,SMC3,"Freight shippers, motor carriers, logistics se...",United States,1935-01-01,0,freight shipper motor carrier logist servic pr...
4,Much Asphalt,Much Asphalt is southern Africa’s commercial s...,South Africa,1965-01-01,0,much asphalt southern africa commerci supplier...


In [13]:
import gensim
from gensim.corpora.dictionary import Dictionary

# Convert document to tokens
doc_processed = [doc.split() for doc in df['tokenized']]

# Mapping from token to id in each document
dictionary = corpora.Dictionary(doc_processed)

# Prepapre a document term matrix, mapping of (word_id, word_frequency).
# Representing the corpus as a bag of words
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_processed]

#Lda model
Lda = gensim.models.ldamodel.LdaModel
#Lda model to get the num_topics, number of topic required, 
#passses is the number training do you want to perform
#2 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.
ldamodel = Lda(doc_term_matrix, num_topics=2, id2word = dictionary, passes=5)

In [14]:
# Convert the LDA-results to a 2D array to use as a document-matrix.
doc_topic_dist = [[tup[1] for tup in lst] for lst in ldamodel[doc_term_matrix]]

# column names
topicnames = ['Topic_' + str(i) for i in range(2)]
docnames = df.name.values

# create a dataframe
df_document_topic_dist = pd.DataFrame(doc_topic_dist, columns=topicnames, index=docnames)
df_document_topic_dist.head()

Unnamed: 0,Topic_0,Topic_1
Pandora Car Rental,0.992102,
SurplusMatch,0.02911,0.97089
Gimenez Ganga,0.927762,0.072238
SMC3,0.614007,0.385993
Much Asphalt,0.080684,0.919316


In [15]:
# Extract the LDA-results for Much Asphalt and Vahanalytics and use them as a query vector to extract the 5 most closest matches using get_top_k_similar_docs.
asphalt = doc_processed[df.index[df['name'] == 'Much Asphalt'].tolist()[0]]
asphalt_bow = dictionary.doc2bow(asphalt)
asphalt_doc_distribution = np.array([tup[1] for tup in ldamodel.get_document_topics(bow=asphalt_bow)])

vahanalytics = doc_processed[df.index[df['name'] == 'Vahanalytics'].tolist()[0]]
vahanalytics_bow = dictionary.doc2bow(vahanalytics)
vahanalytics_doc_distribution = np.array([tup[1] for tup in ldamodel.get_document_topics(bow=vahanalytics_bow)])

asphalt_doc_distribution, vahanalytics_doc_distribution

(array([0.08070377, 0.91929626], dtype=float32),
 array([0.50929666, 0.49070337], dtype=float32))

In [16]:
from scipy.spatial import distance
def jensen_shannon(query, matrix):
  sim=[distance.jensenshannon(data,query) for data in matrix]
  return sim

def get_most_similar_documents(query,matrix,k=5):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sims_ls = jensen_shannon(query,matrix) # list of jensen shannon distances
    sims_ar = np.array(sims_ls)
    return sims_ar.argsort()[:k] # the top k positional index of the smallest Jensen Shannon distances

# Find top 5 similar comapnies for Much Asphalt
asphalt_most_sim_ids = get_most_similar_documents(asphalt_doc_distribution,doc_topic_dist)
print('Most similar Ids for Much Asphalt: ',asphalt_most_sim_ids)

asphalt_most_similar_df = df[df.index.isin(asphalt_most_sim_ids)]
asphalt_most_similar_df['name'], df_document_topic_dist[df.index.isin(asphalt_most_sim_ids)]



Most similar Ids for Much Asphalt:  [   4 1703 1824 1519  815]


  return np.sqrt(js / 2.0)


(4                              Much Asphalt
 815                      AR Elektroprosjekt
 1519                             Channel 40
 1703    Sempa Bilgi İslem Sanayi ve Ticaret
 1824               CorePoint Solutions Inc.
 Name: name, dtype: object,
                                       Topic_0   Topic_1
 Much Asphalt                         0.080684  0.919316
 AR Elektroprosjekt                   0.079849  0.920151
 Channel 40                           0.081371  0.918629
 Sempa Bilgi İslem Sanayi ve Ticaret  0.080672  0.919328
 CorePoint Solutions Inc.             0.081012  0.918988)

In [17]:
# Find top 5 similar comapnies for Vahanalytics
vahanalytics_most_sim_ids = get_most_similar_documents(vahanalytics_doc_distribution,doc_topic_dist)
print('Most similar Ids for Vahanalytics: ',vahanalytics_most_sim_ids)

vahanalytics_most_similar_df = df[df.index.isin(vahanalytics_most_sim_ids)]
vahanalytics_most_similar_df['name'], df_document_topic_dist[df.index.isin(vahanalytics_most_sim_ids)]


Most similar Ids for Vahanalytics:  [ 695  485 1977 1047  807]


  return np.sqrt(js / 2.0)


(485                     Lifescape Colorado
 695                           Vahanalytics
 807     United Liner Shipping Services LLP
 1047                                Kaliti
 1977                              PriceHub
 Name: name, dtype: object,
                                      Topic_0   Topic_1
 Lifescape Colorado                  0.509556  0.490444
 Vahanalytics                        0.509337  0.490663
 United Liner Shipping Services LLP  0.506854  0.493146
 Kaliti                              0.507145  0.492855
 PriceHub                            0.507687  0.492313)

In [18]:
# Plot Topics
!python -m pip install -U pyLDAvis
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Visualize  topics and the associated keywords
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
vis

Requirement already up-to-date: pyLDAvis in /usr/local/lib/python3.6/dist-packages (2.1.2)


In [None]:
#Questions:

# Part 2. Which method produces more sensible output? Discuss. (Identify manually on the basis of the description of companies)
# Answer:
#  Part 1, using sklearn cosine-similarity is giving more sensible results as compared to genism LDA.

#  This could be due to the fact that we need to tune LDA model parameters like passes,chunksize,random_state so as to perform better.
#  For this we can use GridSearch.

# Part 1:  “Much Asphalt” Which are the most similar companies? Do the results make sense?
# # Answer:
# Yes, the description of the top 5 companies present in the result and the description of the company, Much Asphalt matches.
# The description of the top 5 companies are similar and related.
  
