### Dependencies

In [None]:
!pip install PyPDF2

In [None]:
import pandas as pd
import numpy as np
import math
import string
import PyPDF2
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import tokenize

### 1. Load PDF

In [None]:
# Open the report in binary mode; the handle is later passed to text_extraction().
# NOTE(review): this handle is never closed in the visible notebook — consider
# closing it once the text has been extracted.
pdf = open('Commercial_Security_System_Market.pdf','rb')

In [None]:
def text_extraction(pdf):
    """Return the concatenated text of every page of a PDF.

    Parameters
    ----------
    pdf : binary file object (or anything PyPDF2's reader accepts)
        The PDF to read.

    Returns
    -------
    str
        Text of all pages, in page order, joined without a separator.
    """
    # PyPDF2 3.x removed PdfFileReader / numPages / getPage / extractText.
    # Prefer the current API when the installed version provides it and fall
    # back to the legacy names on old installations.
    reader_cls = getattr(PyPDF2, 'PdfReader', None)
    if reader_cls is not None:
        reader = reader_cls(pdf)
        # `or ''` guards against a page whose extraction yields nothing.
        return ''.join(page.extract_text() or '' for page in reader.pages)

    reader = PyPDF2.PdfFileReader(pdf)
    return ''.join(
        reader.getPage(page_no).extractText()
        for page_no in range(reader.numPages)
    )

In [None]:
# read text and tokenize: extract the full PDF text, then split it into sentences
# NOTE(review): sent_tokenize presumably needs the NLTK 'punkt' data to be
# downloaded; no nltk.download() call is visible in this notebook — confirm.
output = text_extraction(pdf)
sentences = tokenize.sent_tokenize(output)

In [None]:
# Inspect the raw extracted text.
output

### 2. Clean text (words & sentences)

In [None]:
def clean_sentences(sentences):
    """Turn raw sentences into lists of cleaned words.

    Every sentence is word-tokenised. Each token is lower-cased, lemmatised
    and then stripped of punctuation characters. A token is kept only if the
    result is purely alphabetic, is not an English stop word and has at least
    four characters.

    Returns one list of kept words per input sentence (a list may be empty).
    """
    english_stops = set(stopwords.words("english"))
    wnl = WordNetLemmatizer()
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _normalise(token):
        # same order as before: lower-case, lemmatise, then drop punctuation
        return wnl.lemmatize(token.lower()).translate(strip_punct)

    def _keep(token):
        return (
            token.isalpha()
            and token not in english_stops
            and len(token) >= 4
        )

    cleaned = []
    for sentence in sentences:
        candidates = (_normalise(tok) for tok in word_tokenize(sentence))
        cleaned.append([tok for tok in candidates if _keep(tok)])
    return cleaned

In [None]:
# Clean every sentence and show how many sentences there are.
doc_sent_clean = clean_sentences(sentences)
len(doc_sent_clean)

In [None]:
# Keep only sentences that still have more than three cleaned words.
doc_sent_clean = [tokens for tokens in doc_sent_clean if len(tokens) > 3]
len(doc_sent_clean)

In [None]:
# Inspect the cleaned sentences (one list of words per sentence).
doc_sent_clean

In [None]:
# Flatten the per-sentence word lists into one document-level word list.
doc_word_clean = [word for sentence_words in doc_sent_clean for word in sentence_words]

In [None]:
# Total number of cleaned words in the document.
len(doc_word_clean)

In [None]:
# Same word-count check repeated (duplicate cell).
len(doc_word_clean)

In [None]:
# add more words
# Words intended to be excluded from the keyword candidates. The only code in
# this notebook that applies the list is currently commented out.
exclude_key_words = ['table','figure']

In [None]:
#doc_word_clean = [x for x in doc_word_clean if x not in exclude_key_words]
#doc_sent_clean = [x for x in doc_sent_clean if x not in exclude_key_words]

In [None]:
# Word count after the (optional) exclusion step.
len(doc_word_clean)



```
# This is formatted as code
```

### 3. Tf-Idf and top keywords

In [None]:
# tf score function

def tf_score(doc_word_clean):
    """Return the term frequency of every word in the document.

    doc_word_clean is the flat list of cleaned words. The result maps each
    distinct word to (number of occurrences) / (total number of words), with
    keys in order of first appearance.
    """
    counts = {}
    for token in doc_word_clean:
        counts[token] = counts.get(token, 0) + 1

    # normalise raw counts by the document length
    total = int(len(doc_word_clean))
    return {token: count / total for token, count in counts.items()}

In [None]:
# save tf scores
# NOTE(review): this rebinds the name `tf_score` from the function to its
# result dict, so re-running this cell without re-running the definition cell
# fails ('dict' object is not callable).
tf_score = tf_score(doc_word_clean)

In [None]:
# Inspect the term-frequency scores.
tf_score

In [None]:
def check_sent(word, sentences):
    """Count the sentences that contain the given word(s) as whole tokens.

    Parameters
    ----------
    word : str or iterable of str
        A single word, or several words that must all be present.
    sentences : iterable
        Sentences as whitespace-separated strings or as lists of tokens.

    Returns
    -------
    int
        Number of sentences containing every requested word.
    """
    # Iterating a str yields its characters, which previously turned the test
    # into "sentence contains all the letters of the word". Treat a string as
    # one term instead.
    terms = [word] if isinstance(word, str) else list(word)

    matches = 0
    for sentence in sentences:
        # Compare against whole tokens so 'art' does not match 'party'.
        tokens = set(sentence.split()) if isinstance(sentence, str) else set(sentence)
        if all(term in tokens for term in terms):
            matches += 1
    return matches

In [None]:
# transform sentences for idf method: join each cleaned word list back into a
# single space-separated string
transform_sentences = [' '.join(sentence_words) for sentence_words in doc_sent_clean]

In [None]:
# Inspect the space-joined cleaned sentences.
transform_sentences

In [None]:
def idf_calc_score(doc_word_clean,transform_sentences):
    """Return the inverse document frequency of every word.

    Each sentence is treated as a "document": idf(word) is
    log(number of sentences / number of sentences containing the word).

    Parameters
    ----------
    doc_word_clean : iterable of str
        All cleaned words of the document (duplicates allowed).
    transform_sentences : sequence
        Sentences as whitespace-separated strings (lists of tokens also work).

    Returns
    -------
    dict
        word -> idf, keyed by the word with any '.' removed, in order of first
        appearance. Every idf is 0.0 when there are no sentences.
    """
    def _key(token):
        return token.replace('.', '')

    # One pass over the sentences gives the document frequency of every token,
    # counting whole tokens once per sentence. This replaces a per-occurrence
    # scan that matched on characters rather than words.
    doc_freq = {}
    for sentence in transform_sentences:
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        for token in {_key(tok) for tok in tokens}:
            doc_freq[token] = doc_freq.get(token, 0) + 1

    total_sent_len = len(transform_sentences)
    idf_score = {}
    for each_word in doc_word_clean:
        each_word = _key(each_word)
        if each_word in idf_score:
            continue
        if total_sent_len == 0:
            # log(0) is undefined; without sentences no word is informative.
            idf_score[each_word] = 0.0
            continue
        # A word found in no sentence is counted as occurring in one, which
        # avoids a division by zero.
        sentences_with_word = max(doc_freq.get(each_word, 0), 1)
        idf_score[each_word] = math.log(total_sent_len / sentences_with_word)
    return idf_score

In [None]:
# save idf scores (each cleaned sentence counts as one document)
idf_score = idf_calc_score(doc_word_clean,transform_sentences)

In [None]:
# save tf-idf scores: tf * idf per word, treating a missing idf as 0
tf_idf_score = {word: tf * idf_score.get(word, 0) for word, tf in tf_score.items()}

In [None]:
# select top key words
from operator import itemgetter
def get_top_n(dict_elem, n):
    """Return a dict of the n entries of dict_elem with the largest values.

    Entries are ordered from highest to lowest value; entries with equal
    values keep their original relative order.
    """
    ranked = sorted(dict_elem.items(), key=lambda pair: pair[1], reverse=True)
    return {key: value for key, value in ranked[:n]}

In [None]:
# Inspect the tf-idf score of every word.
tf_idf_score

In [None]:
# Keep the 10 words with the highest tf-idf score as the document's keywords.
key_words_new = get_top_n(tf_idf_score, 10)

In [None]:
# Inspect the selected keywords and their scores.
key_words_new

### 4. Generate PDF vector based on key words

In [None]:
# Load the sentence-embedding model used for both keywords and industry names.
from sentence_transformers import SentenceTransformer
model_bert = SentenceTransformer('paraphrase-mpnet-base-v2')

In [None]:
def vector_pdf_embedding(key_words_new):
    """Collapse the top keywords into a single score-weighted embedding.

    key_words_new maps keyword -> score. The scores are normalised so they
    sum to 1 and used as weights for the keyword embeddings produced by
    model_bert; the weighted sum of those embeddings is returned.
    """
    scores = np.array(list(key_words_new.values()))
    # normalise the scores into weights
    weights = scores / np.sum(scores)
    # one embedding per keyword, in the same order as the weights
    keyword_vectors = model_bert.encode(list(key_words_new.keys()))
    return np.dot(weights, keyword_vectors)

In [None]:
# save pdf embeddings: one weighted vector representing the whole PDF
vector_pdf = vector_pdf_embedding(key_words_new)

In [None]:
# Inspect the PDF embedding vector.
vector_pdf

In [None]:
# Length of the PDF embedding vector.
len(vector_pdf)

In [None]:
# read industries embeddings
# The code below uses the first column as the industry label and all remaining
# columns as the embedding values.
industry = pd.read_csv('industry_improved_bert_embedding.csv')

In [None]:
# Inspect the industry embeddings table.
industry

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise industry-to-industry cosine similarity, shifted from [-1, 1] to
# [0, 1] via (x + 1) / 2; rows and columns are both labelled with the first
# column of `industry`.
similarity_df = pd.DataFrame(((cosine_similarity(industry.iloc[:,1:],industry.iloc[:,1:])+1)/2),index = industry.iloc[:,0],columns =industry.iloc[:,0])

In [None]:
# Copy the index labels into a regular 'industry_name' column.
similarity_df['industry_name'] = similarity_df.index

In [None]:
# Importing Modules
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
plt.figure(figsize=(16,8))
import pandas as pd

# Alias (not a copy) of similarity_df, so the pop() below also removes the
# column from similarity_df itself
seeds_df = similarity_df

# Remove the 'industry_name' label column from the DataFrame and keep the
# labels for the dendrogram
varieties = list(seeds_df.pop('industry_name'))

# Extract the remaining similarity values as a NumPy array
samples = seeds_df.values

"""
Perform hierarchical clustering on samples using the
linkage() function with the method='complete' keyword argument.
Assign the result to mergings.
"""
mergings = linkage(samples, method='complete')

"""
Plot a dendrogram using the dendrogram() function on mergings,
specifying the keyword arguments labels=varieties, leaf_rotation=90,
and leaf_font_size=6.
"""
dendrogram(mergings,
           labels=varieties,
           leaf_rotation=90,
           leaf_font_size=6,
           )

plt.show()

In [None]:
# Cut the hierarchy into flat clusters; with criterion='maxclust' the value 50
# is the maximum number of clusters to form.
from scipy.cluster.hierarchy import fcluster
fl = fcluster(mergings,50,criterion='maxclust')

In [None]:
# Attach each industry's flat-cluster id as a new 'cluster' column.
similarity_df['cluster'] = fl

In [None]:
# Reorder the columns so that the last column becomes the first one.
cols =similarity_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
similarity_df = similarity_df[cols]

In [None]:
# Order the rows by cluster id so members of a cluster are listed together.
similarity_df = similarity_df.sort_values(by = 'cluster',ascending = True)

In [None]:
# Write the clustered similarity table to disk.
similarity_df.to_csv('clusters_of_similar_industries.csv')

In [None]:
# Inspect the clustered similarity table.
similarity_df

In [None]:
# generate sorted list of similar industries
# Note: this rebinds `df` and `similarity_df`, replacing any earlier values.
df = pd.DataFrame(vector_pdf)
similarity_df = pd.DataFrame(
    (cosine_similarity(df.T, industry.iloc[:, 1:]) + 1) / 2,
    index=['pdf'],
    columns=industry.iloc[:, 0],
)
# order the industries from most to least similar to the PDF, one per row
sorted_similarity = similarity_df.sort_values(by='pdf', axis=1, ascending=False).T
sorted_similarity.head(30)

In [None]:
# Write the ranked industry list for this PDF to disk.
sorted_similarity.to_csv('sorted_similarity_industries_list_Commercial_Security_System_Market.csv')

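### 5. Code for generating embeddings for industries (run this section first if `industry_improved_bert_embedding.csv`, which section 4 reads, does not exist yet)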

In [None]:
# Load the list of industries to embed; the code below reads its 'industry' column.
# NOTE(review): the file name is spelled 'idustry_test.csv' — confirm this
# matches the file on disk.
industry = pd.read_csv('idustry_test.csv')

In [None]:
# Inspect the industry list.
industry

In [None]:
# Encode every industry name in a single batch (underscores become spaces),
# the same way the keyword list is encoded earlier in the notebook.
industry_names = [name.replace('_', ' ') for name in industry.industry]
bert_embedding = model_bert.encode(industry_names)
# Let pandas number the columns 0..dim-1 from the embedding width instead of
# hard-coding 768, so a model with a different output size still works.
df = pd.DataFrame(bert_embedding, index=industry.industry)

In [None]:
# Save the industry embeddings; this is the file loaded by the similarity step.
df.to_csv('industry_improved_bert_embedding.csv')