In [1]:
import fitz
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import string
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from gensim.models import Word2Vec, FastText

In [2]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

### Preprocess for Skipgram and CBOW

In [3]:
def preprocess_text(text):
    text = text.lower()
#     print("After Lowercasing:", text)
#     print("\n\n\n\n")
    tokens = word_tokenize(text)
#     print("After Tokenization:", tokens)
#     print("\n\n\n\n")
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
#     print("After Stopword Removal:", tokens)
#     print("\n\n\n\n")
    tokens = [token for token in tokens if token not in string.punctuation]
#     print("After Punctuation Removal:", tokens)
#     print("\n\n\n\n")
    tokens = [token for token in tokens if not (token.isdigit() or (token[:-1].isdigit() and token[-1] == '.'))]
#     print("After Number Removal:", tokens)
#     print("\n\n\n\n")
    lemmatizer = WordNetLemmatizer()
    pos_tags = pos_tag(tokens)

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)) for token, pos_tag in pos_tags]
#     print("After Lemmatization:", lemmatized_tokens)

    lemmatized_tokens = [token for token in lemmatized_tokens if token] 
    
    return lemmatized_tokens

### Preprocess for Bag of words and Tf-Idf

In [4]:
def preprocess_alt_text(text):

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = []

    for sentence in text:
        if sentence:
            s = ''
            for word, pos_tag in nltk.pos_tag(word_tokenize(sentence)):
                if not any(char.isdigit() for char in word) and word.lower() not in stop_words:
                    word_without_punct = ''.join(char for char in word if char not in string.punctuation)
                    pos_tag = get_wordnet_pos(pos_tag)
                    lemma = lemmatizer.lemmatize(word_without_punct, pos=pos_tag)
                    s += lemma.lower() + ' '
            tokens.append(s.strip())  
        
    return tokens

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'  
    elif treebank_tag.startswith('V'):
        return 'v'  
    elif treebank_tag.startswith('N'):
        return 'n' 
    elif treebank_tag.startswith('R'):
        return 'r'  
    else:
        return 'n'

### PDF-Text Extraction 

In [5]:
def process_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    preprocessed_text_corpus = []
    altt_corpus=[]
    alt_corpus=[]
    for page_number in range(pdf_document.page_count):
        page = pdf_document[page_number]
        page_text = page.get_text()

        sentences = nltk.sent_tokenize(page_text)
        alt_corpus=preprocess_alt_text(sentences)
        altt_corpus+=alt_corpus
        preprocessed_page_text = []
        for sentence in sentences:
            preprocessed_sentence = preprocess_text(sentence)
#             preprocessed_page_text.append(preprocessed_sentence)
            preprocessed_page_text.append((sentence, preprocessed_sentence))
        preprocessed_text_corpus.append(preprocessed_page_text)
#         print(page_text)
    pdf_document.close()
    return preprocessed_sentence,preprocessed_text_corpus,altt_corpus

### Creating Corpus

In [6]:
pdf_path = r"C:\Users\Lohesh\Downloads\Notes\NLP\Text_GPT\IoT Discoverability_2.pdf"

preprocessed_sentence,preprocessed_text_corpus,altt_corpus = process_pdf(pdf_path)
corpus=[]
corpuss=[]

for page_corpus in preprocessed_text_corpus:
    for sentence, tokens in page_corpus:
        corpuss.append(( tokens, sentence)) 
        corpus.append(tokens)

### Bag of words

- The representation generated is too large and can't take care of words that are not in vocubulary and also the context

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer() 
BOW = count_vect.fit_transform(altt_corpus) 
print("Size of vocabulary: ", len(count_vect.vocabulary_))
print("Vocabulary:",count_vect.vocabulary_)

Size of vocabulary:  958
Vocabulary: {'resource': 735, 'discovery': 254, 'internet': 459, 'things': 864, 'current': 204, 'trends': 879, 'future': 366, 'standardization': 818, 'aspects': 66, 'soumya': 808, 'kanti': 481, 'datta': 210, 'rui': 752, 'pedro': 635, 'ferreira': 334, 'da': 206, 'costa': 196, 'christian': 141, 'bonnet': 107, 'mobile': 565, 'communications': 153, 'department': 221, 'eurecom': 306, 'biot': 103, 'france': 355, 'emails': 280, 'dattas': 211, 'eurecomfr': 307, 'abstract': 1, 'to': 870, 'realize': 703, 'vision': 923, 'must': 575, 'mechanisms': 549, 'discover': 250, 'capability': 122, 'thus': 867, 'become': 96, 'fundamental': 364, 'requirement': 732, 'iot': 465, 'platform': 649, 'paper': 627, 'provide': 682, 'comprehensive': 159, 'categorization': 127, 'technology': 859, 'landscape': 490, 'point': 651, 'advantage': 25, 'limitation': 508, 'novel': 593, 'search': 766, 'engine': 291, 'base': 90, 'framework': 354, 'propose': 678, 'comprises': 160, 'proxy': 683, 'layer': 492

In [8]:
bow_df = pd.DataFrame(BOW.toarray(), columns=count_vect.get_feature_names_out())
sorted_bow_df = bow_df.apply(lambda x: x.sort_values(ascending=False), axis=1)

print("Sorted BOG DataFrame:")
print(sorted_bow_df)

Sorted BOG DataFrame:
     ability  abstract  abstraction  accepted  access  accommodate  \
0          0         1            0         0       0            0   
1          0         0            0         0       0            0   
2          0         0            0         0       0            0   
3          0         0            0         0       0            0   
4          0         0            0         0       0            0   
..       ...       ...          ...       ...     ...          ...   
297        0         0            0         0       0            0   
298        0         0            0         0       0            0   
299        0         0            0         0       0            0   
300        0         0            0         0       0            0   
301        0         0            0         0       0            0   

     accomplish  accord  account  acknowledgment  ...  yang  ylianttila  yong  \
0             0       0        0               0  ...   

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
word1_index = count_vect.vocabulary_['communicate']  
word2_index = count_vect.vocabulary_['communication']  

word1_vector = BOW[:, word1_index]
word2_vector = BOW[:, word2_index]

word1_vector = word1_vector.reshape(1, -1)
word2_vector = word2_vector.reshape(1, -1)

cosine_similarity_word1_word2 = cosine_similarity(word1_vector, word2_vector)

print(f"Similarity between 'word1' and 'word2': {cosine_similarity_word1_word2[0][0]}")


Similarity between 'word1' and 'word2': 0.0


### TF-IDF

- Similar to BOW, the representation generated is too large and can't take care of words that are not in vocubulary and also the context but atleast it gives an idea on how the words are related to the documents

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(altt_corpus)
print("Size of vocabulary: ", len(tr_idf_model.vocabulary_))
print("Vocabulary:",tr_idf_model.vocabulary_)

Size of vocabulary:  958
Vocabulary: {'resource': 735, 'discovery': 254, 'internet': 459, 'things': 864, 'current': 204, 'trends': 879, 'future': 366, 'standardization': 818, 'aspects': 66, 'soumya': 808, 'kanti': 481, 'datta': 210, 'rui': 752, 'pedro': 635, 'ferreira': 334, 'da': 206, 'costa': 196, 'christian': 141, 'bonnet': 107, 'mobile': 565, 'communications': 153, 'department': 221, 'eurecom': 306, 'biot': 103, 'france': 355, 'emails': 280, 'dattas': 211, 'eurecomfr': 307, 'abstract': 1, 'to': 870, 'realize': 703, 'vision': 923, 'must': 575, 'mechanisms': 549, 'discover': 250, 'capability': 122, 'thus': 867, 'become': 96, 'fundamental': 364, 'requirement': 732, 'iot': 465, 'platform': 649, 'paper': 627, 'provide': 682, 'comprehensive': 159, 'categorization': 127, 'technology': 859, 'landscape': 490, 'point': 651, 'advantage': 25, 'limitation': 508, 'novel': 593, 'search': 766, 'engine': 291, 'base': 90, 'framework': 354, 'propose': 678, 'comprises': 160, 'proxy': 683, 'layer': 492

In [11]:
tfidf_df = pd.DataFrame(tf_idf_vector.toarray(), columns=tr_idf_model.get_feature_names_out())

sorted_tfidf_df = tfidf_df.apply(lambda x: x.sort_values(ascending=False), axis=1)

print("Sorted TF-IDF DataFrame:")
print(sorted_tfidf_df)

Sorted TF-IDF DataFrame:
     ability  abstract  abstraction  accepted  access  accommodate  \
0        0.0  0.166165          0.0       0.0     0.0          0.0   
1        0.0  0.000000          0.0       0.0     0.0          0.0   
2        0.0  0.000000          0.0       0.0     0.0          0.0   
3        0.0  0.000000          0.0       0.0     0.0          0.0   
4        0.0  0.000000          0.0       0.0     0.0          0.0   
..       ...       ...          ...       ...     ...          ...   
297      0.0  0.000000          0.0       0.0     0.0          0.0   
298      0.0  0.000000          0.0       0.0     0.0          0.0   
299      0.0  0.000000          0.0       0.0     0.0          0.0   
300      0.0  0.000000          0.0       0.0     0.0          0.0   
301      0.0  0.000000          0.0       0.0     0.0          0.0   

     accomplish  accord  account  acknowledgment  ...  yang  ylianttila  yong  \
0           0.0     0.0      0.0             0.0  ...

In [12]:
word1_index = count_vect.vocabulary_['internet'] 
word2_index = count_vect.vocabulary_['things']  

word1_vector = tf_idf_vector[:, word1_index]
word2_vector = tf_idf_vector[:, word2_index]

word1_vector = word1_vector.reshape(1, -1)
word2_vector = word2_vector.reshape(1, -1)

cosine_similarity_word1_word2 = cosine_similarity(word1_vector, word2_vector)

print(f"Similarity between 'word1' and 'word2': {cosine_similarity_word1_word2[0][0]}")


Similarity between 'word1' and 'word2': 0.6575711968940768


In [13]:
word1_index = count_vect.vocabulary_['internet']
word1_vector = tf_idf_vector[:, word1_index]
print(f"Word1 Vector: {word1_vector}")


Word1 Vector:   (0, 0)	0.2052294965515316
  (14, 0)	0.47481256140188344
  (17, 0)	0.3149378989442622
  (50, 0)	0.24108931008857662
  (246, 0)	0.2743414155781209
  (249, 0)	0.3242087270464429
  (253, 0)	0.1795499592786646
  (256, 0)	0.3508577856247236
  (263, 0)	0.24754035456327828
  (267, 0)	0.13658804416114922
  (272, 0)	0.131648445947349
  (273, 0)	0.1349382461371116
  (280, 0)	0.1604251310725424
  (286, 0)	0.13657480770128266
  (288, 0)	0.1981800551666528
  (290, 0)	0.2224638635364373
  (293, 0)	0.19278018120435514
  (299, 0)	0.2113010666711102
  (301, 0)	0.20488473993737236


### CBOW

- Better representation since it can take context into account

In [14]:
from gensim.models import Word2Vec
model_cbow = Word2Vec(corpus, min_count=1, vector_size=60, window=2, sg=0)
model_cbow.train(corpus, total_examples=len(corpus), epochs=250)

(672265, 819750)

In [15]:
len(corpus)

302

In [16]:
word_vectors_cbow = model_cbow.wv

In [17]:
for word in word_vectors_cbow.index_to_key:
    print(f"Word: {word}")
    print(f"Vector: {word_vectors_cbow[word]}")
    print()

Word: discovery
Vector: [ 0.9493349  -0.17617545  0.509344    0.83418334 -0.19946381 -1.2108109
 -0.03470821  0.3768684  -0.22336082  0.5792366  -0.38072813  0.4464831
 -0.61239684 -0.5482269   0.40279874 -0.21064013  0.6090437   0.3152618
 -1.0193697  -0.23115575  0.18476324 -0.47336885  0.76885957  0.2713047
  0.45430526  0.06056568  0.06980343  0.7534128  -0.43018225 -0.24109274
 -0.41007173 -0.58398867  0.5572632  -0.54272276  0.12990017  1.0526522
  0.43053108 -0.32721946 -0.9515531  -0.05855869 -1.2230074   0.1208934
 -1.1013818  -0.988016   -0.5234327  -0.3927725  -1.327857    0.81783533
  0.77183926  0.45502537 -1.1348102  -0.88010824 -0.9994081  -0.29145288
 -0.46357536 -0.34355414  0.31306994  0.0059263  -0.552662    0.46209392]

Word: resource
Vector: [ 0.9901883   0.02614869 -0.53571916 -1.3763413   0.05113703  0.22415966
  0.9056656   1.1427703  -0.6036081   0.950013    0.38031435  1.2001281
 -0.7219963  -1.2449884   0.25191143  0.7207601   0.78924614  0.20186318
 -1.15955

Vector: [ 0.04848888  0.11003003 -0.12622638 -0.43241468 -0.5739636  -0.29666767
  0.15310815  0.7652679   0.25832605 -0.29853034  1.0067984   0.13153885
 -0.04445039  0.0373305   0.21853012 -0.15870956  0.19391638  0.20549364
 -0.45095086 -0.44327655 -0.30512917 -0.5770891   0.2113391   0.45452657
 -0.28781906 -0.14735515 -0.32137907  0.24205048  0.00808189 -0.1389158
  0.35157108 -0.4260376   0.34283593 -0.4528181  -0.31497973  0.2685225
 -0.11657328  0.00582622 -0.6856791  -0.07079376 -0.517535   -0.07327087
  0.30228746 -0.45496085  0.14348853 -0.03694283 -0.25310376 -0.7759211
 -0.26016772 -0.05227382  0.07842352  0.50328934  0.3928488   0.29574057
  0.11385091  0.71348673  0.1360822  -0.32742128  0.2144223  -0.14556707]

Word: berlin
Vector: [-3.15900147e-01  3.16230059e-01 -2.85275489e-01 -6.74515367e-02
 -1.69112980e-02  2.62831524e-02  2.59922504e-01  2.87984908e-01
  4.50752564e-02 -1.19512446e-01  5.49491882e-01  3.40899974e-01
  3.03234113e-03 -1.07766107e-01 -2.60532618e-0

In [18]:
similarity = word_vectors_cbow.similarity('internet', 'thing')
print(f"Similarity between word1 and word2: {similarity}")

Similarity between word1 and word2: 0.7022556066513062


### Skipgram

In [19]:
model_skip = Word2Vec(corpus, min_count=1, vector_size=60, window=2, sg=1)
model_skip.train(corpus, total_examples=len(corpus), epochs=200)

(537783, 655800)

In [20]:
word_vectors_skip = model_skip.wv

In [21]:
for word in word_vectors_skip.index_to_key:
    print(f"Word: {word}")
    print(f"Vector: {word_vectors_skip[word]}")
    print()

Word: discovery
Vector: [ 0.30428353 -0.10952382  0.26857817  0.45487195 -0.02542181 -0.95818406
  0.19846866  0.8250336  -0.47270748  0.05498452 -0.05533193  0.40647292
 -0.36376092 -0.4168898   0.03825685  0.04042215  0.23159578  0.74928296
 -0.9058702   0.11032262  0.65927607 -0.26212904  0.5358097   0.5671438
  0.3100578   0.14573455  0.7130905   0.6610008  -0.06228355  0.42566678
 -0.3744943  -0.38513672  1.0181447  -0.46471444 -0.29100317  1.1200483
  0.25274947 -0.7437734  -0.6110947  -0.17891335 -0.79942805  0.24231419
 -0.89300156 -0.3875713  -0.7085397  -0.27243888 -0.9615071   0.66806954
  0.5094191   0.34086823 -0.76808316 -0.5280619  -0.5684795  -0.4171137
  0.10131992 -0.46439612  0.4256682   0.7994337   0.21221654  0.1592963 ]

Word: resource
Vector: [ 0.6327977   0.13246576 -0.42241767 -1.2001615  -0.20509367  0.18983293
  0.8828374   1.0828488  -0.70252496  0.06859713  0.46884277  0.82216305
 -0.30224982 -1.0320644   0.8361695   0.45033702  0.7756357  -0.3866959
 -0.88

Vector: [ 0.08777151  0.1500013  -0.7567721  -0.1361812  -0.27847385 -0.45456752
  0.32500672  0.9233961   0.14238334  0.01109569  0.7662934   0.41790432
 -0.06020838  0.44041696 -0.02989801  0.03611397  0.2900286  -0.08527701
 -0.27283257 -0.2977111  -0.41643628 -0.29258534  0.18418902  0.5491611
 -0.37398082  0.06014686 -0.29478836  0.1349356  -0.43505558  0.00969273
  0.09720735 -0.7617857   0.5277566   0.26163834  0.19360638 -0.14768717
 -0.18416345 -0.53301847 -0.71934426  0.12913275 -0.5640672  -0.2938102
  0.33924606 -0.2785957  -0.3294246   0.01725888 -0.4174955  -0.6360037
 -0.25105274  0.25384882  0.06799738  0.04823934  0.26686478 -0.12641287
  0.4661528   0.62097585 -0.25049758 -0.42870829  0.00508769 -0.46478838]

Word: to-peer
Vector: [-0.04616751  0.14080884 -0.80340904 -0.35101897 -0.3899711  -0.34393716
  0.18909033  0.8499015   0.20383239 -0.20004183  0.7647181   0.44112754
  0.00476679  0.18139857 -0.02352348  0.06175871  0.39472502 -0.05970438
 -0.5474534  -0.205239

In [22]:
similarity = word_vectors_skip.similarity('internet', 'thing')
print(f"Similarity between word1 and word2: {similarity}")

Similarity between word1 and word2: 0.5290279388427734


### GloVe

In [23]:
model_glo = Word2Vec(sentences=corpus, min_count=1, vector_size=60, window=2, sg=0)
model_glo.train(corpus, total_examples=len(corpus), epochs=250)

word_vectors = model_glo.wv
for word in word_vectors.index_to_key:
    print(f"Word: {word}")
    print(f"Vector: {word_vectors[word]}")
    print()

Word: discovery
Vector: [ 0.9493349  -0.17617545  0.509344    0.83418334 -0.19946381 -1.2108109
 -0.03470821  0.3768684  -0.22336082  0.5792366  -0.38072813  0.4464831
 -0.61239684 -0.5482269   0.40279874 -0.21064013  0.6090437   0.3152618
 -1.0193697  -0.23115575  0.18476324 -0.47336885  0.76885957  0.2713047
  0.45430526  0.06056568  0.06980343  0.7534128  -0.43018225 -0.24109274
 -0.41007173 -0.58398867  0.5572632  -0.54272276  0.12990017  1.0526522
  0.43053108 -0.32721946 -0.9515531  -0.05855869 -1.2230074   0.1208934
 -1.1013818  -0.988016   -0.5234327  -0.3927725  -1.327857    0.81783533
  0.77183926  0.45502537 -1.1348102  -0.88010824 -0.9994081  -0.29145288
 -0.46357536 -0.34355414  0.31306994  0.0059263  -0.552662    0.46209392]

Word: resource
Vector: [ 0.9901883   0.02614869 -0.53571916 -1.3763413   0.05113703  0.22415966
  0.9056656   1.1427703  -0.6036081   0.950013    0.38031435  1.2001281
 -0.7219963  -1.2449884   0.25191143  0.7207601   0.78924614  0.20186318
 -1.15955

Vector: [-0.26643986  0.47886267  0.24416532  0.12753494 -0.04124488  0.24769998
  0.59310335  0.54144365  0.09209288 -0.04562355  0.7871985  -0.00833924
 -0.09236448  0.03590218 -0.23677538  0.0350104  -0.22256319  0.17763163
 -0.3973156  -0.25316778 -0.01668077 -0.06224221 -0.16257663  0.22260107
 -0.08947437 -0.17864439 -0.44005948 -0.06084609  0.31693193 -0.00243682
  0.08436614 -0.48215476  0.19748512  0.15164004  0.19192882  0.32191706
  0.08999666 -0.41884515 -0.3371044   0.07088171 -0.0449289  -0.08409083
 -0.3651692  -0.01470883  0.04076352  0.01980822 -0.10279685 -0.11070986
 -0.2001712   0.33557686  0.05079952  0.6619651   0.3661664   0.41072053
  0.0733277  -0.00116066  0.2533892  -0.26364672  0.17538738 -0.41886505]

Word: comprises
Vector: [-0.2521276  -0.19317515  0.38629204 -0.06430548 -0.50195235 -0.248468
 -0.04665188  0.30446023 -0.28254548  0.13545531 -0.23949017 -0.09641632
  0.08560734 -0.6729144   0.06028412 -0.37361488  0.07603224 -0.10569202
 -0.26470476 -0.225

In [24]:
similarity = word_vectors.similarity('internet', 'thing')
print(f"Similarity between word1 and word2: {similarity}")

Similarity between word1 and word2: 0.7022556066513062


### Fast Text

In [25]:
from gensim.models import FastText

model_fast = FastText(sentences=corpus, vector_size=60, window=5, min_count=1, sg=1)
model_fast.train(corpus, total_examples=len(corpus), epochs=250)

word_vectors = model_fast.wv
for word in word_vectors.index_to_key:
    print(f"Word: {word}")
    print(f"Vector: {word_vectors[word]}")

Word: discovery
Vector: [-1.1781613   0.46868885 -0.2806902  -0.47104192 -0.90297484  0.20208997
  0.26448968  1.1750501   0.7593291   0.08570699 -0.14917608  0.72838193
  0.72344756  0.05033582  0.49728525 -0.52167135  0.47026357 -0.19564877
 -1.2884239   0.14690438 -0.16797039  0.21990457  0.11277062 -0.5654018
 -0.2717944  -0.00135957 -0.03955529  0.9130546  -0.2846708  -0.25012052
  0.9659386  -0.51738495  0.20271814  0.2461623   0.48867786  0.40310842
  0.67465526  0.51611936 -0.31767598  0.37738064 -0.6049059  -0.37778905
  0.0994084   0.6945683  -0.0182425  -0.50973403 -0.82150817 -0.29873285
 -0.5222291   0.11552328  0.6986996  -0.44060442  0.5789711  -0.4969647
 -0.44787243 -0.46287328  0.06270897  0.06864813 -0.29522994 -0.23315547]
Word: resource
Vector: [-6.4001608e-01  6.3176490e-02  1.8230669e-01 -2.3307799e-01
 -1.7699111e-01  6.8763185e-01 -3.8815070e-02  3.0564255e-01
  6.5186119e-01  6.1402130e-01 -2.8827958e-02  3.4980211e-02
  2.1819754e-01 -8.7916362e-01  6.0959005

Vector: [-0.01879353  0.13662831 -0.26759002  0.1590986  -0.31486657  1.4208578
 -0.30591342  0.44588163  0.5097733   0.50234777  0.7408881   0.0821569
 -0.37113452 -0.40505216  0.30747467 -0.1660654  -0.29731137 -0.2812326
 -0.6799619  -0.40826055  1.3367105  -0.25410345 -0.16127345 -0.53737295
  0.26081097  0.1219745   0.34981805  0.302381   -0.37539518 -0.8666834
 -0.1430706  -0.06729686 -0.05180738 -0.88228446 -0.16435558  0.6791958
  0.27634913  0.6346865  -0.7300671   0.4468192  -0.00629624 -1.1799265
  0.16544478  0.6473732  -0.45557043 -0.6436242  -0.6115773  -1.0365319
 -0.52089465 -0.1985769  -0.16786934  0.12977426  1.2885836  -0.01190971
 -0.01935714  0.5959766   1.1397452   0.75865334 -0.0776956   0.49921563]
Word: overview
Vector: [-0.3291525   0.38980955  0.01183123 -0.14898518 -0.9313579   0.01123925
  0.12957485  0.8833836  -0.10958702  0.8665777   0.00388035  1.1583345
 -0.7276814   0.26652104  0.5594539  -0.3351793   0.37609482 -0.54173994
 -0.7600472  -0.77573127  0

In [26]:
similarity = word_vectors.similarity('internet', 'thing')
print(f"Similarity between 'internet' and 'thing': {similarity}")

Similarity between 'internet' and 'thing': 0.5609711408615112


### Sentence similarity

In [27]:
def get_sentence_embedding(sentence_tokens, model):
    word_embeddings = []
    for token in sentence_tokens:
        if token in model.wv.key_to_index:  
            word_embeddings.append(model.wv[token])
    
    if len(word_embeddings) == 0:
        return None
    
    sentence_embedding = sum(word_embeddings) / len(word_embeddings)
    return sentence_embedding

In [28]:
sentence_embedding = get_sentence_embedding(preprocessed_sentence, model_skip)
print("Sentence Embedding:", sentence_embedding)

Sentence Embedding: [-0.4923839   0.11871532  0.1875592   0.21102417 -0.07747093 -0.27613917
  0.2506155   0.63670516 -0.14942858 -0.00720931  0.8259224  -0.0594085
  0.43061197 -0.27368414  0.218618    0.05160017  0.46259382  0.04393807
 -0.47888014 -0.23602612  0.02286646  0.04732082  0.15595989  0.09664872
 -0.12893237  0.20759362 -0.34176627  0.11317436 -0.3513577  -0.4590762
 -0.02439859 -0.81307566  0.26943156  0.08949984  0.20946221  0.29937878
  0.58561295 -0.74726194 -0.8437544   0.11511525 -0.08439689 -0.1168282
 -0.65812343 -0.29496777  0.49517843  0.0730283  -0.11405507  0.3316443
 -0.05584212  0.23317721 -0.12393504  0.228982    0.13607524  0.73738647
 -0.00861673  0.39060107  0.13375522  0.01999325  0.40938914 -0.6174851 ]


In [29]:
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

### Finding most Relevant Sentence for Word2Vec Model

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def find_most_relevant_sentences_using_word_embeddings(question, corpus, model, top_n=3):
    preprocessed_question = preprocess_text(question)
    question_embedding = np.zeros((1, model.vector_size))  # Initialize question embedding with zeros
    count = 0  # Initialize count to keep track of valid tokens

    # Compute average question embedding
    for token in preprocessed_question:
        if token in model.wv.key_to_index:
            question_embedding += model.wv[token]
            count += 1

    if count == 0:
        return "Unable to find relevant sentences."

    question_embedding /= count  # Average the question embedding

    top_sentences = []
    for sentence_tokens, original_sentence in corpus:
        sentence_embedding = np.zeros((1, model.vector_size))  # Initialize sentence embedding with zeros
        count = 0  # Reset count for each sentence

        # Compute average sentence embedding
        for token in sentence_tokens:
            if token in model.wv.key_to_index:
                sentence_embedding += model.wv[token]
                count += 1

        if count > 0:
            sentence_embedding /= count  # Average the sentence embedding
            similarity = cosine_similarity(question_embedding, sentence_embedding)
            top_sentences.append((original_sentence, similarity))

    top_sentences.sort(key=lambda x: x[1], reverse=True)
    top_sentences = top_sentences[:top_n]

    return top_sentences


### Finding most Relevant Sentence from Sentence Transformer

In [31]:
from sentence_transformers import SentenceTransformer, util

model_sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def calculate_similarity_st(sent_embedding1, sent_embedding2):
    return util.pytorch_cos_sim(sent_embedding1, sent_embedding2)

def find_most_relevant_sentences_using_sentence_transformers(question, corpus, model_sbert, top_n=3):
    question_embedding = model_sbert.encode([question], convert_to_tensor=True)

    top_sentences = []
    for sentence_tokens, original_sentence in corpus:
        sentence = ' '.join(sentence_tokens)
        sentence_embedding = model_sbert.encode([sentence], convert_to_tensor=True)
        similarity = calculate_similarity_st(question_embedding, sentence_embedding)
        top_sentences.append((original_sentence, similarity.item()))

    top_sentences.sort(key=lambda x: x[1], reverse=True)
    top_sentences = top_sentences[:top_n]

    return top_sentences



### Finding most Relevant Sentence for Tf-Idf and BOW

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
def find_most_relevant_sentences_using_tfidf(question_vector, corpus_vectors, corpus_sentences, top_n=3):
    top_sentences = []
    for sentence_vector, original_sentence in zip(corpus_vectors, corpus_sentences):
        similarity = cosine_similarity(question_vector, sentence_vector)
        top_sentences.append((original_sentence, similarity))

    top_sentences.sort(key=lambda x: x[1], reverse=True)
    top_sentences = top_sentences[:top_n]

    return top_sentences


### Finding most Relevant Sentence for BERT

In [33]:
from transformers import AutoTokenizer, AutoModel
import torch

BERT_Model = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_Model)
model_bert = AutoModel.from_pretrained(BERT_Model)

def sent_embedding(sent):
    tokens = tokenizer.encode_plus(sent, max_length=128, truncation=True,
                                    padding='max_length', return_tensors='pt')
    with torch.no_grad():
        outputs = model_bert(**tokens)
        embedding = outputs.pooler_output.detach().numpy()
    return embedding

def calculate_similarity(sent_embedding1, sent_embedding2):
    sent_embedding1 = torch.tensor(sent_embedding1)
    sent_embedding2 = torch.tensor(sent_embedding2)
    return torch.nn.functional.cosine_similarity(sent_embedding1, sent_embedding2).item()

def find_most_relevant_sentences_using_bert(question, corpus, model, top_n=3):
    question_embedding = sent_embedding(question)

    top_sentences = []
    for sentence_tokens, original_sentence in corpus:
        sentence = ' '.join(sentence_tokens)
        sentence_embedding = sent_embedding(sentence)
        similarity = calculate_similarity(question_embedding, sentence_embedding)
        top_sentences.append((original_sentence, similarity))

    top_sentences.sort(key=lambda x: x[1], reverse=True)
    top_sentences = top_sentences[:top_n]

    return top_sentences


In [34]:
user_question = input("Enter your question: ")
a=input('Choose Model:\n1)Bag of Words\n2)Tf-Idf\n3)CBOW\n4)Skip Gram\n5)Glove\n6)FastText\n7)SentenceTransformer\n8)BERT\n')
if a=='1':
    print('Bag of Words')
    question_vector = count_vect.transform([user_question])
    top_relevant_sentences = find_most_relevant_sentences_using_tfidf(question_vector, BOW, altt_corpus, top_n=3)
elif a=='2':
    print('Tf-Idf')
    question_vector = tr_idf_model.transform([user_question])
    top_relevant_sentences = find_most_relevant_sentences_using_tfidf(question_vector, tf_idf_vector, altt_corpus, top_n=3)
elif a=='3':
    print('CBOW')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_cbow, top_n=3)
elif a=='4':
    print('Skip Gram')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_skip, top_n=3)
elif a=='5':
    print('Glove')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_glo, top_n=3)
elif a=='6':
    print('Fast Text')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_fast, top_n=3)
elif a=='7':
    print('Sentence Transformer')
    top_relevant_sentences = find_most_relevant_sentences_using_sentence_transformers(user_question, corpuss, model_sbert, top_n=3)
elif a=='8':
    print('BERT')
    top_relevant_sentences = find_most_relevant_sentences_using_bert(user_question, corpuss, model_bert, top_n=3)
else:
    print("Invalid Choice")

def capitalize_first_letter(sentence):
    if sentence:
        return sentence[0].upper() + sentence[1:]
    return ""    
    
required_words = ["In addition", "Moreover"]
required_punctuation = "."
modified_content = ""

if not top_relevant_sentences:
    modified_content = "Unable to find relevant sentences."
elif top_relevant_sentences== "Unable to find relevant sentences.":
    modified_content = "Unable to find relevant sentences."
else:
    for i, (sentence, _) in enumerate(top_relevant_sentences):
        sentence = sentence.replace('\n', '')
        modified_sentence = capitalize_first_letter(' '.join(sentence.split()))
        if(modified_sentence[-1]=='.'):
            modified_content += modified_sentence 
        else:
            modified_content += modified_sentence+required_punctuation
        if i < len(required_words) and len(top_relevant_sentences) > i:  
            modified_content += ' ' + required_words[i]+' '

print("\nModified Content:")
print(modified_content)


Enter your question: What is Distributed and P2P discovery services ?
Choose Model:
1)Bag of Words
2)Tf-Idf
3)CBOW
4)Skip Gram
5)Glove
6)FastText
7)SentenceTransformer
8)BERT
7
Sentence Transformer

Modified Content:
Distributed and P2P discovery services The authors of [1] reports a system for distributed discovery service. In addition Cirani et al reports about a scalable and self-configurable P2P architecture for service discovery (SD) [3]. Moreover The philosophy behind such system is peer-to-peer (P2P) approach that adopts the distributed hash table (DHT) techniques.
