# Try KeyBERT for Keyword Extraction
Use Python 3.6.14  
Need to install [keybert](https://github.com/MaartenGr/KeyBERT):

```
pip install keybert
```

In [2]:
from keybert import KeyBERT

Import os and glob modules.  
Indicating the docment route. 

In [17]:
import os, glob
doc_path = r'test data/*.vtt'
script_path = glob.glob(doc_path)

Read the transcript documents  
Each row includes the timestamp and the content, which are divided by arrow `-->`  
The contents before the first timestamp are context and are excluded from the documents being read.

In [28]:
y = [x.split('/')[1] for x in script_path]
y.sort()
print(y)


['tr-lec1-transcription-english.vtt', 'tr-lec10-transcription-english.vtt', 'tr-lec2-transcription-english.vtt', 'tr-lec3-transcription-english.vtt', 'tr-lec4-transcription-english.vtt', 'tr-lec5-transcription-english.vtt', 'tr-lec6-transcription-english.vtt', 'tr-lec7-transcription-english.vtt', 'tr-lec8-transcription-english.vtt', 'tr-lec9-transcription-english.vtt']


Sort the document titles.

In [33]:
script_path = []
for i in range(1,11):
    script_path.append(r'test data/tr-lec{}-transcription-english.vtt'.format(i))
script_path

['test data/tr-lec1-transcription-english.vtt',
 'test data/tr-lec2-transcription-english.vtt',
 'test data/tr-lec3-transcription-english.vtt',
 'test data/tr-lec4-transcription-english.vtt',
 'test data/tr-lec5-transcription-english.vtt',
 'test data/tr-lec6-transcription-english.vtt',
 'test data/tr-lec7-transcription-english.vtt',
 'test data/tr-lec8-transcription-english.vtt',
 'test data/tr-lec9-transcription-english.vtt',
 'test data/tr-lec10-transcription-english.vtt']

Extract the sentences by removing the time stamps, and concantenate them to form the documents.  
Also restore the sentences with the time stamps and transcript file name into another list. Eventualy, it is used to retrieve the location of the key phrases. 

In [34]:
time_arrow = '-->'
scrip_content = []
doc = ''
# script_path = script_path[:2]
# transcript_id = 0
doc_indexed = []
for file in script_path:
    with open(file,'r',encoding="utf-8") as query_file:
        start_flg = 0
        transcript_id = file
        for line in query_file:
            if start_flg == 1 and time_arrow not in line:
                this_sentence = line.strip()
                if len(this_sentence) > 0:
                    scrip_content.append(this_sentence)
                    this_sentence_indexed = (this_sentence,transcript_id,t1,t2)
                    doc_indexed.append(this_sentence_indexed)
            elif start_flg ==1 and time_arrow in line:
                [t1,t2] = line.split(time_arrow)
                t1 = t1.strip()
                t2 = t2.strip()
            elif start_flg == 0 and time_arrow in line:
                start_flg = 1
                [t1,t2] = line.split(time_arrow)
                t1 = t1.strip()
                t2 = t2.strip()
        this_doc = ' '.join(scrip_content)
    doc = doc + this_doc

In [35]:
doc_indexed

[('This lecture is about natural language',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:00.280',
  '00:00:02.670'),
 ('content analysis.',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:02.670',
  '00:00:04.410'),
 ('As you see from this picture, this is',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:04.410',
  '00:00:06.820'),
 ('really the first step to process any',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:06.820',
  '00:00:09.300'),
 ('text data, text data in natural',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:09.300',
  '00:00:11.580'),
 ('languages.',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:11.580',
  '00:00:12.310'),
 ('So computers have to understand natural',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:12.310',
  '00:00:15.760'),
 ('language to some extent in order to',
  'test data/tr-lec1-transcription-english.vtt',
  '00:00:15.760',
  '00:00:18.610'),
 ('make use of the da

Locate the key phrase in the document

In [36]:
key1 = "retrieval model"
result = []
for sentence in doc_indexed:
    if key1 in sentence[0]:
        this_result = (sentence[1],sentence[2])
        result.append(sentence)

Use KeyBERT to extract the keywords.

In [37]:
result

[('retrieval models in general.',
  'test data/tr-lec3-transcription-english.vtt',
  '00:26:01.450',
  '00:26:03.670'),
 ('retrieval model which gives us a',
  'test data/tr-lec4-transcription-english.vtt',
  '00:01:26.210',
  '00:01:28.570'),
 ('retrieval models.',
  'test data/tr-lec4-transcription-english.vtt',
  '00:01:36.630',
  '00:01:37.660'),
 ('form of a state of the retrieval model.',
  'test data/tr-lec4-transcription-english.vtt',
  '00:04:24.230',
  '00:04:27.350'),
 ('art retrieval models.',
  'test data/tr-lec4-transcription-english.vtt',
  '00:07:31.020',
  '00:07:32.460'),
 ('retrieval models.',
  'test data/tr-lec4-transcription-english.vtt',
  '00:07:50.310',
  '00:07:52.060'),
 ('retrieval models.',
  'test data/tr-lec4-transcription-english.vtt',
  '00:09:55.210',
  '00:09:56.310'),
 ("retrieval model we're going to give a",
  'test data/tr-lec5-transcription-english.vtt',
  '00:00:02.940',
  '00:00:05.520'),
 ('retrieval model is actually very easy',
  'test data/

In [59]:
kw_model = KeyBERT('all-MiniLM-L6-v2')
keywords = kw_model.extract_keywords(doc,keyphrase_ngram_range=(1, 1), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=10)

In [60]:
keywords

[('semantically', -0.023),
 ('textual', -0.1077),
 ('rank', -0.0383),
 ('judgments', -0.0741),
 ('documents', -0.0467),
 ('indexer', 0.0324),
 ('disambiguation', -0.0155),
 ('rankings', -0.108),
 ('querying', 0.0099),
 ('similarity', 0.0415)]

In [61]:
key_phrase = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=10)

In [62]:
key_phrase

[('retrieval functions', -0.023),
 ('definition relevance', -0.0383),
 ('optimal retrieval', 0.0596),
 ('relevance achieve', -0.0467),
 ('retrieval empirically', -0.031),
 ('relevance document', 0.0084),
 ('information retrieval', -0.0419),
 ('define relevance', 0.036),
 ('modeling retrieval', 0.0032),
 ('retrieval architecture', 0.0303)]

In [63]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words='english', 
                              use_mmr=True, diversity=0.7)

[('retrieval models', 0.5976),
 ('vector defined', 0.2982),
 ('assumption identical', 0.1005),
 ('processor tokenizer', 0.0435),
 ('occupy space', 0.1065)]

In [29]:
import yake

In [64]:
kw_extractor_yk = yake.KeywordExtractor(lan="en",top=50)

In [34]:
keywords_yk = kw_extractor_yk.extract_keywords(doc)

In [65]:
keywords_yk

[('vector', 0.0002300919031533347),
 ('document', 0.0002843807767844164),
 ('vector space', 0.0003222229742492705),
 ('space', 0.0005489864084768604),
 ('query vector', 0.0006652848119286617),
 ('term', 0.0006679424412080893),
 ('document vector', 0.0006716072162264517),
 ('query', 0.0007715311960628257),
 ('Vector space model', 0.00083229528965335),
 ('vectors', 0.0011044411351360066),
 ('documents', 0.0011943992624945487),
 ('vector space retrieval', 0.001262844026861005),
 ('model', 0.001321531776389084),
 ('terms', 0.002003827323624268),
 ('assume', 0.0020417407764351703),
 ('function', 0.0020426468357965083),
 ('similarity', 0.002049867315207322),
 ('library', 0.0020518020557878025),
 ('space model', 0.00216673181887239),
 ('dimensional space', 0.002365418932346318),
 ('define', 0.0024513622197958472),
 ('basic', 0.002589806325644574),
 ('ranking', 0.002590211710580821),
 ('programming', 0.002590960309501427),
 ('implement', 0.0025913159675481764),
 ('representation', 0.0025918577

In [66]:
candidates = [candidate[0] for candidate in keywords_yk]
keywords = kw_model.extract_keywords(doc, candidates=candidates,use_mmr=True, diversity=0.7)

In [67]:
keywords

[('retrieval model', 0.5932),
 ('vector space', 0.2826),
 ('term weight', 0.0668),
 ('define', 0.1404),
 ('presidential', -0.012)]