# Keywords

This notebook is for doing Keyword in Context analyses for multiple files in a directory.

In [None]:
#Using spaCy to pull out sentences that contain a given keyword
import spacy
import re
from IPython.display import Markdown, display

from pathlib import Path  
import glob


#Download the language model you're interested in
!python -m spacy download en_core_web_md

In [None]:
#Load language model
#(the model name must match the one downloaded in the first cell)
nlp = spacy.load('en_core_web_md')

#Open your texts and create spaCy document
#filepath is the directory holding the corpus; text_files collects the path
#of every .txt file directly inside it.
#text_files is also indexed by position further down (lookup_keyword and
#lookup_neighbor_words) to label the results for each file.
filepath = 'kafka-corpus/'
text_files = glob.glob(f'{filepath}/*.txt')


def find_sentences_with_keyword(keyword, document):
    """Display every sentence of ``document`` that contains ``keyword``.

    Matching is a case-insensitive substring match. Each matching sentence is
    rendered as Markdown with its line breaks replaced by spaces and every
    occurrence of the keyword wrapped in ``**`` (bold). The bolded text keeps
    the capitalization it has in the sentence.

    Parameters
    ----------
    keyword : str
        Word or phrase to look for. It is matched literally, so characters
        such as ``+`` or ``.`` have no special meaning.
    document : object
        Anything exposing ``.sents``, an iterable of items with a ``.text``
        string (e.g. a spaCy document).

    Returns
    -------
    None
        The sentences are displayed, not returned.
    """
    if not keyword:
        #An empty keyword is "contained" in every sentence and would put
        #bold markers between all characters, so there is nothing to show.
        return

    needle = keyword.lower()
    #Escape the keyword so regex metacharacters in it are matched literally
    #instead of raising re.error or matching unintended text.
    keyword_pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)

    for sentence in document.sents:
        #Join hard-wrapped lines first so a multi-word keyword that is split
        #across a line break in the source text is still found.
        sentence_text = sentence.text.replace('\n', ' ')
        if needle not in sentence_text.lower():
            continue
        #\g<0> re-inserts the matched text itself, so the capitalization of
        #the original sentence is preserved inside the bold markers.
        highlighted = keyword_pattern.sub(r"**\g<0>**", sentence_text)
        display(Markdown(highlighted))

#Loop through the files and open each one as a spaCy document
#Then display, for each document, the sentences containing a given keyword
#Change the keyword ('legs') to the keyword you are looking for
for file in text_files:
    #The file only needs to stay open while its contents are read
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    print(file)
    document = nlp(text)
    #find_sentences_with_keyword displays the matching sentences itself and
    #returns nothing, so there is no result to print afterwards
    find_sentences_with_keyword(keyword='legs', document=document)
        

# Keywords in Context

**n-grams**: any sequence of *n* consecutive tokens (here, *n* consecutive words).  
N-grams are a way of representing groups of words as a single token.

In [None]:
#Loop through files and tokenize
#directory_path is the folder whose .txt files are tokenized below;
#all_docs collects one list of tokens per file, in the order the files are read.
directory_path = 'kafka-corpus/'
all_docs = []

def tokenize(text):
    """Split ``text`` into lowercase, purely alphabetic word tokens.

    The text is lowercased and cut at every run of non-word characters.
    Pieces that are empty or contain anything other than letters (digits,
    underscores) are dropped.

    Returns a list of strings.
    """
    pieces = re.split(r'\W+', text.lower())
    return list(filter(str.isalpha, pieces))

#Read every .txt file in the corpus directory as UTF-8 and store its tokens;
#all_docs receives one token list per file.
#NOTE(review): later cells label all_docs[i] with text_files[i], which comes
#from a separate glob call -- this assumes both listings come back in the
#same order; confirm, or sort both.
corpus_paths = Path(directory_path).glob("*.txt")
all_docs.extend(
    tokenize(corpus_path.read_text(encoding='utf-8'))
    for corpus_path in corpus_paths
)

all_docs[0]

In [None]:
#Remove stopwords
#Stopwords: refer to "Preprocessing" notebook for more details on stopwords

#Load custom stopwords list (this is the default spacy list)
#Open the txt file (one stopword per line) and convert it to a Python list.
#The file is read explicitly as UTF-8, like the corpus files, so the result
#does not depend on the platform's default encoding.
with open("custom-stopwords.txt", "r", encoding="utf-8") as file_object:
    custom_stopwords = [line.rstrip('\n') for line in file_object]

custom_stopwords

In [None]:
#Define a function to remove stopwords from tokens
def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens that are not stopwords, in their original order.

    Parameters
    ----------
    list_of_tokens : iterable of str
        Tokens of one document.
    stopwords : iterable of str
        Words to drop (a list, set or any other iterable).

    Returns
    -------
    list
        A new list; ``list_of_tokens`` is not modified.
    """
    #Build the set once so each membership test is constant-time instead of
    #a scan through the whole stopword list for every token.
    stopword_set = set(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

#Strip the stopwords from every tokenized document, keeping document order
all_docs_no_stop = [
    remove_stopwords(doc_tokens, custom_stopwords) for doc_tokens in all_docs
]

all_docs_no_stop[0]

In [None]:
#Define a function to return list of ngrams
def make_ngrams(tokens, n):
    """Return every run of ``n`` consecutive items of ``tokens``.

    The windows overlap and advance one position at a time. Each window is a
    slice of ``tokens``. If ``tokens`` has fewer than ``n`` items the result
    is an empty list.
    """
    window_count = len(tokens) - (n - 1)
    return [tokens[start:start + n] for start in range(window_count)]

In [None]:
#Displays words that appear around a particular word

#Define a function to create a dictionary from n-grams, using middle word as the key.
#To figure out the keyword for each n-gram we can use the index positions of the list.
def ngrams_to_dictionary(ngrams):
    """Group n-grams by the word at their middle position.

    The key position is ``len(ngrams[0]) // 2``, taken from the first n-gram
    and used for all of them (index 1 of a 3-gram, index 3 of a 6-gram).

    Parameters
    ----------
    ngrams : list of sequences
        N-grams that all have the same length, e.g. from ``make_ngrams``.

    Returns
    -------
    dict
        Maps each key word to the list of n-grams that have that word at the
        key position, in their original order. An empty input gives ``{}``.
    """
    #A text shorter than the window yields no n-grams at all; return an
    #empty dictionary instead of failing on ngrams[0].
    if len(ngrams) == 0:
        return {}

    keyindex = len(ngrams[0]) // 2

    ngram_dictionary = {}
    for ngram in ngrams:
        #setdefault creates the list the first time a key word is seen
        ngram_dictionary.setdefault(ngram[keyindex], []).append(ngram)
    return ngram_dictionary

#Build one keyword-in-context dictionary per file and collect them,
#in file order, in a list called keywords
#Change the number (6) to change the size of the window of words around the keyword
keywords = [
    ngrams_to_dictionary(make_ngrams(doc_tokens, 6))
    for doc_tokens in all_docs_no_stop
]

In [None]:
#Define a function that will loop through the list of ngram dictionaries
#and print out a given keyword with its ngrams
#or print a line that the keyword is not in the dictionary
def lookup_keyword(kw, dictionaries):
    """Print, for every file, the n-grams stored under ``kw``.

    ``dictionaries[i]`` is labeled with ``text_files[i]`` (a module-level
    list), so both are assumed to be in the same file order.

    The keyword is looked up exactly as given first. If that fails, its
    lowercase form is tried, because the dictionaries built in this notebook
    only contain lowercased tokens.

    Parameters
    ----------
    kw : str
        Keyword to look up.
    dictionaries : list of dict
        One keyword -> n-grams dictionary per file.

    Returns
    -------
    None
        Results are printed. A "not in file" line is printed for every file
        whose dictionary does not contain the keyword.
    """
    for i, dictionary in enumerate(dictionaries):
        text_name = text_files[i]

        key = kw
        if key not in dictionary and isinstance(kw, str):
            #Fall back to the lowercase form so 'People' finds 'people'
            key = kw.lower()

        if key in dictionary:
            print(text_name, dictionary[key], "\n")
        else:
            print(f"{kw} not in file: {text_name}\n")

In [None]:
#Look up the words that appear next to a given keyword
#The keyword is matched against the lowercased, stopword-filtered tokens
#that the dictionaries in keywords were built from
lookup_keyword('people', keywords)

In [None]:
#Look up most frequent words that appear next to a given keyword

#Make a function to return most frequent words that appear next to a particular keyword
from collections import Counter
def get_neighbor_words(keyword, ngrams):
    """Count the words that share an n-gram with ``keyword``.

    The keyword is lowercased before matching. For every n-gram that contains
    it, each word other than the keyword itself is counted once per
    occurrence.

    Returns a list of ``(word, count)`` pairs, most frequent first.
    """
    target = keyword.lower()
    neighbor_counts = Counter(
        word
        for ngram in ngrams
        if target in ngram
        for word in ngram
        if word != target
    )
    return neighbor_counts.most_common()

#Build the ngrams for each file and collect them, in file order,
#in a list called all_ngrams
#If you want more nearest neighbours increase the number '2'
#which increases the window of ngrams
all_ngrams = [make_ngrams(doc_tokens, 2) for doc_tokens in all_docs_no_stop]

#Define a function to loop through ngrams above
# and look up most common neighbor words for a given keyword
def lookup_neighbor_words(keyword, ngram_list):
    """Print the neighbor-word counts of ``keyword`` for every file.

    ``ngram_list[i]`` holds the n-grams of one file and is labeled with
    ``text_files[i]`` (a module-level list). The counts come from
    ``get_neighbor_words``. Nothing is returned.
    """
    for position, text_ngrams in enumerate(ngram_list):
        text_name = text_files[position]
        neighbor_counts = get_neighbor_words(keyword, text_ngrams)
        print(text_name, neighbor_counts, '\n')


In [None]:
#Look up the most common neighbor words for a given keyword
#all_ngrams was built with 2-word windows, so only tokens directly adjacent
#to the keyword (after stopword removal) are counted
lookup_neighbor_words('people', all_ngrams)