# Keywords in Context

### Finding sentences that contain a specific keyword

We use spaCy to split our text into sentences and then search for the sentences that contain a specified keyword.

In [None]:
#Import the libraries we need
import spacy
import re
from IPython.display import Markdown, display

#Download the language model you're interested in
!python -m spacy download en_core_web_md

In [None]:
#Load language model
nlp = spacy.load('en_core_web_md')

#Read the text inside a with-block so the file handle is closed as soon as
#the contents have been read, then create the spaCy document
with open('soderberg-corpus/1897_Drizzle.txt', encoding='utf-8') as f:
    text = f.read()
document = nlp(text)

#Define a function to break text into sentences
#and find sentences that contain a given keyword
def find_sentences_with_keyword(keyword, document):
    """Display every sentence of `document` that contains `keyword`.

    Matching is case-insensitive and substring-based, so 'sun' also matches
    'Sunday'. Each matching sentence is shown as Markdown with its linebreaks
    replaced by spaces and every occurrence of the keyword in bold, keeping
    the capitalization used in the text.

    Parameters:
        keyword: the text to look for; it is matched literally.
        document: an object whose `.sents` yields sentences with a `.text`
            attribute (here, the spaCy document created with `nlp`).

    Returns:
        None. The sentences are displayed, not returned.
    """
    #An empty keyword would match between every character, so show nothing
    if not keyword:
        return

    #Escape the keyword so characters like '.', '?' or '(' are not read as regex syntax
    keyword_pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)

    for sentence in document.sents:
        #Replace linebreaks first so a multi-word keyword is also found across a linebreak
        sentence_text = sentence.text.replace('\n', ' ')
        if keyword_pattern.search(sentence_text):
            #\g<0> re-inserts the matched text itself, so 'Sun' stays 'Sun' when bolded
            highlighted = keyword_pattern.sub(r'**\g<0>**', sentence_text)
            display(Markdown(highlighted))

In [None]:
#Call the function to display every sentence of the document that contains the keyword
#You can change which keyword to look up by editing the keyword argument
find_sentences_with_keyword(keyword='sun', document=document)

#### Finding sentences that contain a specific keyword in multiple files within a directory

This does the same as above but for multiple files within a directory.

In [None]:
#Import the libraries we need
import spacy
import re
from IPython.display import Markdown, display

from pathlib import Path  
import glob

#Download the language model you're interested in
!python -m spacy download en_core_web_md

In [None]:
#Load the en_core_web_md language model downloaded in the cell above
nlp = spacy.load('en_core_web_md')

#Define a function to break text into sentences
#and find sentences with particular keyword
def find_sentences_with_keyword(keyword, document):
    """Display each sentence of `document` in which `keyword` occurs.

    The search ignores capitalization and matches substrings (looking up
    'sun' also finds 'sunset'). A matching sentence is rendered as Markdown:
    linebreaks become spaces and every occurrence of the keyword is bolded
    exactly as it is written in the text.

    Parameters:
        keyword: the literal text to search for.
        document: an object whose `.sents` yields sentences with a `.text`
            attribute (here, a spaCy document created with `nlp`).

    Returns:
        None. Matches are displayed rather than returned.
    """
    #Nothing sensible can be highlighted for an empty keyword
    if not keyword:
        return

    #re.escape makes regex metacharacters in the keyword match literally
    matcher = re.compile(re.escape(keyword), flags=re.IGNORECASE)

    for sentence in document.sents:
        #Flatten linebreaks before searching so keywords with spaces match across them
        flat_sentence = sentence.text.replace('\n', ' ')
        if matcher.search(flat_sentence):
            #\g<0> is the text that was matched, which preserves its original capitalization
            display(Markdown(matcher.sub(r'**\g<0>**', flat_sentence)))

In [None]:
#Set your filepath and define your text files
filepath = 'soderberg-corpus/'
#Join with Path so the trailing slash in filepath does not produce 'soderberg-corpus//name.txt',
#and sort because glob returns its matches in arbitrary order
text_files = sorted(glob.glob(str(Path(filepath) / '*.txt')))

In [None]:
#Loop through the files and open each one as a spaCy document
#Then display for each document the sentences containing a given keyword
#If a document does not contain the keyword, print 'none' under the document name
#Change keyword to the keyword you are looking for
keyword = 'sun'

for file in text_files:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    print(file)
    #find_sentences_with_keyword displays its matches and returns nothing,
    #so test for the keyword here instead of printing its return value
    #(which would show "None" for every file, with or without matches)
    if keyword.lower() in text.lower():
        document = nlp(text)
        find_sentences_with_keyword(keyword=keyword, document=document)
    else:
        print('none')

# Keywords in Context

We can find a keyword’s immediate context, i.e. its neighboring words to the left and right. To do so, we will first create a list of n-grams. An “n-gram” is any sequence of n consecutive tokens in a text. We’re going to use these n-grams to find the neighboring words that appear alongside particular keywords.

In [None]:
#Imports
from collections import Counter

In [None]:
#Set path to the single text file that is opened and tokenized below
text_file = 'soderberg-corpus/1897_Drizzle.txt'

In [None]:
#Define a tokenizing function
def tokenize(text):
    """Lowercase `text` and return its purely alphabetic words as a list.

    The text is split on every run of characters that are not letters or
    digits, which removes whitespace, punctuation and underscores (so
    '_sun_' yields 'sun'). Tokens that are not entirely alphabetic, such as
    numbers or the empty strings the split leaves at the edges, are then
    dropped with .isalpha().
    """
    lowercase_text = text.lower()
    #\W on its own treats '_' as a word character, so '_sun_' would survive the
    #split and then be thrown away whole by .isalpha(); split on '_' as well
    split_words = re.split(r'[\W_]+', lowercase_text)
    return [word for word in split_words if word.isalpha()]

In [None]:
#Read the whole file, then tokenize its contents
#(all_words becomes a list of all the words/tokens in the text)
with open(text_file, 'r', encoding='utf-8') as file:
    text = file.read()
all_words = tokenize(text)

In [None]:
#Define a function to remove stopwords from your list of tokens
#using custom stopwords list
#Read the stopwords as a list with one entry per line; keeping the file as a
#single string would turn `token not in stopwords` into a substring test
with open("custom-stopwords.txt", "r") as file_object:
    custom_stopwords = [line.rstrip('\n') for line in file_object]

def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens that are not stopwords, in their original order.

    Parameters:
        list_of_tokens: the tokens to filter.
        stopwords: an iterable of stopwords, or a single string of
            whitespace-separated stopwords (e.g. the raw text of a
            stopwords file).
    """
    #With a plain string, `in` is a substring test ('he' in 'the' is True),
    #so split it into whole words first
    if isinstance(stopwords, str):
        stopwords = stopwords.split()
    #A set makes each membership test constant-time instead of a scan of the list
    stopword_set = set(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

#Filter the tokenized text with the custom stopwords read above
all_words_no_stop = remove_stopwords(all_words, custom_stopwords)

In [None]:
#Define a function to return list of ngrams
def make_ngrams(tokens, n):
    """Return every run of `n` consecutive tokens, in order, as a list of slices.

    A sequence shorter than `n` yields an empty list.
    """
    window_count = len(tokens) - (n - 1)
    return [tokens[start:start + n] for start in range(window_count)]

In [None]:
#Define a function to create a dictionary from n-grams, using middle word as the key.
def ngrams_to_dictionary(ngrams):
    """Group n-grams by their middle word.

    The key position is `len(ngrams[0]) // 2`, taken from the first n-gram
    and applied to all of them. For an even length such as 6 this is
    index 3, i.e. three words before the key and two after it.

    Parameters:
        ngrams: a list of n-grams (lists of tokens).

    Returns:
        A dict mapping each middle word to the list of n-grams it is the
        middle of, in their original order. An empty input gives an empty dict.
    """
    ngram_dictionary = {}

    #An empty list (e.g. a text shorter than the window) has no ngrams[0] to measure
    if not ngrams:
        return ngram_dictionary

    keyindex = len(ngrams[0]) // 2

    for ngram in ngrams:
        #setdefault creates the list on first sight of a key, then appends either way
        ngram_dictionary.setdefault(ngram[keyindex], []).append(ngram)
    return ngram_dictionary

In [None]:
#Call your functions
#Change the number to change your context window
#(i.e. how many words you want around the keyword)

ngrams = make_ngrams(all_words_no_stop, 6)

keyword_in_context = ngrams_to_dictionary(ngrams)

#Use .get so a keyword that never appears as a middle word shows an empty list
#instead of stopping the notebook with a KeyError
keyword_in_context.get('sun', [])

#### Most Frequent Neighboring Words

What if we want to find the most frequent neighboring words that appear close to a particular keyword?

In [None]:
#Define a function to return most frequent words
#that appear next to a particular keyword
def get_neighbor_words(keyword, ngrams):
    """Count the words that share an n-gram with `keyword`.

    The keyword is lowercased before comparison. Every n-gram containing it
    contributes all of its other words, so a word is counted once for each
    n-gram it shares with the keyword.

    Returns:
        A list of (word, count) pairs ordered from most to least frequent.
    """
    target = keyword.lower()
    neighbor_counts = Counter()

    for ngram in ngrams:
        #Skip n-grams that do not contain the keyword at all
        if target not in ngram:
            continue
        neighbor_counts.update(word for word in ngram if word != target)

    return neighbor_counts.most_common()

In [None]:
#Call your functions
#Build the 6-word n-grams, then count the words that share an n-gram with 'sun'
ngrams = make_ngrams(all_words_no_stop, 6)

#Rebuilds the keyword dictionary; the get_neighbor_words call below only uses ngrams
keyword_in_context = ngrams_to_dictionary(ngrams)

get_neighbor_words('sun', ngrams)

### Keywords in Context across multiple files within a directory

Same as above but for multiple files within a directory.

In [None]:
#Set path to your corpus
#define that you want to analyze all .txt files in the directory
directory_path = 'soderberg-corpus'
#glob returns its matches in arbitrary order; sort them so the files
#are processed and reported in the same order on every run
text_files = sorted(glob.glob(f'{directory_path}/*.txt'))
print(text_files)

In [None]:
#Tokenize the text files and append tokens to all_docs
#This creates a list of lists: one list of tokens for each document
all_docs = []

def tokenize(text):
    """Return the alphabetic words of `text`, lowercased, as a list.

    Splitting happens on runs of anything that is not a letter or digit
    (whitespace, punctuation, underscores), and only tokens made up entirely
    of letters are kept, so numbers and empty strings are discarded.
    """
    lowercase_text = text.lower()
    #Include '_' in the split: \W alone keeps it, and '_sun_' would then fail
    #.isalpha() and be lost as a whole word
    split_words = re.split(r'[\W_]+', lowercase_text)
    return [word for word in split_words if word.isalpha()]

#Read each file, close it, then tokenize its text and store the tokens
for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()
    tokenized_text = tokenize(text)
    all_docs.append(tokenized_text)

In [None]:
#Remove stopwords with custom stopwords

#Read in custom stopwords txt as a list, one entry per line
with open("custom-stopwords.txt", "r") as file_object:
    custom_stopwords = [line.rstrip('\n') for line in file_object]

#Define function to remove stopwords
def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens that are not stopwords, keeping their order.

    Parameters:
        list_of_tokens: the tokens to filter.
        stopwords: an iterable of stopwords, or one string of
            whitespace-separated stopwords.
    """
    #A bare string would be searched for substrings ('he' in 'the' is True),
    #so break it into whole words before comparing
    if isinstance(stopwords, str):
        stopwords = stopwords.split()
    #Build the set once so each token is checked in constant time
    stopword_set = set(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

#Remove stopwords from every document in all_docs,
#giving one filtered token list per document
all_docs_no_stop = [
    remove_stopwords(doc_tokens, custom_stopwords) for doc_tokens in all_docs
]

In [None]:
#Define a function to return list of ngrams
def make_ngrams(tokens, n):
    """Slide a window of `n` tokens over `tokens` and return each window.

    The windows are returned in order as slices of `tokens`; when there are
    fewer than `n` tokens the result is an empty list.
    """
    last_start = len(tokens) - n
    return [tokens[i:i + n] for i in range(last_start + 1)]

In [None]:
#Define a function to create a dictionary from n-grams, using middle word as the key.
#To figure out the keyword for each n-gram we can use the index positions of the list.
def ngrams_to_dictionary(ngrams):
    """Build a dictionary that maps each middle word to its n-grams.

    The middle position is `len(ngrams[0]) // 2`, measured on the first
    n-gram and used for all of them (index 3 for 6-word n-grams).

    Parameters:
        ngrams: a list of n-grams (lists of tokens).

    Returns:
        A dict of middle word -> list of the n-grams centered on it, in
        input order. An empty list of n-grams gives an empty dict, so a
        document too short to produce any n-gram does not raise an error.
    """
    ngram_dictionary = {}

    #Without this guard, ngrams[0] raises IndexError for a document with no n-grams
    if not ngrams:
        return ngram_dictionary

    keyindex = len(ngrams[0]) // 2

    for ngram in ngrams:
        #Start a list the first time a key is seen, then append in every case
        ngram_dictionary.setdefault(ngram[keyindex], []).append(ngram)
    return ngram_dictionary

In [None]:
#Build one n-gram dictionary per document and collect them
#in a list called keywords
#Change the number (6) to change the size of the context window
#(i.e. the number of words around the keyword)

keywords = [
    ngrams_to_dictionary(make_ngrams(doc_tokens, 6))
    for doc_tokens in all_docs_no_stop
]

In [None]:
#Define a function that goes through the list of ngram dictionaries
#and prints, for each text, the ngrams stored under a given keyword
#or a line saying that the keyword is not in that text's dictionary
def lookup_keyword(kw, dictionaries):
    """Print the n-grams stored under `kw` in each dictionary.

    File names come from the module-level `text_files` list, indexed by
    position, so `dictionaries` must be in the same order as `text_files`.
    """
    for index, dictionary in enumerate(dictionaries):
        text_name = text_files[index]
        if kw not in dictionary:
            print(f"{kw} not in file: {text_name}\n")
            continue
        print(text_name, dictionary[kw], "\n")

In [None]:
#Print the n-grams that have the given keyword as their middle word,
#for each text in the corpus
lookup_keyword('god', keywords)

#### Most Frequent Neighboring Words across multiple files within a directory

In [None]:
from collections import Counter

In [None]:
#Look up most frequent words that appear next to a given keyword

#Define a function to return most frequent words
#that appear next to a particular keyword
def get_neighbor_words(keyword, ngrams):
    """Return (word, count) pairs for the words found in n-grams with `keyword`.

    The keyword is lowercased first. Each n-gram that contains it adds all of
    its other words to the tally. The pairs are ordered from most to least
    frequent.
    """
    target = keyword.lower()
    neighbor_counts = Counter(
        word
        for ngram in ngrams
        if target in ngram
        for word in ngram
        if word != target
    )
    return neighbor_counts.most_common()

In [None]:
#Build the ngrams for each file and collect them
#in a list called all_ngrams
#Change the number to change the context window

all_ngrams = [make_ngrams(doc_tokens, 6) for doc_tokens in all_docs_no_stop]

In [None]:
#Define a function to go through the per-text ngrams above
#and print the most common neighbor words for a given keyword
def lookup_neighbor_words(keyword, ngram_list):
    """Print each text's name followed by its neighbor-word counts for `keyword`.

    Names are read from the module-level `text_files` list by position, so
    `ngram_list` must be in the same order as `text_files`.
    """
    for index, text_ngrams in enumerate(ngram_list):
        print(text_files[index], get_neighbor_words(keyword, text_ngrams), '\n')

In [None]:
#Look up the most common neighbor words for a given keyword
#for each text in the corpus (change 'sun' to look up another keyword)
lookup_neighbor_words('sun', all_ngrams)

_Acknowledgements_: This notebook is inspired by Melanie Walsh’s [_Introduction to Cultural Analytics & Python_](https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Chinese/03-POS-Keywords-Chinese.html#keyword-extraction) and William Turkel and Adam Crymble's ["Keywords in Context (using n-grams) with Python"](https://programminghistorian.org/en/lessons/keywords-in-context-using-n-grams).