## Merging all articles into one text file

In [None]:
import glob

In [None]:
# Set the path to your corpus directory
# (a relative path, so it is resolved against the current working directory)
directory_path = 'soderberg-corpus'

In [None]:
# Use glob to get all the text files in the directory.
# glob returns paths in whatever order the file system yields them, so the
# list is sorted to make the order of the merged articles reproducible.
text_files = sorted(glob.glob(f'{directory_path}/*.txt'))
text_files

In [None]:
# Create (or overwrite) "soderberg-corpus.txt" and copy every article into it.
# Each article is read in full and written out followed by a newline,
# so the contents of different files stay separated in the merged file.
with open("soderberg-corpus.txt", "w", encoding='utf-8') as merged_file:
    for article_path in text_files:
        with open(article_path, 'r', encoding='utf-8') as article_file:
            merged_file.write(article_file.read())
            merged_file.write("\n")

## Write out Keyword in Context results to a text file

### Keyword in Context across multiple files within a directory

In [None]:
import glob
import re

In [None]:
#Set the path to your corpus and
#collect all .txt files in that directory for analysis.
#glob returns paths in file-system order, so the list is sorted to keep
#the per-file output in a reproducible order from run to run.
directory_path = 'soderberg-corpus'
text_files = sorted(glob.glob(f'{directory_path}/*.txt'))
print(text_files)

In [None]:
#Tokenize the text files and append each file's tokens to all_docs
#This creates a list of lists: one list of tokens per document
all_docs = []

def tokenize(text):
    """Lowercase *text* and return its purely alphabetic words as a list.

    The text is split on runs of non-word characters; pieces that are empty
    or contain anything other than letters (digits, underscores) are dropped.
    """
    candidates = re.split(r'\W+', text.lower())
    return list(filter(str.isalpha, candidates))

# Read each corpus file as UTF-8 and store its token list in all_docs,
# in the same order as text_files.
for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as corpus_file:
        all_docs.append(tokenize(corpus_file.read()))

In [None]:
#Remove stopwords with custom stopwords

#Read in the custom stopwords txt as a list (one stopword per line).
#The file is decoded as UTF-8, like the corpus files, so non-ASCII stopwords
#compare equal to the tokens; surrounding whitespace is stripped and blank
#lines are skipped so stray spaces cannot prevent a match.
with open("custom-stopwords.txt", "r", encoding="utf-8") as file_object:
    custom_stopwords = [line.strip() for line in file_object if line.strip()]

#Define function to remove stopwords
def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens that are not stopwords, keeping their original order.

    Args:
        list_of_tokens: Iterable of tokens (strings) to filter.
        stopwords: Iterable of stopword strings to exclude.

    Returns:
        A new list containing every token that is not in ``stopwords``.
    """
    # Build the lookup set once so each membership test is constant time
    # instead of a scan through the whole stopword list per token.
    stopword_set = frozenset(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

#Filter the stopwords out of every document in all_docs,
#producing one stopword-free token list per document
all_docs_no_stop = [
    remove_stopwords(doc_tokens, custom_stopwords) for doc_tokens in all_docs
]

In [None]:
#Define a function that slides a window of n tokens over a token list
def make_ngrams(tokens, n):
    """Return every run of n consecutive items in *tokens*, in order.

    Each n-gram is a slice of *tokens*. When *tokens* holds fewer than n
    items, the result is an empty list.
    """
    window_count = len(tokens) - (n - 1)
    return [tokens[start:start + n] for start in range(window_count)]

In [None]:
#Define a function to create a dictionary from n-grams, using the middle word as the key.
#The keyword of each n-gram is found through its index position in the n-gram.
def ngrams_to_dictionary(ngrams):
    """Group n-grams by the token at their middle index.

    Args:
        ngrams: List of n-grams (each a sequence of tokens). The key index is
            derived from the length of the first n-gram, so all n-grams are
            expected to have the same length.

    Returns:
        A dict mapping each keyword to the list of n-grams in which it sits
        at index ``len(ngram) // 2``. For an even length this is the token
        just right of center (index 3 of 6). An empty ``ngrams`` list yields
        an empty dict.
    """
    # A text with fewer tokens than the window size produces no n-grams;
    # return an empty dictionary instead of failing on ngrams[0].
    if not ngrams:
        return {}

    keyindex = len(ngrams[0]) // 2

    ngram_dictionary = {}
    for ngram in ngrams:
        # setdefault creates the list on first sight of a keyword.
        ngram_dictionary.setdefault(ngram[keyindex], []).append(ngram)
    return ngram_dictionary

In [None]:
#Build one keyword-in-context dictionary per file and collect them
#in a list called keywords
#Change the number (6) to change the size of the context window
#(i.e. the number of words in each n-gram around the keyword)
keywords = [
    ngrams_to_dictionary(make_ngrams(doc_tokens, 6))
    for doc_tokens in all_docs_no_stop
]

In [None]:
#Define a function that walks the list of ngram dictionaries and reports,
#for every file, either the ngrams stored under a keyword
#or a line saying the keyword is not in that file
def lookup_keyword(kw, dictionaries):
    """Return a report of the n-grams stored under *kw* for each file.

    The i-th dictionary is labelled with the i-th entry of the module-level
    ``text_files`` list. Each file contributes one entry, followed by a blank
    line: the file name plus its n-grams, or a "not in file" message.
    """
    sections = []
    for position, dictionary in enumerate(dictionaries):
        text_name = text_files[position]
        if kw not in dictionary:
            sections.append(f"{kw} not in file: {text_name}\n\n")
            continue
        sections.append(text_name + " " + str(dictionary[kw]) + "\n\n")
    return "".join(sections)

In [None]:
#Look up the n-grams (context windows) stored under a given keyword
#for each text in the corpus.
#The keyword is matched as-is against the lowercased tokens, so give it in lowercase.
kw_in_context = lookup_keyword('god', keywords)
kw_in_context

In [None]:
#Write out the keywords in context to a file called "keyword_in_context.txt"
#The encoding is set to UTF-8 explicitly (matching how the corpus is read) so
#non-ASCII tokens are written correctly whatever the platform default is.
with open("keyword_in_context.txt", mode="w", encoding="utf-8") as file_object:
    file_object.write(str(kw_in_context))

#### Most Frequent Neighboring Words across multiple files within a directory

In [None]:
from collections import Counter

In [None]:
#Look up the most frequent words that appear near a given keyword

#Define a function that counts the words sharing an n-gram with a keyword
def get_neighbor_words(keyword, ngrams):
    """Count the words that share an n-gram with *keyword*.

    The keyword is lowercased before matching. For every n-gram containing
    it, each other word in that n-gram is counted once per occurrence.

    Returns:
        A list of ``(word, count)`` pairs ordered from most to least common.
    """
    target = keyword.lower()
    tally = Counter()
    for window in ngrams:
        if target not in window:
            continue
        tally.update(word for word in window if word != target)
    return tally.most_common()

In [None]:
#Build the ngrams for every file and collect them,
#one list of ngrams per file, in a list called all_ngrams
#Change the number to change the context window
all_ngrams = [make_ngrams(doc_tokens, 6) for doc_tokens in all_docs_no_stop]

In [None]:
#Define a function that goes through the per-file ngrams built above
#and reports the most common neighbor words of a given keyword
def lookup_neighbor_words(keyword, ngram_list):
    """Return a per-file report of the neighbor-word counts for *keyword*.

    The i-th list of n-grams is labelled with the i-th entry of the
    module-level ``text_files`` list. Each file contributes its name and its
    ``get_neighbor_words`` result, followed by a blank line.
    """
    sections = []
    for position, text_ngrams in enumerate(ngram_list):
        text_name = text_files[position]
        neighbor_counts = get_neighbor_words(keyword, text_ngrams)
        sections.append(text_name + " " + str(neighbor_counts) + "\n\n")
    return "".join(sections)

In [None]:
#Look up the most common neighbor words for a given keyword
#for each text in the corpus (the result is one report string covering all files)
most_frequent_neighbor_words = lookup_neighbor_words('sun', all_ngrams)
most_frequent_neighbor_words

In [None]:
# Write out the most frequent neighbor words to a file called "most_frequent_neighbor_words.txt"
# The encoding is set to UTF-8 explicitly (matching how the corpus is read) so
# non-ASCII words are written correctly whatever the platform default is.
with open("most_frequent_neighbor_words.txt", mode="w", encoding="utf-8") as file_object:
    file_object.write(str(most_frequent_neighbor_words))