# Word Frequency: Counting Words

## Counting word frequencies for a single text

In [None]:
#Import the libraries we will need
import re
from collections import Counter

In [None]:
#Set path to our file
#(a relative path to a single text inside the corpus folder;
#change it to analyze a different text)
text_file = 'soderberg-corpus/1897_Drizzle.txt'

### Tokenizing: splitting your texts into units of analysis (in this case words)

In [None]:
#Define a tokenizing function
def tokenize(text):
    """Split a text into lowercase, purely alphabetic word tokens.

    The text is lowercased and then split at every run of characters
    that are not "word" characters (a word character is a letter, a
    digit or an underscore), which removes whitespace and punctuation.
    Of the resulting pieces only those made up entirely of letters are
    kept (checked with .isalpha()), so numbers and any piece containing
    a digit or an underscore are dropped.
    """
    pieces = re.compile(r'\W+').split(text.lower())
    tokens = []
    for piece in pieces:
        if piece.isalpha():
            tokens.append(piece)
    return tokens

In [None]:
#Read the whole file into memory, then tokenize it
#(all_words ends up holding all the words/tokens of the text, in order)
with open(text_file, 'r', encoding='utf-8') as file:
    text = file.read()
all_words = tokenize(text)

In [None]:
#Print first twenty tokens in your list of tokens
#(a quick way to check what the tokenizer produced)
print(all_words[:20])

### Unfiltered Most Frequent words

In [None]:
#Define a function to count the most frequent words
#You can change how many words are returned with the number_of_top_words argument
def print_frequent_words(list_of_tokens, number_of_top_words=50):
    """Return the most frequent tokens together with their counts.

    Despite its name, this function does not print anything: it returns
    the result so that it can be printed or turned into a dataframe.

    Parameters
    ----------
    list_of_tokens : iterable
        The tokens to count (for example the output of a tokenizer).
    number_of_top_words : int or None, default 50
        How many of the most frequent tokens to return. None returns
        every distinct token.

    Returns
    -------
    list of (token, count) tuples
        Sorted from most to least frequent. Tokens with equal counts
        keep the order in which they were first encountered.
    """
    words_tally = Counter(list_of_tokens)
    return words_tally.most_common(number_of_top_words)

In [None]:
#Count the most frequent words
#by calling the function on your list of tokens
#(the result is a list of (word, count) pairs, most frequent first)
most_frequent = print_frequent_words(all_words)
print(most_frequent)

### Removing Stopwords

To make the results more meaningful, we can filter out words we don't want in our analyses by using a stopwords list.

In [None]:
#Getting the stopwords list from spaCy
import spacy

#Download the language model you're interested in 
#(this is for english)
!python -m spacy download en_core_web_md

In [None]:
#Load the language model and take the stopwords list that comes with it
nlp = spacy.load('en_core_web_md')
stopwords = nlp.Defaults.stop_words
#Show the stopwords in alphabetical order
sorted(stopwords)

In [None]:
#Write out the spacy stopwords list to a txt file, one word per line
#Mode "x" creates the file only if it does not exist yet. Appending instead
#would add the whole list again every time this cell is re-run, duplicating
#entries and bringing back any words you deleted from the file by hand.
try:
    with open("custom-stopwords.txt", "x") as file_object:
        for word in sorted(stopwords):
            file_object.write(word + '\n')
except FileExistsError:
    #Keep the existing file (and any edits made to it) untouched
    print("custom-stopwords.txt already exists - keeping it as it is. "
          "Delete the file and re-run this cell to start over.")

Open the file and look over the stopwords list. Are there any you want to keep or remove?

In [None]:
#Load your custom stopwords and define a function to remove them from your list of tokens
#The file holds one stopword per line. Splitting its contents on whitespace and
#storing the words in a set makes `token not in stopwords` an exact whole-word check.
#(Keeping the file contents as one long string would make it a substring check,
#so a token would be dropped whenever it merely occurs inside some stopword.)
with open("custom-stopwords.txt", "r") as file_object:
    custom_stopwords = set(file_object.read().split())

def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens that are not stopwords, in their original order.

    Parameters
    ----------
    list_of_tokens : iterable of str
        The tokens to filter.
    stopwords : str or iterable of str
        The words to remove. A single string (for example the raw
        contents of a stopwords file) is split on whitespace into
        individual words first.

    Returns
    -------
    list of str
        Every token that does not exactly match a stopword.
    """
    if isinstance(stopwords, str):
        #`token in some_string` is a substring test, which would also remove
        #tokens that only occur inside a stopword; compare whole words instead
        stopwords = stopwords.split()
    stopword_set = set(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

#Filter the tokens of the single text with your custom stopwords
all_words_no_stop = remove_stopwords(all_words, custom_stopwords)

In [None]:
#Run word frequency function on your list of tokens with stopwords removed
#(the same counting as before, now on the filtered tokens)
most_frequent_filtered = print_frequent_words(all_words_no_stop)
print(most_frequent_filtered)

Are there still words you want to add to your stopwords list? Open your stopwords file and add the words to the list, then run the two cells above again.

### Visualizing the most frequent words with pandas

In [None]:
import pandas as pd

In [None]:
#Make dataframe of the frequent words in the text
#(each (word, count) pair becomes one row with the columns 'word' and 'count')
top_words_df = pd.DataFrame(most_frequent_filtered, columns = ['word', 'count'])
top_words_df

In [None]:
#Write out as csv file (if you want to save the frequent words)
#top_words_df.to_csv('drizzle_frequent.csv', index = False)

In [None]:
#Keep only the first ten rows, i.e. the ten most frequent words
top_10_df = top_words_df.iloc[:10]
top_10_df

In [None]:
#Create bar plot of top 10 frequent words
import matplotlib.pyplot as plt
import seaborn as sns

#Create the figure and its axes first, then draw the bars on those axes
fig, ax = plt.subplots(figsize=(2,7))
figure = sns.barplot(data=top_10_df, x='count', y='word', ax=ax)

# Save the figure
#plt.savefig("drizzle_top_10.pdf", bbox_inches = 'tight')

## Counting the most frequent words across all the documents

In [None]:
from pathlib import Path  
import glob

In [None]:
#Set path to your corpus
#define that you want to analyze all .txt files in the directory
directory_path = 'soderberg-corpus'
#glob returns the files in arbitrary order. Sorting the paths makes every run
#process the texts in the same order, so that words with equal counts are
#always ranked the same way in the frequency lists below.
text_files = sorted(glob.glob(f'{directory_path}/*.txt'))
print(text_files)

In [None]:
#Tokenize each text file and collect the tokens in all_docs
#This creates one flat list of all the words from all the documents
all_docs = []

for filepath in text_files:
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()
    tokenized_text = tokenize(text)
    all_docs += tokenized_text

In [None]:
#Remove stopwords
#(filters the tokens of all documents with your custom stopwords)
all_docs_no_stop = remove_stopwords(all_docs, custom_stopwords)

In [None]:
#Run word frequency function on your list of tokens with stopwords removed
#These are the most frequent words from all the documents
#(counted over the whole corpus at once, not per document)
most_frequent_all_docs_filtered = print_frequent_words(all_docs_no_stop)
print(most_frequent_all_docs_filtered)

## What are the most frequent words across all the short stories?

In [None]:
#Read in the metadata csv to a pandas dataframe
#(the cells below use its 'genre' and 'filename' columns)
metadata_df = pd.read_csv('Söderberg-corpus-metadata.csv')
metadata_df

In [None]:
#Keep only the rows whose genre is 'short story'
is_short_story = metadata_df['genre'] == 'short story'
short_stories = metadata_df.loc[is_short_story]
short_stories

In [None]:
#Pull the filename column out as an array: one filename per short story
short_stories_filenames = short_stories['filename'].values
short_stories_filenames

In [None]:
#Tokenize the short story files and append their tokens to all_short_stories
#This creates a list of tokens for all the short stories
all_short_stories = []

for filepath in text_files:
    #A file counts as a short story if any filename from the metadata occurs
    #in its path. any() stops at the first match, so each file is read and
    #counted at most once, even when several metadata entries match the same
    #path (for example a filename that is listed twice).
    if any(short_story in filepath for short_story in short_stories_filenames):
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            tokenized_text = tokenize(text)
            all_short_stories.extend(tokenized_text)

In [None]:
#Remove stopwords
#(filters the tokens of all short stories with your custom stopwords)
all_words_ss_no_stop = remove_stopwords(all_short_stories, custom_stopwords)

In [None]:
#Count most frequent words for all short stories
#(counted over all short stories together, with stopwords removed)
most_frequent_ss = print_frequent_words(all_words_ss_no_stop)
print(most_frequent_ss)

### Creating a barplot of most frequent words for short stories

In [None]:
#Make dataframe of the most frequent words from all short stories
#(each (word, count) pair becomes one row with the columns 'word' and 'count')
ss_top_words_df = pd.DataFrame(most_frequent_ss, columns = ['word', 'count'])

ss_top_words_df 

In [None]:
#Keep only the first ten rows, i.e. the ten most frequent words
ss_top_10_df = ss_top_words_df.iloc[:10]
ss_top_10_df

In [None]:
#Create bar plots of top 10 most frequent words for all short stories
import matplotlib.pyplot as plt
import seaborn as sns

#Create the figure and its axes first, then draw the bars on those axes
fig, ax = plt.subplots(figsize=(2,7))
figure = sns.barplot(data=ss_top_10_df, x='count', y='word', ax=ax)

## Save the figure
#plt.savefig("ss_top_10.pdf", bbox_inches = 'tight')

What if we want to count the most frequent words for novels? How would you go about doing that?