In [1]:
# 3.2 - corpus word co-occurrence
#
# code for the Intuitive Text Mining book
# Tariq Rashid, 2018

In [2]:
# collections for counter
import collections

# pandas for dataframes
import pandas

# glob module for finding files that match a pattern, os module for file basename
import glob
import os

In [3]:
# function to clean and simplify text for indexing and also querying

def clean_text(text):
    
    # make lowercase
    cleaned_text = text.lower()

    # keep only normal letters a-z, numbers, spaces, and new lines, remove punctuation
    cleaned_text = [char for char in cleaned_text if char.isalnum() or char==" " or char=='\n']
    cleaned_text = "".join(cleaned_text)
    
    return cleaned_text

In [4]:
# read text documents and build occurrence matrix

# corpus location and text filename pattern
corpus_directory = 'data_sets/recipes/'
text_filename_pattern = 'txt/??.txt'

# list of text files
list_of_text_files = glob.glob(corpus_directory + text_filename_pattern)

# count cooccurrences of words
word_pair_ctr = collections.Counter()

# read text from every text file
for document_file_path in list_of_text_files:
    document_name = os.path.basename(document_file_path)
    print("reading from ...", document_name)
    with open(document_file_path, "r") as f:
        
        # read text content of file
        text_content = f.read()
        
        # clean text content
        cleaned_text = clean_text(text_content)

        # split the text into words
        word_list = cleaned_text.split()
        
        # min word length
        word_list = [word for word in word_list if len(word) > 4]
        
        # word pairs
        first_words = word_list[:-1]
        second_words = word_list[1:]
        word_pair_list = zip(first_words, second_words)
        
        # reorder word pairs so (b,a) is counted as (a,b)
        word_pair_list = [(a,b) if (a < b) else (b,a) for (a,b) in word_pair_list]

        # counts for each word pair
        word_pair_ctr += collections.Counter(word_pair_list)
        
        pass
    pass

reading from ... 15.txt
reading from ... 01.txt
reading from ... 00.txt
reading from ... 14.txt
reading from ... 02.txt
reading from ... 16.txt
reading from ... 17.txt
reading from ... 03.txt
reading from ... 07.txt
reading from ... 13.txt
reading from ... 12.txt
reading from ... 06.txt
reading from ... 10.txt
reading from ... 04.txt
reading from ... 05.txt
reading from ... 11.txt
reading from ... 08.txt
reading from ... 20.txt
reading from ... 21.txt
reading from ... 09.txt
reading from ... 19.txt
reading from ... 18.txt


In [5]:
# most common word pairs

word_pair_series= pandas.Series(word_pair_ctr).sort_values(ascending=False)
word_pair_series.sort_values(ascending=False)

cheese          grated            16
sauce           tomato            10
bread           crumbs             7
brown           stock              6
sieve           through            5
quantity        small              4
paste           tomato             4
salsa           sauce              4
butter          piece              4
little          pieces             4
broth           water              4
boiled          chicken            3
froth           white              3
concentrated    paste              3
little          pepper             3
board           bread              3
pepper          season             3
paste           sheet              3
butter          cheese             3
nutmeg          taste              3
tablespoonfuls  three              3
boiling         water              3
celery          piece              3
browned         butter             3
sauce           white              3
celery          parsley            2
changing        water              2
p

In [6]:
word_pair_series['grated']

pinch            2
ground           1
teaspoonful      1
lukewarm         1
layers           1
tablespoonful    1
sprinkle         1
pepper           1
moderately       1
little           1
marrow           1
dtype: int64

In [7]:
word_pair_series['cheese']

grated         16
parmesan        2
seasoned        1
taste           1
nutmeg          1
crumbs          1
three           1
tablespoons     1
little          1
dispose         1
melted          1
crumb           1
remove          1
dtype: int64

In [43]:
# create dataframe from series

word_pair_df = word_pair_series.reset_index()
word_pair_df.columns =['source', 'target', 'weight']

In [44]:
import d3fdgraph

In [57]:
d3fdgraph.plot_force_directed_graph(word_pair_df[word_pair_df['weight']>1])

In [55]:
word_pair_df.head()

Unnamed: 0,source,target,weight
0,cheese,grated,16
1,sauce,tomato,10
2,bread,crumbs,7
3,brown,stock,6
4,sieve,through,5
