# Creating Co-occurrence Statistics from Text

This tutorial shows how to compute **co-occurrence statistics** for words in a given text corpus.

The statistic we compute for each co-occurring pair is the [pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) (PMI), which is used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm for learning embeddings.

The steps of the tutorial are the following:
1. Download the movie reviews dataset from the [NLTK data repository](http://www.nltk.org/nltk_data/).
2. Implement and run the co-occurrence calculation Beam pipeline.
3. Read the produced statistics.

<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/text2emb/01-Creating_Co-occurence_Statistics_from_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
#!pip3 install apache_beam[gcp]

In [2]:
import os
import sys
import multiprocessing
import math
import apache_beam as beam
import tensorflow as tf
from datetime import datetime

In [3]:
# Root folder for everything this notebook downloads or produces.
WORKSPACE = './workspace'
# Corpus name; also the sub-folder of DATA_DIR that holds its text files.
DATASET = 'movie_reviews'
# Input (raw text) and output (co-occurrence stats) folders.
DATA_DIR = WORKSPACE + '/data'
COOC_DIR = WORKSPACE + '/cooc'

## 1. Download Dataset

In [4]:
# !wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/movie_reviews.zip -P {DATA_DIR}/
# !unzip {DATA_DIR}/movie_reviews.zip -d {DATA_DIR}/
# !rm {DATA_DIR}/movie_reviews.zip
# !mv {DATA_DIR}/movie_reviews/pos/* {DATA_DIR}/{DATASET}/
# !mv {DATA_DIR}/movie_reviews/neg/* {DATA_DIR}/{DATASET}/

In [5]:
!ls {DATA_DIR}/{DATASET}

cv000_29416.txt  cv250_26462.txt  cv501_11657.txt  cv751_17208.txt
cv000_29590.txt  cv251_22636.txt  cv501_12675.txt  cv752_24155.txt
cv001_18431.txt  cv251_23901.txt  cv502_10406.txt  cv752_25330.txt
cv001_19502.txt  cv252_23779.txt  cv502_10970.txt  cv753_10875.txt
cv002_15918.txt  cv252_24974.txt  cv503_10558.txt  cv753_11812.txt
cv002_17424.txt  cv253_10077.txt  cv503_11196.txt  cv754_7216.txt
cv003_11664.txt  cv253_10190.txt  cv504_29120.txt  cv754_7709.txt
cv003_12683.txt  cv254_5870.txt   cv504_29243.txt  cv755_23616.txt
cv004_11636.txt  cv254_6027.txt   cv505_12090.txt  cv755_24881.txt
cv004_12641.txt  cv255_13683.txt  cv505_12926.txt  cv756_22540.txt
cv005_29357.txt  cv255_15267.txt  cv506_15956.txt  cv756_23676.txt
cv005_29443.txt  cv256_14740.txt  cv506_17521.txt  cv757_10189.txt
cv006_15448.txt  cv256_16529.txt  cv507_9220.txt   cv757_10668.txt
cv006_17022.txt  cv257_10975.txt  cv507_9509.txt   cv758_9671.txt
cv007_4968.txt	 cv257_11856.txt  cv508_16006.txt  cv758_9740.txt


## 2. Implement and Run the Beam pipeline

### Stop words

In [6]:
# English stop words. Tokens found in the list handed to the pipeline as its
# 'stop_words' argument are dropped during tokenization.
stop_words = (
    "i me my myself we our ours ourselves "
    "you your yours yourself yourselves he him "
    "his himself she her hers herself it its "
    "itself they them their theirs themselves what "
    "which who whom this that these those am is "
    "are was were be been being have has had "
    "having do does did doing a an the and but "
    "if or because as until while of at by for "
    "with about against between into through during before "
    "after above below to from up down in out on off "
    "over under again further then once here there when where "
    "why how all any both each few more most other some "
    "such no nor not only own same so than too very s t "
    "can will just don should now"
).split()

### Pipeline steps

In [7]:
def read_text(pipeline, source_data_location):
    """Read the text files matching a pattern, one element per line.

    Args:
        pipeline: Beam pipeline object the read step is attached to.
        source_data_location: File pattern passed to `beam.io.ReadFromText`.

    Returns:
        The PCollection produced by `ReadFromText`.
    """
    reader = beam.io.ReadFromText(file_pattern=source_data_location)
    return pipeline | 'Read from files' >> reader


def tokenize(lines, min_sentence_length, stop_words):
    """Split text lines into lists of cleaned, lower-cased tokens.

    Punctuation characters (`string.punctuation`) are removed, the line is
    lower-cased and split on whitespace; purely numeric tokens and stop words
    are discarded.

    Args:
        lines: PCollection of text lines.
        min_sentence_length: Minimum number of remaining tokens a line must
            have to be kept.
        stop_words: Iterable of tokens to discard.

    Returns:
        PCollection of token lists, one per retained line.
    """

    # Built once at graph-construction time so every membership test on the
    # workers is a hash lookup rather than a scan over a list.
    excluded = frozenset(stop_words)

    def _tokenize(line, excluded):
        # Imported inside the helper, like the other worker-side helpers in
        # this file, so the function carries its own dependency.
        import string

        # A single translate pass strips punctuation; the original rebuilt a
        # punctuation set for every character of every line.
        cleaned = line.translate(str.maketrans('', '', string.punctuation))
        return [
            token for token in cleaned.lower().split()
            if not token.isdigit() and token not in excluded
        ]

    def _valid(tokens, min_sentence_length):
        return len(tokens) >= min_sentence_length

    line_words = (
        lines
        | 'Tokenize lines' >> beam.Map(_tokenize, excluded)
        | 'Filter invalid lines' >> beam.Filter(_valid, min_sentence_length)
    )
    return line_words


def compute_word_frequency(line_words, min_freq):
    """Count how often each word occurs and drop the rare ones.

    Args:
        line_words: PCollection of token lists.
        min_freq: Minimum total count a word needs to be kept.

    Returns:
        PCollection of (word, count) tuples with count >= min_freq.
    """

    def _per_line_counts(tokens):
        from collections import Counter
        return list(Counter(tokens).items())

    def _is_frequent(entry):
        return entry[1] >= min_freq

    per_line = (
        line_words | 'Word count per line' >> beam.FlatMap(_per_line_counts))
    totals = per_line | 'Compute word frequency' >> beam.CombinePerKey(sum)
    return totals | 'Filter infrequent words' >> beam.Filter(_is_frequent)


def adjust_word_frequency(cooccurance):
    """Sum, per item, the co-occurrence values of all pairs it appears in.

    Args:
        cooccurance: PCollection of (item1, item2, cooc) tuples.

    Returns:
        PCollection of (item, summed cooc) tuples.
    """

    def _split_pair(record):
        # Each pair contributes its value to both of its members.
        first, second, weight = record
        yield first, weight
        yield second, weight

    per_item = cooccurance | "Extract entries" >> beam.FlatMap(_split_pair)
    return per_item | "Adjust word frequency" >> beam.CombinePerKey(sum)


def vocabulary(item_frequency):
    """Keep only the item from each (item, frequency) record.

    Args:
        item_frequency: PCollection of (item, frequency) tuples.

    Returns:
        PCollection of items.
    """

    def _item_of(record):
        name, _ = record
        return name

    return item_frequency | "Extract item vocabulary" >> beam.Map(_item_of)


def compute_cooccurrence(line_words, window_size, vocab):
    """Compute distance-weighted co-occurrence values for vocabulary words.

    Within each line, out-of-vocabulary words are removed first. Every word
    is then paired with up to `window_size` words that follow it, and each
    pairing adds 1/offset to the pair's value, where offset is the distance
    between the two words. Pairs of identical words are skipped.

    Args:
        line_words: PCollection of token lists.
        window_size: Maximum distance between paired words.
        vocab: PCollection of vocabulary words.

    Returns:
        PCollection of (item_1, item_2, cooc) tuples with item_1 < item_2.
    """

    def _as_lookup_entry(word):
        return (word, True)

    def _extract_coocs_from_line(line_words, window_size, vocab):
        # Imported inside the helper, like the other worker-side helpers in
        # this file.
        from collections import defaultdict

        coocs = defaultdict(float)

        line_words = [word for word in line_words if word in vocab]

        for position, word1 in enumerate(line_words):
            # Never look past the window or the end of the line.
            window_extent = min(window_size + 1, len(line_words) - position)
            for offset in range(1, window_extent):
                word2 = line_words[position + offset]
                if word1 == word2:
                    continue
                # Order the pair so (a, b) and (b, a) share one key.
                pair = (min(word1, word2), max(word1, word2))
                coocs[pair] += (1.0 / offset)

        return list(coocs.items())

    def _format_output(record):
        key, value = record
        item_1, item_2 = key
        return (item_1, item_2, value)

    # A dict side input makes `word in vocab` a hash lookup; with a list side
    # input every token was compared against the whole vocabulary.
    vocab_lookup = vocab | 'Key vocab for lookup' >> beam.Map(_as_lookup_entry)

    cooccurrence = (
        line_words
        | 'Extract cooccurrence from line' >> beam.FlatMap(
            _extract_coocs_from_line, window_size,
            beam.pvalue.AsDict(vocab_lookup))
        | 'Aggregate cooccurrences' >> beam.CombinePerKey(sum)
        | 'Format cooc' >> beam.Map(_format_output)
    )
    return cooccurrence


def compute_total_cooc(cooccurrence):
    """Add up the co-occurrence values of all pairs.

    Args:
        cooccurrence: PCollection of (item_1, item_2, cooc) tuples.

    Returns:
        PCollection produced by globally summing the cooc values.
    """

    def _value_of(record):
        _, _, value = record
        return value

    values = cooccurrence | "Get cooc values" >> beam.Map(_value_of)
    return values | "Sum cooc values" >> beam.CombineGlobally(sum)

    
def create_top_pairs(index, item_frequency, top_count):
    """Generate every pair of the most frequent items with a cooc of 0.

    Args:
        index: Value embedded in the step labels ("... - shard {index}").
        item_frequency: PCollection of (item, frequency) tuples.
        top_count: Number of highest-frequency items to pair up.

    Returns:
        PCollection of (item_1, item_2, 0) tuples with item_1 <= item_2.
    """

    def _frequency(item_freq):
        return item_freq[1]

    def _generate_pairs(items):
        # Imported inside the helper, like the other worker-side helpers in
        # this file.
        from itertools import combinations

        names = [entry[0] for entry in items]
        return [
            (min(item1, item2), max(item1, item2), 0)
            for item1, item2 in combinations(names, 2)
        ]

    pairs = (
        item_frequency
        # Rank by frequency through `key=`; the positional comparator
        # argument used before is deprecated in Beam.
        | "Get top items - shard {}".format(index) >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(top_count, key=_frequency))
        | "Generate item pairs - shard {}".format(index) >> beam.FlatMap(
            _generate_pairs)
    )

    return pairs


def merge_pairs(pairs):
    """Flatten several pair collections and sum the values of equal pairs.

    Args:
        pairs: Tuple of PCollections holding (item1, item2, cooc) tuples.

    Returns:
        PCollection of (item1, item2, summed cooc) tuples, one per distinct
        (item1, item2) pair.
    """

    def _keyed(record):
        first, second, value = record
        return ((first, second), value)

    def _unkeyed(record):
        (first, second), value = record
        return (first, second, value)

    flattened = pairs | "Merge item pairs" >> beam.Flatten()
    keyed = flattened | "Use pair as key" >> beam.Map(_keyed)
    summed = keyed | "Group pairs by key" >> beam.CombinePerKey(sum)
    return summed | "Format pairs" >> beam.Map(_unkeyed)


def compute_score(data, word_frequency, total):
    """Attach a PMI score, a weight and a sample type to every pair.

    For a pair with value `v`, `cooc` is `v`, or 1 when `v` is 0. The score is
    log(cooc) - log(freq1) - log(freq2) + log(total), the weight is
    sqrt(cooc), and the type is 'N' when `v` is 0 and 'P' otherwise.

    Args:
        data: PCollection of (item1, item2, value) tuples.
        word_frequency: PCollection of (item, frequency) tuples, supplied to
            the workers as a dict side input. Both items of every pair are
            looked up in it.
        total: PCollection supplied as a singleton side input.

    Returns:
        PCollection of (item1, item2, pmi, weight, sample_type) tuples.
    """

    def _compute_pmi(record, word_frequency, total):

        import math

        item1, item2, value = record
        freq1 = word_frequency[item1]
        freq2 = word_frequency[item2]

        # A value of 0 marks a negative sample; score it as if cooc were 1.
        if value == 0:
            cooc, sample_type = 1, 'N'
        else:
            cooc, sample_type = value, 'P'

        weight = math.sqrt(cooc)
        pmi = math.log(cooc) - math.log(freq1) - math.log(freq2) + math.log(total)

        return (item1, item2, pmi, weight, sample_type)

    frequency_lookup = beam.pvalue.AsDict(word_frequency)
    total_value = beam.pvalue.AsSingleton(total)
    return data | "Compute pairewise mutual infromation" >> beam.Map(
        _compute_pmi, frequency_lookup, total_value)


def get_info(stats):
    """Summarise the scored pairs as a handful of text lines.

    Args:
        stats: PCollection of (item1, item2, score, weight, type) tuples.

    Returns:
        PCollection of strings: one '<type>: <count>' line per record type,
        plus 'min: <score>' and 'max: <score>'.
    """

    def _type_with_one(record):
        return (record[4], 1) if len(record) == 5 else _bad_record(record)

    def _bad_record(record):
        # Mirrors the failure of unpacking a record that is not a 5-tuple.
        _, _, _, _, record_type = record
        return (record_type, 1)

    def _score_of(record):
        _, _, score, _, _ = record
        return score

    def _label(prefix):
        def _fmt(value):
            return (prefix + ': {}').format(value)
        return _fmt

    counts = (
        stats
        | "Group by record type" >> beam.Map(_type_with_one)
        | "Count records" >> beam.CombinePerKey(sum)
        | "Fromat counts" >> beam.Map(
            lambda entry: '{}: {}'.format(entry[0], entry[1]))
    )

    scores = stats | "Get scores" >> beam.Map(_score_of)

    lowest = scores | "Get min score" >> beam.CombineGlobally(
        min).without_defaults()
    mins = lowest | "Format min score" >> beam.Map(_label('min'))

    highest = scores | "Get max score" >> beam.CombineGlobally(
        max).without_defaults()
    maxs = highest | "Format max score" >> beam.Map(_label('max'))

    return (counts, mins, maxs) | "Combine info" >> beam.Flatten()
    
    
def write_debug(data, sink_data_location, name):
    """Write a PCollection to the single text file 'debug-<name>.txt'.

    Args:
        data: PCollection to write.
        sink_data_location: Output directory.
        name: Suffix used in both the step label and the file name.
    """
    prefix = sink_data_location + "/debug-{}".format(name)
    writer = beam.io.WriteToText(
        file_path_prefix=prefix,
        file_name_suffix=".txt",
        shard_name_template='',
        num_shards=1)
    _ = data | 'Write debug {}'.format(name) >> writer
    
    
def write_log(info, sink_data_location):
    """Write the info lines to the single file 'info.log'.

    Args:
        info: PCollection to write.
        sink_data_location: Output directory.
    """
    prefix = sink_data_location + "/info"
    writer = beam.io.WriteToText(
        file_path_prefix=prefix,
        file_name_suffix=".log",
        shard_name_template='',
        num_shards=1)
    _ = info | 'Write logs' >> writer

    
def write_vocab(vocab, sink_data_location):
    """Write the vocabulary to the single file 'vocab.txt'.

    Args:
        vocab: PCollection to write.
        sink_data_location: Output directory.
    """
    prefix = sink_data_location + "/vocab"
    writer = beam.io.WriteToText(
        file_path_prefix=prefix,
        file_name_suffix=".txt",
        shard_name_template='',
        num_shards=1)
    _ = vocab | 'Write vocabulary file' >> writer
    
    
def write_to_tfrecords(stats, sink_data_location):
    """Serialize the scored pairs as tf.Example records in TFRecord files.

    Each record becomes an Example with bytes features 'item1', 'item2' and
    'type', and float features 'score' and 'weight'. Files are written with
    the prefix '<sink_data_location>/cooc' and suffix '.tfrecords'.

    Args:
        stats: PCollection of (item1, item2, score, weight, type) tuples.
        sink_data_location: Output directory.
    """

    def _to_tf_example(record):

        import tensorflow as tf

        def _text(value):
            return tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))

        def _number(value):
            return tf.train.Feature(
                float_list=tf.train.FloatList(value=[float(value)]))

        item1, item2, score, weight, record_type = record

        entry = {
            'item1': _text(item1),
            'item2': _text(item2),
            'score': _number(score),
            'weight': _number(weight),
            'type': _text(record_type),
        }

        return tf.train.Example(features=tf.train.Features(feature=entry))

    def _serialize(example):
        return example.SerializeToString(deterministic=True)

    examples = stats | 'Encode to tf.example' >> beam.Map(_to_tf_example)
    payloads = examples | 'Serialize to string' >> beam.Map(_serialize)
    # An optional `'Shuffle data' >> beam.Reshuffle()` step could go here.
    _ = payloads | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
        file_path_prefix=sink_data_location + "/cooc",
        file_name_suffix='.tfrecords')

### Pipeline

In [8]:
def run_text2cooc_pipeline(args):
    """Build and run the text-to-co-occurrence Beam pipeline.

    Args:
        args: Dict with the keys 'source_data_location',
            'sink_data_location', 'runner', 'stop_words', 'min_freq',
            'top_count', 'min_sentence_length' and 'window_size'. The whole
            dict is also forwarded to `PipelineOptions` as keyword arguments.
    """
    source = args['source_data_location']
    sink = args['sink_data_location']
    runner = args['runner']
    stop_words = args['stop_words']
    min_freq = args['min_freq']
    top_count = args['top_count']
    min_sentence_length = args['min_sentence_length']
    window_size = args['window_size']

    options = beam.options.pipeline_options.PipelineOptions(**args)

    with beam.Pipeline(runner, options=options) as pipeline:

        # Source files -> lines.
        lines = read_text(pipeline, source)
        #write_debug(lines, sink, 'lines')

        # Lines -> token lists.
        line_words = tokenize(lines, min_sentence_length, stop_words)
        #write_debug(line_words, sink, 'tokens')

        # (word, frequency) for words reaching min_freq.
        word_frequency = compute_word_frequency(line_words, min_freq)
        #write_debug(word_frequency, sink, 'word_freq')

        # Distinct items (vocabulary), written to the sink.
        vocab = vocabulary(word_frequency)
        write_vocab(vocab, sink)

        # (item_1, item_2, 0) for every pair of the most frequent items.
        negative_pairs = create_top_pairs(0, word_frequency, top_count)
        #write_debug(negative_pairs, sink, 'top_pairs')

        # (item_1, item_2, cooc) for items that co-occur within a window.
        cooccurrence = compute_cooccurrence(line_words, window_size, vocab)
        #write_debug(cooccurrence, sink, 'cooc')

        # Union of co-occurring (positive) and zero-valued (negative) pairs.
        all_pairs = merge_pairs((cooccurrence, negative_pairs))
        #write_debug(all_pairs, sink, 'all_pairs')

        # |D|: sum of all co-occurrence values.
        total = compute_total_cooc(cooccurrence)
        #write_debug(total, sink, 'total')

        # Per-item frequency derived from the co-occurrence values.
        adjusted_word_frequency = adjust_word_frequency(cooccurrence)
        #write_debug(adjusted_word_frequency, sink, 'adjusted-freq')

        # (item_1, item_2, pmi, weight, type); the adjusted frequencies are
        # used here instead of the raw word_frequency.
        stats = compute_score(all_pairs, adjusted_word_frequency, total)

        # Results as TFRecords.
        write_to_tfrecords(stats, sink)

        # Summary information about the created dataset.
        write_log(get_info(stats), sink)
    

## Run pipeline

In [9]:
from apache_beam.runners.portability import fn_api_runner
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.portability import python_urns

# Alternative: run everything in-process.
#runner = 'DirectRunner'
# FnApiRunner whose SDK workers are started as subprocesses of the current
# Python interpreter.
runner = fn_api_runner.FnApiRunner(
    default_environment=beam_runner_api_pb2.Environment(
        urn=python_urns.SUBPROCESS_SDK,
        payload=(
            b'%s -m apache_beam.runners.worker.sdk_worker_main'
            % sys.executable.encode('ascii')
        ),
    )
)

# Job name carries a UTC timestamp.
job_name = 'text2cooc-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))

args = dict(
    job_name=job_name,
    runner=runner,
    source_data_location='{}/{}/*.txt'.format(DATA_DIR, DATASET),
    sink_data_location=COOC_DIR,
    min_freq=25,
    top_count=3000,
    min_sentence_length=0,
    window_size=5,
    stop_words=[],  # pass `stop_words` here to filter them out
    direct_num_workers=multiprocessing.cpu_count(),
)

print("Pipeline args are set.")

Pipeline args are set.


In [10]:
# Start from an empty output directory: remove whatever an earlier run left
# behind, then recreate the folder.
if tf.io.gfile.exists(COOC_DIR):
    print("Removing {} contents...".format(COOC_DIR))
    tf.io.gfile.rmtree(COOC_DIR)

print("Creating output: {}".format(COOC_DIR))
tf.io.gfile.makedirs(COOC_DIR)

print("Running pipeline...")
# `%time` is an IPython magic; it reports the CPU and wall time of the call.
%time run_text2cooc_pipeline(args)
print("Pipeline is done.")

Removing ./workspace/cooc contents...
Creating output: ./workspace/cooc
Running pipeline...
CPU times: user 49.2 s, sys: 3.24 s, total: 52.5 s
Wall time: 4min 18s
Pipeline is done.


In [11]:
!ls {COOC_DIR}
!wc -l  {COOC_DIR}/vocab.txt
!head {COOC_DIR}/info.log

cooc-00000-of-00016.tfrecords  cooc-00009-of-00016.tfrecords
cooc-00001-of-00016.tfrecords  cooc-00010-of-00016.tfrecords
cooc-00002-of-00016.tfrecords  cooc-00011-of-00016.tfrecords
cooc-00003-of-00016.tfrecords  cooc-00012-of-00016.tfrecords
cooc-00004-of-00016.tfrecords  cooc-00013-of-00016.tfrecords
cooc-00005-of-00016.tfrecords  cooc-00014-of-00016.tfrecords
cooc-00006-of-00016.tfrecords  cooc-00015-of-00016.tfrecords
cooc-00007-of-00016.tfrecords  info.log
cooc-00008-of-00016.tfrecords  vocab.txt
4632 ./workspace/cooc/vocab.txt
min: -5.136295533922434
P: 905289
max: 9.184321336159051
N: 3821288


## 3. Read TFRecords using tf.data APIs

In [12]:
def make_input_fn(file_pattern, batch_size):
    """Create an input function that reads the co-occurrence TFRecords.

    Args:
        file_pattern: File pattern of the TFRecord files to read.
        batch_size: Number of records per batch.

    Returns:
        A zero-argument function returning the dataset built by
        `tf.data.experimental.make_batched_features_dataset`, read for one
        epoch without shuffling and with 'score' as the label key.
    """

    # `tf.io.FixedLenFeature` replaces the TF1-only top-level alias
    # `tf.FixedLenFeature`, so the spec also works under TensorFlow 2.x.
    features = {
        'item1': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'item2': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'score': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'weight': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'type': tf.io.FixedLenFeature(dtype=tf.string, shape=())
    }

    def _input_fn():
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern,
            batch_size,
            features,
            reader=tf.data.TFRecordDataset,
            label_key='score',
            num_epochs=1,
            shuffle=False
        )
        return dataset

    return _input_fn

In [14]:
# Eager mode lets the dataset be iterated directly. The compat.v1 entry point
# is used because `tf.enable_eager_execution` does not exist in TensorFlow 2.x.
tf.compat.v1.enable_eager_execution()

DATA_FILES = "{}/cooc-*".format(COOC_DIR)

# Print the first five batches (five records each): the label, then every
# feature column.
dataset = make_input_fn(DATA_FILES, batch_size=5)()
for i, record in enumerate(dataset.take(5)):
    print()
    print("Record: {}".format(i+1))
    features, target = record
    print("Target: {}".format(target))
    for key in features:
        print("-{}:{}".format(key, features[key]))


Record: 1
Target: [3.847532  4.2651424 2.5682628 1.8438249 1.0896049]
-weight:[1. 1. 1. 1. 1.]
-type:[b'N' b'N' b'N' b'N' b'N']
-item1:[b'horribly' b'foot' b'expectations' b'keeping' b'mess']
-item2:[b'laughable' b'li' b'overall' b'police' b'past']

Record: 2
Target: [1.8474944  1.3184849  0.59754634 3.7067454  2.2249036 ]
-weight:[1.        1.        0.4472136 1.        1.       ]
-type:[b'N' b'N' b'P' b'N' b'N']
-item1:[b'sequel' b'passes' b'bruce' b'sorry' b'rolling']
-item2:[b'yourself' b'world' b'track' b'thankfully' b'starts']

Record: 3
Target: [ 1.9283465  1.844815   3.1941037 -2.214259   2.337698 ]
-weight:[1.        1.        1.        0.4472136 1.       ]
-type:[b'N' b'N' b'N' b'P' b'N']
-item1:[b'richard' b'con' b'memorable' b'its' b'clear']
-item2:[b'sent' b'until' b'warriors' b'tend' b'mob']

Record: 4
Target: [-0.04764934  0.39477628  2.3241832  -0.14011888  4.109247  ]
-weight:[1.         1.         0.70710677 1.4142135  1.        ]
-type:[b'N' b'N' b'P' b'P' b'N']
-it