# Creating Co-occurrence Statistics from Text

This tutorial shows how to compute **co-occurrence statistics** for words in a given text corpus.

The statistic we compute for each co-occurring pair is the [Pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) (PMI), which is used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm for learning embeddings.

<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/text2emb/01-Creating_Co-occurence_Statistics_from_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
# !pip install -r ../requirements.txt

In [None]:
import os
import math
import apache_beam as beam
import tensorflow as tf
from datetime import datetime
from random import random

In [None]:
# Cloud project / region identifiers and the local working directories.
PROJECT_ID = 'ksalama-cloudml'
REGION = 'europe-west1'
WORKSPACE = './workspace'
# Input data and co-occurrence output both live under the workspace root.
DATA_DIR = WORKSPACE + '/data'
COOC_DIR = WORKSPACE + '/cooc'

In [None]:
# Start from an empty output directory: delete any previous results first.
if tf.io.gfile.exists(COOC_DIR):
    print("Removing {} contents...".format(COOC_DIR))
    tf.io.gfile.rmtree(COOC_DIR)

# (Re)create the output directory.
print("Creating output: {}".format(COOC_DIR))
tf.io.gfile.makedirs(COOC_DIR)

### Stop words

In [None]:
# English stop words (all lower-case). The list is handed to the pipeline
# through args['stop_words'] and used by tokenize() to drop these tokens.
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
             "you", "your", "yours", "yourself", "yourselves", "he", "him", 
             "his", "himself", "she", "her", "hers", "herself", "it", "its", 
             "itself", "they", "them", "their", "theirs", "themselves", "what", 
             "which", "who", "whom", "this", "that", "these", "those", "am", "is", 
             "are", "was", "were", "be", "been", "being", "have", "has", "had", 
             "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", 
             "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", 
             "with", "about", "against", "between", "into", "through", "during", "before", 
             "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", 
             "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", 
             "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 
             "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", 
             "can", "will", "just", "don", "should", "now"]

## Pipeline steps

In [None]:
def read_text(pipeline, source_data_location):
    """Read the source text files as lines and drop the (near-)empty ones.

    Args:
        pipeline: Beam pipeline the read transform is attached to.
        source_data_location: file pattern passed to ``beam.io.ReadFromText``.

    Returns:
        PCollection of lines whose length is greater than 2.
    """

    def _has_content(text_line):
        # Lines of at most two characters are treated as empty.
        return len(text_line) > 2

    raw_lines = pipeline | 'Read from files' >> beam.io.ReadFromText(
        file_pattern=source_data_location)
    return raw_lines | 'Skip empty lines' >> beam.Filter(_has_content)


def tokenize(lines, min_sentence_length, stop_words):
    """Turn text lines into lists of cleaned, lower-cased word tokens.

    Each line has its ASCII punctuation removed, is stripped and lower-cased,
    and is split on whitespace. Tokens of two characters or fewer, tokens made
    only of digits, and stop words are discarded. Lines left with fewer than
    ``min_sentence_length`` tokens are dropped.

    Args:
        lines: PCollection of text lines.
        min_sentence_length: minimum number of tokens a line must keep.
        stop_words: iterable of words to remove from every line.

    Returns:
        PCollection of token lists, one per surviving line.
    """

    import string

    # Built once instead of once per character of every line.
    punctuation = frozenset(string.punctuation)

    def _tokenize(line, stop_words):
        # Set membership removes *every* occurrence of a stop word;
        # list.remove() only deleted the first one, so repeated stop words
        # used to leak into the output.
        excluded = frozenset(stop_words)
        cleaned = ''.join(ch for ch in line if ch not in punctuation).strip().lower()
        return [
            token for token in cleaned.split()
            if len(token) > 2 and not token.isdigit() and token not in excluded
        ]

    def _valid(tokens, min_sentence_length):
        return len(tokens) >= min_sentence_length

    line_words = (
        lines
        | 'Tokenize lines' >> beam.Map(_tokenize, stop_words)
        | 'Filter invalid lines' >> beam.Filter(_valid, min_sentence_length)
    )
    return line_words


def compute_word_frequency(line_words, min_freq):
    """Count how often each word occurs and keep only the frequent ones.

    Args:
        line_words: PCollection of token lists (one list per line).
        min_freq: minimum total count a word needs in order to be kept.

    Returns:
        PCollection of ``(word, count)`` with ``count >= min_freq``.
    """

    def _per_line_counts(tokens):
        # Pre-aggregate inside each line so fewer records reach the combiner.
        from collections import Counter
        return list(Counter(tokens).items())

    def _is_frequent(word_and_count):
        return word_and_count[1] >= min_freq

    per_line = line_words | 'Word count per line' >> beam.FlatMap(_per_line_counts)
    totals = per_line | 'Compute word frequency' >> beam.CombinePerKey(sum)
    return totals | 'Filter infrequent words' >> beam.Filter(_is_frequent)


def compute_cooccurrence(line_words, window_size):
    """Compute distance-weighted co-occurrence counts for word pairs.

    Within every line, each word is paired with the words that follow it at
    offsets ``1..window_size``. A pair at offset ``d`` contributes ``1 / d``.
    Pairs are stored with the smaller word first, identical words are skipped,
    and contributions are summed over all lines.

    Args:
        line_words: PCollection of token lists (one list per line).
        window_size: maximum offset between two words of a pair.

    Returns:
        PCollection of ``(item_1, item_2, cooc)`` tuples.
    """

    from collections import defaultdict

    def _line_coocs(tokens, window_size):
        weights = defaultdict(float)
        token_count = len(tokens)

        for position, current in enumerate(tokens):
            # Largest offset that respects both the window and the line end.
            last_offset = min(window_size, token_count - position - 1)
            for offset in range(1, last_offset + 1):
                other = tokens[position + offset]
                if current == other:
                    continue
                # Order the pair so (a, b) and (b, a) share one key.
                weights[tuple(sorted((current, other)))] += 1.0 / offset

        return list(weights.items())

    def _flatten_key(keyed):
        (first, second), cooc = keyed
        return (first, second, cooc)

    per_line = line_words | 'Extract cooccurrence from line' >> beam.FlatMap(
        _line_coocs, window_size)
    summed = per_line | 'Aggregate cooccurrences' >> beam.CombinePerKey(sum)
    return summed | 'Format cooc' >> beam.Map(_flatten_key)

def compute_total_cooc(cooccurrence):
    """Sum the co-occurrence values of all pairs into one global total.

    Args:
        cooccurrence: PCollection of ``(item_1, item_2, cooc)`` tuples.

    Returns:
        PCollection holding the single summed value.
    """

    def _cooc_of(record):
        # The co-occurrence value is the third field of the record.
        return record[2]

    values = cooccurrence | "Get cooc values" >> beam.Map(_cooc_of)
    return values | "Sum cooc values" >> beam.CombineGlobally(sum)

def join_with_item_frequency(cooccurrence, item_frequency):
    """Attach the frequency of both items to every pair.

    The join runs in two passes: pairs are first keyed by item 1 and
    co-grouped with ``item_frequency``, then re-keyed by item 2 and co-grouped
    again. After each co-group only keys with exactly one frequency entry are
    kept, so pairs whose item is missing from ``item_frequency`` are dropped.

    Args:
        cooccurrence: PCollection of ``(item_1, item_2, cooc)`` tuples.
        item_frequency: PCollection of ``(item, frequency)`` tuples.

    Returns:
        PCollection of ``(item_1, item_2, cooc, freq_1, freq_2)`` tuples.
    """

    def _key_by_first(record):
        first, second, cooc = record
        return (first, (second, cooc))

    def _key_by_second(record):
        first, second, cooc, first_freq = record
        return (second, (first, cooc, first_freq))

    def _has_single_frequency(record):
        grouped = record[1]
        return len(list(grouped['freq'])) == 1

    def _attach_first_freq(record):
        first, grouped = record
        first_freq = grouped['freq']
        return [
            (first, second, cooc, first_freq[0])
            for second, cooc in grouped['cooc']
        ]

    def _attach_second_freq(record):
        second, grouped = record
        second_freq = grouped['freq']
        return [
            (first, second, cooc, first_freq, second_freq[0])
            for first, cooc, first_freq in grouped['cooc']
        ]

    # Pass 1: join on item 1.
    keyed_by_first = cooccurrence | "Make item 1 as key" >> beam.Map(_key_by_first)
    with_first_freq = (
        {'cooc': keyed_by_first, 'freq': item_frequency}
        | "Join with item 1 frequency" >> beam.CoGroupByKey()
        | "Filter frequent items 1" >> beam.Filter(_has_single_frequency)
        | "Format" >> beam.FlatMap(_attach_first_freq)
    )

    # Pass 2: join on item 2.
    keyed_by_second = with_first_freq | "Make item 2 as key" >> beam.Map(_key_by_second)
    return (
        {'cooc': keyed_by_second, 'freq': item_frequency}
        | "Join with item 2 frequency" >> beam.CoGroupByKey()
        | "Filter frequent items 2" >> beam.Filter(_has_single_frequency)
        | "Reformat" >> beam.FlatMap(_attach_second_freq)
    )


def vocabulary(item_frequency):
    """Extract the items (without their counts) from the frequency records.

    Args:
        item_frequency: PCollection of ``(item, frequency)`` tuples.

    Returns:
        PCollection of items.
    """

    def _item_of(record):
        return record[0]

    return item_frequency | "Extract item vocabulary" >> beam.Map(_item_of)

def create_partitions(vocab, num_shards):
    """Split a PCollection into ``num_shards`` partitions by hashing each element.

    Args:
        vocab: PCollection to split.
        num_shards: number of partitions to produce.

    Returns:
        The result of ``beam.Partition``: one PCollection per shard.
    """

    def _partition_fn(item, num_shards):
        import zlib
        # The built-in hash() of strings is salted per interpreter process, so
        # the same item could land in different shards on different runs or
        # workers. A CRC of the item's repr gives a reproducible assignment.
        digest = zlib.crc32(repr(item).encode('utf-8')) & 0xffffffff
        return digest % num_shards

    partitions = (
        vocab
        | "Parition items" >> beam.Partition(_partition_fn, num_shards)
    )

    return partitions

    
def create_top_pairs(index, item_frequency, top_count):
    """Generate all pairs among the most frequent items, with cooc set to 0.

    Args:
        index: number used only to make the transform labels unique.
        item_frequency: PCollection of ``(item, frequency)`` tuples.
        top_count: how many top items to take.

    Returns:
        PCollection of ``(item_1, item_2, 0)`` with the smaller item first.
    """

    # NOTE(review): the comparator is passed as the second positional argument
    # of TopCombineFn; confirm that the installed Beam version still accepts a
    # two-argument "less than" callable there.
    def _less_frequent(left, right):
        return left[1] < right[1]

    def _all_pairs(top_items):
        ranked = list(top_items)
        return [
            (min(a[0], b[0]), max(a[0], b[0]), 0)
            for position, a in enumerate(ranked)
            for b in ranked[position + 1:]
        ]

    top_items = (
        item_frequency
        | "Get top items - shard {}".format(index) >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(top_count, _less_frequent))
    )
    return top_items | "Generate item pairs - shard {}".format(index) >> beam.FlatMap(_all_pairs)

def generate_and_union_pairs(partitions, top_count):
    """Build top-item pairs inside every partition and union the results.

    Args:
        partitions: iterable of PCollections of ``(item, frequency)`` tuples.
        top_count: number of top items to pair up within each partition.

    Returns:
        One PCollection containing the pairs of all partitions.
    """
    # Shard numbers start at 1 so their labels differ from a call with index 0.
    per_shard = [
        create_top_pairs(shard_number, shard, top_count)
        for shard_number, shard in enumerate(partitions, 1)
    ]
    return per_shard | "Union item pairs" >> beam.Flatten()
    
def merge_pairs(pairs):
    """Union several pair collections and de-duplicate them by pair.

    When the same ``(item_1, item_2)`` appears more than once, the largest
    cooc value is kept. In this file generated pairs carry cooc 0, so an
    observed co-occurrence wins over its placeholder.

    Args:
        pairs: iterable of PCollections of ``(item_1, item_2, cooc)`` tuples.

    Returns:
        PCollection of unique ``(item_1, item_2, cooc)`` tuples.
    """

    def _key_by_pair(record):
        first, second, cooc = record
        return ((first, second), cooc)

    def _keep_largest(grouped):
        (first, second), coocs = grouped
        return (first, second, max(coocs))

    merged = pairs | "Merge item pairs" >> beam.Flatten()
    keyed = merged | "Use pair as key" >> beam.Map(_key_by_pair)
    grouped = keyed | "Group pairs by key" >> beam.GroupByKey()
    return grouped | "Process pairs" >> beam.Map(_keep_largest)


def compute_score(data, total):
    """Compute the PMI score, weight and sample type of every pair.

    For a pair with ``cooc > 0`` (type ``'P'``):
    ``pmi = log(cooc) - log(freq1) - log(freq2) + log(total)`` and
    ``weight = sqrt(cooc)``. For a pair with no co-occurrence (type ``'N'``)
    the same formula is used with ``log(1)`` in place of ``log(cooc)``, and
    the weight is 1. Score and weight are rounded to 5 decimals.

    Args:
        data: PCollection of ``(item_1, item_2, cooc, freq_1, freq_2)``.
        total: single-element PCollection, passed as a singleton side input.

    Returns:
        PCollection of ``(item_1, item_2, pmi, weight, type)`` tuples.
    """

    def _score(record, total):
        import math
        first, second, cooc, first_freq, second_freq = record

        observed = cooc > 0
        log_cooc = math.log(cooc) if observed else math.log(1)
        pmi = log_cooc - math.log(first_freq) - math.log(second_freq) + math.log(total)
        weight = math.sqrt(cooc) if observed else 1
        sample_type = 'P' if observed else 'N'

        return (first, second, round(pmi, 5), round(weight, 5), sample_type)

    total_side_input = beam.pvalue.AsSingleton(total)
    return data | "Compute pairewise mutual infromation" >> beam.Map(_score, total_side_input)

def get_info(stats):
    """Summarise the dataset as text lines.

    Produces the number of records per sample type plus the minimum and
    maximum score.

    Args:
        stats: PCollection of ``(item_1, item_2, score, weight, type)``.

    Returns:
        PCollection of strings: ``'<type>: <count>'``, ``'min: <value>'`` and
        ``'max: <value>'``.
    """

    def _type_with_one(record):
        return (record[4], 1)

    def _score_of(record):
        return record[2]

    def _format_count(entry):
        return '{}: {}'.format(entry[0], entry[1])

    def _format_min(value):
        return 'min: {}'.format(value)

    def _format_max(value):
        return 'max: {}'.format(value)

    counts = (
        stats
        | "Group by record type" >> beam.Map(_type_with_one)
        | "Count records" >> beam.CombinePerKey(sum)
        | "Fromat counts" >> beam.Map(_format_count)
    )

    scores = stats | "Get scores" >> beam.Map(_score_of)

    lowest = (
        scores
        | "Get min score" >> beam.CombineGlobally(min).without_defaults()
        | "Format min score" >> beam.Map(_format_min)
    )

    highest = (
        scores
        | "Get max score" >> beam.CombineGlobally(max).without_defaults()
        | "Format max score" >> beam.Map(_format_max)
    )

    return (counts, lowest, highest) | "Combine info" >> beam.Flatten()
    
    
def write_debug(data, sink_data_location):
    """Dump a PCollection as text under ``<sink_data_location>/debug``.

    Args:
        data: PCollection to write.
        sink_data_location: output directory.
    """
    debug_prefix = sink_data_location + "/debug"
    data | 'Write debug' >> beam.io.WriteToText(file_path_prefix=debug_prefix)
    

def write_log(info, sink_data_location):
    """Write the summary lines to the single file ``<sink_data_location>/info.log``.

    Args:
        info: PCollection of text lines.
        sink_data_location: output directory.
    """
    log_sink = beam.io.WriteToText(
        file_path_prefix=sink_data_location + "/info",
        file_name_suffix=".log",
        # One shard and an empty template give a plain, unnumbered file name.
        shard_name_template='',
        num_shards=1,
    )
    info | 'Write logs' >> log_sink

def write_vocab(vocab, sink_data_location):
    """Write the vocabulary to the single file ``<sink_data_location>/vocab.txt``.

    Args:
        vocab: PCollection of items, written one per line.
        sink_data_location: output directory.
    """
    vocab_sink = beam.io.WriteToText(
        file_path_prefix=sink_data_location + "/vocab",
        file_name_suffix=".txt",
        # One shard and an empty template give a plain, unnumbered file name.
        shard_name_template='',
        num_shards=1,
    )
    vocab | 'Write vocabulary file' >> vocab_sink
    

def write_to_tfrecords(stats, sink_data_location):
    """Serialise the pair statistics as tf.Example records in TFRecord files.

    Files are written with the prefix ``<sink_data_location>/cooc`` and the
    suffix ``.tfrecords``.

    Args:
        stats: PCollection of ``(item_1, item_2, score, weight, type)``.
        sink_data_location: output directory.
    """

    def _bytes_feature(text):
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes(text)]))

    def _float_feature(number):
        return tf.train.Feature(
            float_list=tf.train.FloatList(value=[float(number)]))

    def _to_tf_example(record):
        first, second, score, weight, record_type = record
        feature_map = {
            'item1': _bytes_feature(first),
            'item2': _bytes_feature(second),
            'score': _float_feature(score),
            'weight': _float_feature(weight),
            'type': _bytes_feature(record_type),
        }
        return tf.train.Example(features=tf.train.Features(feature=feature_map))

    def _serialize(example):
        return example.SerializeToString(deterministic=True)

    examples = stats | 'Encode to tf.example' >> beam.Map(_to_tf_example)
    serialized = examples | 'Serialize to string' >> beam.Map(_serialize)
    serialized | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
        file_path_prefix=sink_data_location + "/cooc",
        file_name_suffix='.tfrecords')

## Pipeline

In [None]:
def run_text2cooc_pipeline(args):
    """Build and run the text -> co-occurrence statistics pipeline.

    Writes the vocabulary, the TFRecord files with the pair statistics and a
    summary log under ``args['sink_data_location']``.

    Args:
        args: dict with the keys 'source_data_location',
            'sink_data_location', 'runner', 'stop_words', 'min_freq',
            'num_shards', 'top_count', 'min_sentence_length' and
            'window_size'. The whole dict is also forwarded as keyword
            arguments to ``GoogleCloudOptions``.
    """
    source = args['source_data_location']
    sink = args['sink_data_location']
    runner_name = args['runner']
    excluded_words = args['stop_words']
    min_word_freq = args['min_freq']
    shard_count = args['num_shards']
    top_items = args['top_count']
    min_tokens = args['min_sentence_length']
    window = args['window_size']

    options = beam.options.pipeline_options.GoogleCloudOptions(**args)

    # write_debug(<pcollection>, sink) can be attached to any intermediate
    # collection below to inspect it.
    with beam.Pipeline(runner_name, options=options) as pipeline:

        # Text files -> lines -> token lists.
        lines = read_text(pipeline, source)
        line_words = tokenize(lines, min_tokens, excluded_words)

        # (word, frequency) for the sufficiently frequent words.
        word_frequency = compute_word_frequency(line_words, min_word_freq)

        # Distinct items (vocabulary), written to a file.
        vocab = vocabulary(word_frequency)
        write_vocab(vocab, sink)

        # Pairs (item_1, item_2, 0) among the globally most frequent items.
        top_pairs = create_top_pairs(0, word_frequency, top_items)

        # The same kind of pairs, generated inside each partition and unioned.
        partitions = create_partitions(word_frequency, shard_count)
        top_pairs_per_partition = generate_and_union_pairs(partitions, top_items)

        # Observed co-occurrences (item_1, item_2, cooc).
        cooccurrence = compute_cooccurrence(line_words, window)

        # Merge co-occurring (positive) and generated (negative) pairs.
        all_pairs = merge_pairs((cooccurrence, top_pairs, top_pairs_per_partition))

        # |D| = \sum_{ij} x_{ij}
        total = compute_total_cooc(cooccurrence)

        # (item_1, item_2, cooc, freq_1, freq_2)
        joined = join_with_item_frequency(all_pairs, word_frequency)

        # (item_1, item_2, pmi, weight, type), written as TFRecords.
        stats = compute_score(joined, total)
        write_to_tfrecords(stats, sink)

        # Summary of the created dataset.
        info = get_info(stats)
        write_log(info, sink)
    

## Run 

In [None]:
# Local run: DirectRunner, reading a single file from the local workspace.
runner = 'DirectRunner'
# Timestamped job name, e.g. text2cooc-<yymmdd>-<HHMMSS> (UTC).
job_name = datetime.utcnow().strftime('text2cooc-%y%m%d-%H%M%S')

args = dict(
    job_name=job_name,
    runner=runner,
    source_data_location=DATA_DIR + '/file.txt',
    sink_data_location=COOC_DIR,
    min_freq=5,
    top_count=300,
    num_shards=100,
    min_sentence_length=3,
    window_size=10,
    stop_words=stop_words,
    project=PROJECT_ID,
)

print("Pipeline args are set.")

In [None]:
# Run the pipeline locally and report the wall-clock time it took.
time_start = datetime.utcnow() 
print("Running pipeline...")
run_text2cooc_pipeline(args)
print("Pipeline is done.")
time_end = datetime.utcnow() 
time_elapsed = time_end - time_start
print("Execution elapsed time: {} seconds".format(time_elapsed.total_seconds()))

In [None]:
# List the files in the output directory.
!ls {COOC_DIR}

In [None]:
# Line count of the vocabulary file, followed by its first lines.
!wc -l  {COOC_DIR}/vocab.txt
!head {COOC_DIR}/vocab.txt

In [None]:
# Show the summary log written by the pipeline.
!head {COOC_DIR}/info.log

## [Optional] Run on Cloud Dataflow

In [None]:
# Cloud run: every location now points at the GCS workspace.
PROJECT_ID = 'ksalama-cloudml'
REGION = 'europe-west1'
WORKSPACE = 'gs://ksalama-cloudml/text_workspace'
DATA_DIR = WORKSPACE + '/corpus'
COOC_DIR = WORKSPACE + '/cooc'
STAGING_DIR = WORKSPACE + '/stg'
TEMP_DIR = WORKSPACE + '/tmp'

# Start from an empty output directory.
if tf.io.gfile.exists(COOC_DIR):
    print("Removing {} contents...".format(COOC_DIR))
    tf.io.gfile.rmtree(COOC_DIR)

print("Creating output: {}".format(COOC_DIR))
tf.io.gfile.makedirs(COOC_DIR)

runner = 'DataflowRunner'
# Timestamped job name, e.g. text2cooc-<yymmdd>-<HHMMSS> (UTC).
job_name = datetime.utcnow().strftime('text2cooc-%y%m%d-%H%M%S')

args = {
    'job_name': job_name,
    'runner': runner,
    'source_data_location': DATA_DIR + '/00*.txt',
    'sink_data_location': COOC_DIR,
    'stop_words': stop_words,
    'min_freq': 100,
    'top_count': 300,
    'num_shards': 100,
    'min_sentence_length': 3,
    'window_size': 10,
    'project': PROJECT_ID,
    'region': REGION,
    'staging_location': STAGING_DIR,
    'temp_location': TEMP_DIR,
    'save_main_session': True,
}

print("Pipeline args are set.\n")
print(args)

In [None]:
# Run the pipeline with the Dataflow args and report the wall-clock time.
time_start = datetime.utcnow() 
print("Running pipeline...")
run_text2cooc_pipeline(args)
print("Pipeline is done.")
time_end = datetime.utcnow() 
time_elapsed = time_end - time_start
print("Execution elapsed time: {} seconds".format(time_elapsed.total_seconds()))

In [None]:
# List the objects in the output location.
!gsutil ls {COOC_DIR}/

In [None]:
!rm -r ./workspace/cooc/
!gsutil -m cp gs://ksalama-cloudml/text_workspace/cooc/info.log ./workspace/cooc/info.log
!gsutil -m cp gs://ksalama-cloudml/text_workspace/cooc/vocab.txt ./workspace/cooc/vocab.txt

In [None]:
# Show the copied summary log, then the vocabulary's line count and first lines.
!head  ./workspace/cooc/info.log
!wc -l   ./workspace/cooc/vocab.txt
!head  ./workspace/cooc/vocab.txt

## Read TFRecords using tf.data APIs

In [None]:
# Copy the TFRecord file(s) matching cooc-00000-* into the local output directory.
!gsutil -m cp gs://ksalama-cloudml/text_workspace/cooc/cooc-00000-* ./workspace/cooc/

In [None]:
def make_input_fn(file_pattern, batch_size):
    """Create an input function that reads the co-occurrence TFRecords.

    Args:
        file_pattern: file pattern of the TFRecord files to read.
        batch_size: number of records per batch.

    Returns:
        A zero-argument function returning a dataset of feature dicts with
        the keys 'item1', 'item2', 'score', 'weight' and 'type'. The dataset
        is shuffled and covers the data once.
    """

    # tf.io.FixedLenFeature replaces the TF1-only top-level
    # tf.FixedLenFeature and matches the tf.io.* namespace used elsewhere in
    # this file.
    features = {
        'item1': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'item2': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'score': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'weight': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'type': tf.io.FixedLenFeature(dtype=tf.string, shape=())
    }

    def _input_fn():
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern,
            batch_size,
            features,
            reader=tf.data.TFRecordDataset,
            label_key=None,
            num_epochs=1,
            shuffle=True
        )
        return dataset

    return _input_fn

In [None]:
# Eager execution lets the dataset be iterated directly in Python. The
# compat.v1 alias is used because the top-level tf.enable_eager_execution
# name is not part of the TensorFlow 2.x API.
tf.compat.v1.enable_eager_execution()

DATA_FILES = "{}/cooc-*".format(COOC_DIR)

# Print the features of the first five batches (five records each).
dataset = make_input_fn(DATA_FILES, batch_size=5)()
for i, features in enumerate(dataset.take(5)):
    print()
    print("Record {}:".format(i+1))
    for key in features:
        print("-{}:{}".format(key, features[key]))