In [1]:
import collections
import math
import os
import random

import numpy as np
import tensorflow as tf

In [2]:
def read_data(filename):
    with open(filename, 'r') as _f:
        lines = _f.readlines()
    return ' '.join(lines)



In [3]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [4]:
words = (clean(read_data('data/out_defs.txt')) + clean(read_data('data/out_rand.txt'))).split()

In [5]:
len(words)

4505

In [6]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50


def build_dataset(words, vocabulary_size):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)

In [7]:
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Most common words (+UNK) [['UNK', 3261], ('let', 96), ('bundle', 44), ('complex', 44), ('module', 38)]
Sample data [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']


In [53]:
data_index = 0


# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


0 UNK -> 0 UNK
0 UNK -> 0 UNK
0 UNK -> 0 UNK
0 UNK -> 0 UNK
0 UNK -> 0 UNK
0 UNK -> 0 UNK
0 UNK -> 0 UNK
0 UNK -> 0 UNK


In [8]:
sweet = '''
Where it began, I can't begin to know when
But then I know it's growing strong
Oh, wasn't the spring, whooo
And spring became the summer
Who'd believe you'd come along

Hands, touching hands, reaching out
Touching me, touching you
Oh, sweet Caroline
Good times never seem so good
I've been inclined to believe it never would

And now I, I look at the night, whooo
And it don't seem so lonely
We fill it up with only two, oh
And when I hurt
Hurting runs off my shoulder
How can I hurt when holding you

Oh, one, touching one, reaching out
Touching me, touching you
Oh, sweet Caroline
Good times never seem so good
Oh I've been inclined to believe it never would

Ohhh, sweet Caroline, good times never seem so good
'''
data, count, dictionary, reverse_dictionary = build_dataset(clean(sweet).split(), vocabulary_size)

In [9]:
count

[['UNK', 0],
 ('oh', 6),
 ('touching', 6),
 ('good', 6),
 ('never', 5),
 ('seem', 4),
 ('believe', 3),
 ('sweet', 3),
 ('caroline', 3),
 ('time', 3),
 ('know', 2),
 ('spring', 2),
 ('whooo', 2),
 ('hand', 2),
 ('reaching', 2),
 ('me', 2),
 ('ive', 2),
 ('inclined', 2),
 ('would', 2),
 ('hurt', 2),
 ('one', 2),
 ('began', 1),
 ('cant', 1),
 ('begin', 1),
 ('growing', 1),
 ('strong', 1),
 ('became', 1),
 ('summer', 1),
 ('whod', 1),
 ('come', 1),
 ('along', 1),
 ('i', 1),
 ('look', 1),
 ('night', 1),
 ('lonely', 1),
 ('fill', 1),
 ('two', 1),
 ('hurting', 1),
 ('run', 1),
 ('shoulder', 1),
 ('holding', 1),
 ('ohhh', 1)]

In [12]:
import math

def cooc(_count):
    for a in _count:
        for b in _count:
            count = 0

            for x in clean(sweet).split():
                if a != b:
                    if a in x and b in x:
                        count += 1

                else:
                    n = x.count(a)
                    if n >= 2:
                        count += math.factorial(n)/math.factorial(n - 2)/2

            print('{} x {} = {}'.format(a, b, count))
    return 1

In [13]:
cooc(count)

TypeError: must be str, not list