In [2]:
import collections
import datetime
import re
from hazm import *

FILE_NAME = 'data/fas_newscrawl_2017_100K/fas_newscrawl_2017_100K-sentences.txt'



def read_input(input_file):
    with open(input_file, encoding='utf-8') as f:
        content = f.readlines()
        return content


begin = datetime.datetime.now()
raw_content = read_input(FILE_NAME)
end = datetime.datetime.now()
print(end-begin)

0:00:00.077271


In [3]:
normalizer = Normalizer()

def my_normalizer(content):
    for i in range(len(content)):
        sentence = content[i]
        # remove number of sentence and \t
        sentence = sentence[sentence.index('\t')+1:]
        # hazm normalizer
        normalizer.normalize(sentence)
        # my tokenizer
        sentence = list(filter(lambda s: s != '', re.compile('[ /\'"،؛ء–«»():\-_.$,\[\]!؟\n\t]').split(sentence)))
        content[i] = sentence
#     content = [token for sentence in content for token in sentence]
    return content

begin = datetime.datetime.now()
document = my_normalizer(raw_content)
end = datetime.datetime.now()
print(end-begin)

0:00:08.335652


In [4]:
print(document[111])

['13', 'دوره', 'بازپرداخت', 'تسهیلات', 'موضوع', 'این', 'تصویب', 'نامه', 'حداکثر', 'یک', 'سال', 'تعیین', 'می', 'گردد']


In [5]:
from collections import Counter
vocab = Counter()
for i in range(len(document)):
    vocab.update(document[i])
print(len(vocab))
print(vocab['دانشگاه'])

107618
1618


In [10]:
import gensim

model = gensim.models.Word2Vec(
    document,
    size=300,
    window=5,
    min_count=2,
    sg=0,
    workers=10)

word_vectors = model.wv

In [11]:
model.wv.most_similar_cosmul(positive=['تهران', 'فرانسه'], negative=['ایران'])

[('\u202a۸۴', 1.3345073461532593),
 ('دادگستري', 1.2980746030807495),
 ('نماينده', 1.2795106172561646),
 ('اعضاي', 1.276827335357666),
 ('جلسه\u200cي', 1.2642039060592651),
 ('كميته', 1.2526651620864868),
 ('نمايشگاه', 1.2411503791809082),
 ('مركز', 1.2402410507202148),
 ('مطبوعاتي', 1.2330622673034668),
 ('عواد', 1.2327313423156738)]

In [12]:
print(len(word_vectors.vocab))

47213


In [3]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = 'data/text8.zip'
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary

vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']
[5234, 3081, 12, 6, 195, 2, 3134]


In [4]:
window_size = 3
vector_dim = 300
epochs = 1000000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [5]:
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

[[3845, 462], [1636, 296], [3621, 181], [2586, 14], [1713, 1384], [3820, 2670], [3166, 1816], [125, 5], [4116, 6], [6951, 1]] [1, 1, 1, 1, 1, 0, 0, 1, 1, 1]


In [6]:
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

In [7]:
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

In [12]:
from keras.layers import Merge
similarity = Merge([target, context], mode='cos', dot_axes=0)

ImportError: cannot import name 'Merge'