In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
# Download NLTK's "punkt" package; the sentence tokenizer loaded later in this
# notebook ('tokenizers/punkt/english.pickle') comes from it.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/divesh_pandey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Getting and cleaning data

In [6]:
# Gather every corpus file whose name ends in "txt", then order the paths
# lexicographically so files are always read in the same sequence.
hindi_filenames = glob.glob("../data/hin_corp_unicode/*txt")
hindi_filenames.sort()

In [7]:
# Build the raw corpus by concatenating the body of every corpus file.
#
# Each file's text is collected in a list and joined once at the end instead of
# growing `corpus_raw` with `+=`, which re-copies the whole accumulated corpus
# for every file. A running character count keeps the progress messages
# identical to what the incremental version printed.
#
# NOTE(review): files are joined without a separator, so if a file does not end
# with a newline its last word will fuse with the first word of the next file
# -- confirm the corpus files are newline-terminated.
corpus_chunks = []
corpus_length = 0
for file_name in hindi_filenames:
    print("Reading '{0}'...".format(file_name))
    with codecs.open(file_name, "r", "utf-8") as f:
        # The first two lines of each file are not useful in the corpus;
        # read and discard them.
        f.readline()
        f.readline()
        chunk = f.read()
    corpus_chunks.append(chunk)
    corpus_length += len(chunk)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(corpus_chunks)
# Release the per-file copies now that the joined corpus exists.
del corpus_chunks

Reading '../data/hin_corp_unicode/1000_utf.txt'...
Corpus is now 15764 characters long

Reading '../data/hin_corp_unicode/1001_utf.txt'...
Corpus is now 33663 characters long

Reading '../data/hin_corp_unicode/1002_utf.txt'...
Corpus is now 48153 characters long

Reading '../data/hin_corp_unicode/1003_utf.txt'...
Corpus is now 63362 characters long

Reading '../data/hin_corp_unicode/1004_utf.txt'...
Corpus is now 77899 characters long

Reading '../data/hin_corp_unicode/1005_utf.txt'...
Corpus is now 95324 characters long

Reading '../data/hin_corp_unicode/1006_utf.txt'...
Corpus is now 106578 characters long

Reading '../data/hin_corp_unicode/1007_utf.txt'...
Corpus is now 118142 characters long

Reading '../data/hin_corp_unicode/1008_utf.txt'...
Corpus is now 132864 characters long

Reading '../data/hin_corp_unicode/1009_utf.txt'...
Corpus is now 144821 characters long

Reading '../data/hin_corp_unicode/100_utf.txt'...
Corpus is now 154593 characters long

Reading '../data/hin_corp_un

In [8]:
# Load the pre-trained Punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): this is the *English* Punkt model, applied here to Hindi text --
# confirm it splits on the sentence terminators this corpus actually uses
# (e.g. the danda "।" as well as ".").
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
# Split the whole raw corpus into sentence strings with the tokenizer loaded above.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [10]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into its whitespace-separated words.

    Full stops are treated as word separators; no other punctuation is
    stripped, so it stays attached to the neighbouring word.

    Args:
        raw: The sentence text to tokenize.

    Returns:
        list: The words of ``raw`` in their original order; an empty list
        when ``raw`` contains no words.
    """
    # split() with no arguments already breaks on "\r", "\n" and every other
    # whitespace character, so only "." has to be turned into a separator.
    return raw.replace(".", " ").split()

# Turn every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [11]:
# Total number of word tokens across all tokenized sentences.
token_count = sum(map(len, sentences))
print("The Hindi corpus contains {0:,} tokens".format(token_count))

The Hindi corpus contains 2,840,476 tokens


In [12]:
# Sanity check: display the word list produced for the first sentence.
sentences[0]

[u'\u0915\u093f\u0938\u094d\u092e\u094b\u0902',
 u'\u0915\u0947',
 u'\u0935\u093f\u0915\u093e\u0938',
 u'\u092e\u0947\u0902',
 u'\u0938\u0902\u0915\u0940\u0930\u094d\u0923',
 u'\u091c\u0940\u0928',
 u'\u0906\u0927\u093e\u0930\u094b\u0902',
 u'\u0915\u0947',
 u'\u092c\u095d\u0924\u0947',
 u'\u0909\u092a\u092f\u094b\u0917',
 u'\u0938\u0947',
 u'\u090f\u0915',
 u'\u0914\u0930',
 u'\u0939\u093e\u0928\u093f',
 u'\u0939\u0941\u0908']

##  Word Vectors

In [13]:
# Hyperparameters for the Word2Vec model defined in the next cell.

# Dimensionality of the resulting word vectors (passed as `size`).
# NOTE(review): more dimensions can presumably capture finer distinctions but
# cost more memory/compute and need more data -- confirm 50 suits this corpus.
num_features = 50
# Minimum word count threshold (passed as `min_count`).
min_word_count = 3

# Number of threads to run in parallel: one per CPU reported by the OS.
num_threads = multiprocessing.cpu_count()

# Context window length (passed as `window`).
context_size = 8

# Downsample setting for frequent words (passed as `sample`).
# NOTE(review): the earlier note here recommended values in 0 - 1e-5, yet 1e-3
# is what is actually used -- confirm which is intended.
downsampling = 1e-3

# Seed for the random number generator, to make the results reproducible.
# NOTE(review): training runs with cpu_count() worker threads; verify whether
# results are fully reproducible with more than one worker.
seed = 1

In [14]:
# Configure the Word2Vec model from the hyperparameters chosen above.
# No sentences are passed here; the vocabulary is built and the model trained
# in the cells that follow.
model = w2v.Word2Vec(
    # Shape of the embeddings and of the training context.
    size=num_features,
    window=context_size,
    # Vocabulary pruning and frequent-word downsampling.
    min_count=min_word_count,
    sample=downsampling,
    # Training algorithm: sg=1 selects skip-gram.
    sg=1,
    # Execution settings.
    workers=num_threads,
    seed=seed,
)

In [15]:
# Build the model's vocabulary from the tokenized sentences before training.
model.build_vocab(sentences)

In [16]:
# Train the model on the tokenized sentences; the call's return value is shown
# as the cell output.
# NOTE(review): newer gensim releases reportedly require explicit
# `total_examples` and `epochs` arguments to train() -- confirm against the
# installed gensim version before re-running this cell.
model.train(sentences)

10730520

In [17]:
# Save the trained model under ../data/ so it can be reloaded later (see the
# "Explore the model" section) without retraining.
model.save(os.path.join("../data/", "hindi_word2Vec_small.w2v"))

## Explore the model

In [None]:
# Reload the model from the file written by the save cell above.
trained_model = w2v.Word2Vec.load(os.path.join("../data/", "hindi_word2Vec_small.w2v"))

In [None]:
# Reduce the word vectors to 2 dimensions with t-SNE so they can be visualized.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
# Take the learned word embeddings themselves (wv.syn0, one row per vocabulary
# index -- the same vectors that `trained_model.wv[word]` returns), not
# `syn1neg`, which holds the negative-sampling output-layer weights.
# Only the first 200 rows are used: embedding every word currently gives a
# memory error.
all_word_vectors_matrix = trained_model.wv.syn0[:200]
# Reduced dimensions: one (x, y) row per input vector, in the same order.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
# One row per plotted word: the word plus its 2-D t-SNE coordinates.
# The `< 200` filter must match the number of vectors that were passed to
# t-SNE, since the vocabulary index is used as the row index into the 2-D matrix.
points = pd.DataFrame(
    [
        (
            word,
            all_word_vectors_matrix_2d[entry.index, 0],
            all_word_vectors_matrix_2d[entry.index, 1],
        )
        for word, entry in trained_model.wv.vocab.items()
        if entry.index < 200
    ],
    columns=["word", "x", "y"],
)

In [None]:
# Look up the trained vector for a single sample Hindi word.
s = trained_model.wv[u"आधार"]