# mahabharatha2Vec

## Imports

In [1]:
from __future__ import absolute_import, division, print_function

In [1]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import codecs

ModuleNotFoundError: No module named 'gensim'

In [63]:
import glob
import multiprocessing
import os
import re

In [65]:
%pylab inline
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [66]:
# downlaoding sentence tokenizer and stopwords
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/harsha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/harsha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Prepare Corpus

**Load books from files**

In [67]:
book_filenames = sorted(glob.glob("data/*.txt"))

In [68]:
print("Found books:")
book_filenames

Found books:


['data/ajaya1.txt', 'data/ajaya2.txt', 'data/asura.txt', 'data/jaya.txt']

**Combine the books into one string**

In [69]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'data/ajaya1.txt'...
Corpus is now 949412 characters long

Reading 'data/ajaya2.txt'...
Corpus is now 1889681 characters long

Reading 'data/asura.txt'...
Corpus is now 2953032 characters long

Reading 'data/jaya.txt'...
Corpus is now 3533460 characters long



**Split the corpus into sentences**

In [70]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [71]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [72]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [73]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [74]:
print(raw_sentences[51])
print(sentence_to_wordlist(raw_sentences[51]))

It	made	me	rush	home	to	revisit	the	Mahabharata,	an	epic
that	has	inspired	countless	writers	over	the	centuries.
['It', 'made', 'me', 'rush', 'home', 'to', 'revisit', 'the', 'Mahabharata', 'an', 'epic', 'that', 'has', 'inspired', 'countless', 'writers', 'over', 'the', 'centuries']


In [75]:
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 605,503 tokens


In [76]:
len(sentences)

43384

## Train to obtain Word Vectors

In [77]:
# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Context window length.
context_size = 7

# Downsample setting for frequent words.
downsampling = 0.5*1e-5

In [78]:
# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Seed for the RNG, to make the results reproducible.
seed = 1

In [79]:
maha2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [80]:
maha2vec.build_vocab(sentences)

In [81]:
print("Word2Vec vocabulary length:", len(maha2vec.wv.vocab))

Word2Vec vocabulary length: 10749


In [82]:
maha2vec.train(sentences,total_examples=maha2vec.corpus_count,epochs=maha2vec.iter)

  """Entry point for launching an IPython kernel.


(512899, 3027515)

In [83]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [84]:
maha2vec.save(os.path.join("trained", "maha2vec.w2v"))

## Explore the trained model.

## Loading the trained model

In [85]:
maha2vec = w2v.Word2Vec.load(os.path.join("trained", "maha2vec.w2v"))

### Explore semantic similarities between book characters

**Words closest to the given word**

In [86]:
maha2vec.most_similar("Karna")

  """Entry point for launching an IPython kernel.


[('Ltd', 0.999782919883728),
 ('Present', 0.9997819662094116),
 ('E', 0.9997773170471191),
 ('THE', 0.9997721910476685),
 ('DHARMA', 0.9997717142105103),
 ('Books', 0.9997711777687073),
 ('unstitched', 0.9997687935829163),
 ('Ancient', 0.9997684955596924),
 ('Penguin', 0.9997639060020447),
 ('OF', 0.9997628927230835)]

In [87]:
maha2vec.most_similar("Arjuna")

  """Entry point for launching an IPython kernel.


[('com', 0.9998261332511902),
 ('E', 0.999822735786438),
 ('Indian', 0.999822199344635),
 ('Ltd', 0.9998149871826172),
 ('translation', 0.9998130798339844),
 ('THE', 0.9998112916946411),
 ('OF', 0.9998096227645874),
 ('DHARMA', 0.9998090267181396),
 ('Group', 0.9998084902763367),
 ('Hindu', 0.9998077154159546)]

In [88]:
maha2vec.most_similar("Suyodhana")

  """Entry point for launching an IPython kernel.


[('THE', 0.9997886419296265),
 ('Penguin', 0.9997876882553101),
 ('used', 0.9997820258140564),
 ('Indian', 0.9997796416282654),
 ('division', 0.9997793436050415),
 ('com', 0.9997788667678833),
 ('DHARMA', 0.9997745752334595),
 ('Ltd', 0.9997737407684326),
 ('ancient', 0.9997716546058655),
 ('Hindu', 0.9997695088386536)]

**Linear relationships between word pairs**

In [89]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = maha2vec.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

### Father Relation

In [90]:
nearest_similarity_cosmul("Dhritarashtra", "Suyodhana", "Arjuna")

Dhritarashtra is related to Suyodhana, as leadstartcorp is related to Arjuna


  after removing the cwd from sys.path.


'leadstartcorp'

### Mother Relation

In [91]:
nearest_similarity_cosmul("Kunti", "Arjuna", "Suyodhana")

Kunti is related to Arjuna, as Penguin is related to Suyodhana


  after removing the cwd from sys.path.


'Penguin'

### Wife Relation

In [92]:
nearest_similarity_cosmul("Subhadra", "Arjuna", "Jayadratha")

Subhadra is related to Arjuna, as GOD is related to Jayadratha


  after removing the cwd from sys.path.


'GOD'

### Sibling Relation

In [93]:
nearest_similarity_cosmul("Balarama", "Krishna", "Nakula")

Balarama is related to Krishna, as Present is related to Nakula


  after removing the cwd from sys.path.


'Present'

In [94]:
nearest_similarity_cosmul("Sahadeva", "Nakula", "Bhima")

Sahadeva is related to Nakula, as IN is related to Bhima


  after removing the cwd from sys.path.


'IN'

### Some of them very wrong....

In [95]:
nearest_similarity_cosmul("Subhadra", "Krishna", "Suyodhana")

Subhadra is related to Krishna, as used is related to Suyodhana


  after removing the cwd from sys.path.


'used'

In [96]:
nearest_similarity_cosmul("Draupadi", "Arjuna", "Suyodhana")

Draupadi is related to Arjuna, as Penguin is related to Suyodhana


  after removing the cwd from sys.path.


'Penguin'

In [49]:
nearest_similarity_cosmul("Karna", "Suyodhana", "Vidhura")

Karna is related to Suyodhana, as cycle is related to Vidhura


  after removing the cwd from sys.path.


'cycle'