In [60]:
import nltk

# Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora. Target audience is the natural language processing (NLP) and information retrieval (IR) community.

from gensim.models import Word2Vec # gensim is a word2vec implementation in python
import re

In [61]:
para = '''
    The changes in the urban landscape go mostly unnoticed compared to that in villages, especially in developing countries. The overall village scene appears with a new look after even minor changes occurring there. The rural look in Bangladesh has undergone radical transformations in the last 3-4 decades. Prior to this period, the villages virtually remained stuck in time, one that does not move.

Even the period shortly after the independence of Bangladesh has not seen much perceptible change in the rural panorama. The leftovers of the past squalor from colonial and neo-colonial rules still pervaded the scene. The rural masses were trapped in extreme poverty, economic hardship and scores of deprivations coupled with deep-seated helplessness. Feuds sparked by irrational acrimony and desperation ruled the roost, so did suspicion and the tendency to become spiteful for petty reasons. The days of these maladies were a reality in the period spanning from the 1970s to the 1980s.

In the days before independence, the chronic poverty in the country's villages comprised a dominant place among the common rural features. To the newer generations in the second decade of the 21st century many earlier rural scenes might seem incredible. During winters these days hardly any poor villager is found shivering in cold due to the dearth of warm clothing. Yet the whole season of winter used to be seen wear away with most of the villagers enduring the bite of cold. People wrapped in thin woolen shawl or wearing a shirt and shoes would be considered lucky or privileged. The elders' common winter wear in those days generally included a worn-out cotton 'chadar' and lungi. Most of them moved barefoot outside their homes, with an earthen pot filled with simmering bran ashes (Ailla) in their lap. Few small children owned a shirt. They would be found covered from ankle to neck with the lungis of their fathers or other elders. The upper end of the improvised winter garment remained tied in a knot at the neck. The thatched or mud-built huts were ramshackle, virtually ineffective in blocking the chilly air.

Winter in most of today's Bangladesh villages is full of colours, not much different from the urban spectacles. Males wearing coats, jackets and pullovers, women covered in fancifully designed shawls are a common view. In many rural areas, makeshift dwellings have been replaced by houses built with corrugated iron sheets. These are interspersed by brick-built buildings.

'''

In [62]:
# clean the para by regualar expression
text = re.sub(r'\[[0-9]*\]', ' ', para)
text = re.sub(r'\s+', ' ', text)
text = text.lower()
text = re.sub(r'\d+', ' ', text)
text = re.sub(r'\s+', ' ', text)

In [63]:

# tokenize the sentences
sentences = nltk.sent_tokenize(text)
words = [nltk.word_tokenize(sentence) for sentence in sentences]


In [64]:
# remove stop words
for i in range(len(words)):
    words[i] = [word for word in words[i] if word not in nltk.corpus.stopwords.words('english')]


In [65]:
# traan the word2vec model
model = Word2Vec(words, min_count=1)

In [81]:
vector = model.wv['independence']
vector

array([ 9.7502070e-03,  8.1926789e-03,  1.2723766e-03,  5.1052626e-03,
        1.4124915e-03, -6.5201335e-03, -1.3933675e-03,  6.5278257e-03,
       -4.6696602e-03, -4.0010950e-03,  4.9233926e-03,  2.6794276e-03,
       -1.8516511e-03, -2.8795451e-03,  5.9808106e-03, -5.7477546e-03,
       -3.2201621e-03, -6.5247174e-03, -4.2460454e-03, -8.6610680e-03,
       -4.4907969e-03, -8.5070319e-03,  1.4108202e-03, -8.6483825e-03,
       -9.9407220e-03, -8.2005644e-03, -6.7757261e-03,  6.6489144e-03,
        3.7544300e-03,  3.4951061e-04, -2.9463517e-03, -7.4375565e-03,
        5.6546956e-04,  4.7266617e-04,  2.0326296e-04,  8.9387683e-04,
        7.8175071e-04, -7.1007147e-05, -7.9806130e-03, -5.9171482e-03,
       -8.3590718e-03, -1.3791053e-03,  1.8082398e-03,  7.4281627e-03,
       -1.9573977e-03, -2.3317384e-03,  9.4608674e-03,  6.6110777e-05,
       -2.3793622e-03,  8.6263614e-03,  2.6878349e-03, -5.3811129e-03,
        6.6169924e-03,  4.5353384e-03, -7.0555797e-03, -2.8444899e-04,
      

In [84]:
similar = model.wv.most_similar('independence', topn=3)

similar

[('irrational', 0.3158555030822754),
 ('spanning', 0.3144017159938812),
 ('leftovers', 0.18732719123363495)]