In [1]:
import nltk



In [2]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re

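### A structured and much larger corpus is ideal to get the most out of Word2Vec. The short excerpt used below is only a toy example, so the similarity scores it produces should not be read as meaningful.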

In [3]:
# Toy training text: one paragraph, stored as a single string with no line
# breaks. Adjacent string literals inside the parentheses are concatenated by
# Python, so the value is one continuous string; every piece except the last
# ends with the single space that separates it from the next sentence.
corpus = (
    "D for Definition turns misguided common-sense upside down and introduces the rules and objectives of the new game. "
    "It replaces self-defeating assumptions and explains concepts such as relative wealth and eustress. "
    "This section explains the overall lifestyle design recipe—the fundamentals—before we add the three ingredients. "
    "E for Elimination kills the obsolete notion of time management once and for all. "
    "It shows exactly how I used the words of an often-forgotten Italian economist to turn 12- hour days into two-hour days in 48 hours. "
    "Increase your per-hour results ten times or more with counterintuitive NR techniques for cultivating selective ignorance, developing a low-information diet, and otherwise ignoring the unimportant. "
    "This section provides the first of the three luxury lifestyle design ingredients: time. "
    "A for Automation puts cash flow on autopilot using geographic arbitrage, outsourcing, and rules of nondecision. "
    "From bracketing to the routines of ultra-successful NR, it’s all here. "
    "This section provides the second ingredient of luxury lifestyle design: income. "
    "L for Liberation is the mobile manifesto for the globally inclined. "
    "The concept of mini-retirements is introduced, as are the means for flawless remote control and escaping the boss. "
    "Liberation is not about cheap travel; it is about forever breaking the bonds that confine you to a single location. "
    "This section delivers the third and final ingredient for luxury lifestyle design: mobility. "
    "I should note that most bosses are less than pleased if you spend one hour in the office each day, and employees should therefore read the steps in the entrepreneurially minded DEAL order but implement them as DELA. "
    "If you decide to remain in your current job, it is necessary to create freedom of location before you cut your work hours by 80%. "
    "Even if you have never considered becoming an entrepreneur in the modern sense, the DEAL process will turn you into an entrepreneur in the purer sense as first coined by French economist J. B. Say - one who shifts economic resources out of an area of lower and into an area of higher yield."
)

### Pre-Processing

In [4]:
# Split the raw paragraph into a list of sentence strings.
# NOTE(review): sent_tokenize relies on NLTK's Punkt tokenizer data
# ('punkt' / 'punkt_tab' depending on the NLTK version); no nltk.download(...)
# call is visible in this notebook, so a fresh environment may need one first.
sentences = nltk.sent_tokenize(corpus)

In [5]:
# Container for the cleaned sentences (one normalized string per sentence).
contents = []

In [6]:
# Normalize every sentence: replace anything that is not an ASCII letter with
# a space, lowercase, and collapse runs of whitespace into single spaces.
# The list is rebuilt from `sentences` on every run (instead of appending to
# an existing list), so re-executing this cell never duplicates entries.
contents = [
    ' '.join(re.sub('[^a-zA-Z]', ' ', sentence).lower().split())
    for sentence in sentences
]
    

In [7]:
# Turn each cleaned sentence string into its list of word tokens.
# NOTE: `contents` is rebound from a list of strings to a list of token lists,
# so this cell is meant to run once per fresh pass through the notebook.
contents = list(map(nltk.word_tokenize, contents))

In [8]:
# Drop English stop words from every tokenized sentence.
# The stop-word list is fetched once and stored in a set: the previous version
# called stopwords.words('english') again for every single token and did a
# linear list search each time.
english_stopwords = set(stopwords.words('english'))
contents = [
    [word for word in tokens if word not in english_stopwords]
    for tokens in contents
]

In [10]:
# Peek at the first sentence after tokenization and stop-word removal.
contents[:1]

[['definition',
  'turns',
  'misguided',
  'common',
  'sense',
  'upside',
  'introduces',
  'rules',
  'objectives',
  'new',
  'game']]

### Modelling

In [11]:
# Train Word2Vec on the tokenized, stop-word-filtered sentences.
#   min_count=1     keep every token; the corpus is far too small to prune rare words.
#   vector_size=100 gensim's default embedding width, spelled out because later
#                   cells inspect the 100-dimensional vectors.
#   seed=1          gensim's default seed, made explicit.
#   workers=1       single training thread; gensim documents that this is required
#                   (together with a fixed seed) for a deterministic run, since
#                   multi-threaded training introduces ordering jitter.
# NOTE: reproducibility across interpreter launches additionally needs a fixed
# PYTHONHASHSEED, per the gensim documentation.
model = Word2Vec(contents, min_count=1, vector_size=100, seed=1, workers=1)

In [19]:
# With min_count=1 no token is discarded, so this counts every distinct word
# that survived cleaning and stop-word removal.
len(model.wv.index_to_key) # Size of Vocabulary



162

In [20]:
# Look up the learned embedding for the token 'work'.
vector = model.wv['work']

In [21]:
vector # Each word is embedded as a 100-dimensional vector (gensim's default vector_size)

array([ 0.00249443,  0.005998  , -0.0096834 , -0.001436  ,  0.00511557,
        0.00711384,  0.00688231,  0.00261018,  0.00684766, -0.0088863 ,
        0.00538482, -0.00250233,  0.0037569 , -0.00961233, -0.00902436,
        0.00498293, -0.00117535,  0.00740738, -0.00934754,  0.00988288,
       -0.00594907,  0.0005359 ,  0.00461876, -0.00217725,  0.00733888,
        0.00047434, -0.00292022, -0.00115797, -0.0094634 , -0.0041403 ,
       -0.00395264,  0.00819757, -0.00997359, -0.0021321 ,  0.00739047,
        0.0079375 , -0.00850597,  0.00132996, -0.00932409,  0.00691463,
        0.00957812,  0.00330149, -0.00936762, -0.00060145, -0.00176938,
        0.00432783,  0.0069582 ,  0.00450644, -0.0092511 ,  0.00739412,
        0.00273037, -0.0007884 , -0.00289593,  0.00059524, -0.005703  ,
        0.00528487, -0.00907622,  0.00029061,  0.00231306,  0.00241561,
        0.00910039, -0.005272  ,  0.00964045,  0.0095163 , -0.00717658,
        0.00274492,  0.00967668,  0.00303848, -0.00220403,  0.00

### Semantics

In [22]:
# Query the trained vectors for the nearest neighbours of two probe words;
# each result is a list of (word, similarity score) pairs.
workx, freedomx = (
    model.wv.most_similar(term) for term in ('work', 'freedom')
)

In [23]:
# Show the (word, similarity score) pairs returned for 'work'.
workx

[('french', 0.2453354150056839),
 ('modern', 0.23271015286445618),
 ('low', 0.23143276572227478),
 ('entrepreneurially', 0.18203189969062805),
 ('introduces', 0.17594988644123077),
 ('say', 0.16798441112041473),
 ('obsolete', 0.16628308594226837),
 ('ingredient', 0.16496604681015015),
 ('section', 0.15149471163749695),
 ('lower', 0.14939026534557343)]

In [24]:
# Show the (word, similarity score) pairs returned for 'freedom'.
freedomx

[('eustress', 0.2815510332584381),
 ('modern', 0.2375253140926361),
 ('pleased', 0.2141195684671402),
 ('defeating', 0.20186085999011993),
 ('ingredient', 0.20124268531799316),
 ('shifts', 0.19086191058158875),
 ('economic', 0.18483036756515503),
 ('becoming', 0.18419398367404938),
 ('yield', 0.17575308680534363),
 ('ultra', 0.16725118458271027)]