In [1]:
texts = ["The future king is the prince",
"Daughter is the princess",
"Son is the prince", 
"Only a man can be a king", 
"Only a woman can be a queen", 
"The princess will be a queen",
"Queen and king rule the realm",
"The prince is a strong man",
"The princess is a beautiful woman", 
"The royal family is the king and queen and their children",
"Prince is only a boy now",
"A boy will be a man"]

In [2]:
import re

def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=['the', 'a', 'and', 'is', 'be', 'will']) -> str:
    """
    A method to clean text 
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string

In [28]:
# Defining the window for context
window = 2

# Creating a placeholder for the scanning of the word list
word_lists = []
all_text = []

for text in texts:

    # Cleaning the text
    text = clean_text(text)

    # Appending to the all text list
    all_text += text.split() 

    # Creating a context dictionary
    for i, word in enumerate(text.split()):
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text.split()): 
                word_lists.append([word] + [text.split()[(i + 1 + w)]])
            # Getting the context that is behind by *window* words    
            if i - w - 1 >= 0:
                word_lists.append([word] + [text.split()[(i - w - 1)]])

In [29]:
word_lists

[['future', 'king'],
 ['future', 'prince'],
 ['king', 'prince'],
 ['king', 'future'],
 ['prince', 'king'],
 ['prince', 'future'],
 ['daughter', 'princess'],
 ['princess', 'daughter'],
 ['son', 'prince'],
 ['prince', 'son'],
 ['only', 'man'],
 ['only', 'can'],
 ['man', 'can'],
 ['man', 'only'],
 ['man', 'king'],
 ['can', 'king'],
 ['can', 'man'],
 ['can', 'only'],
 ['king', 'can'],
 ['king', 'man'],
 ['only', 'woman'],
 ['only', 'can'],
 ['woman', 'can'],
 ['woman', 'only'],
 ['woman', 'queen'],
 ['can', 'queen'],
 ['can', 'woman'],
 ['can', 'only'],
 ['queen', 'can'],
 ['queen', 'woman'],
 ['princess', 'queen'],
 ['queen', 'princess'],
 ['queen', 'king'],
 ['queen', 'rule'],
 ['king', 'rule'],
 ['king', 'queen'],
 ['king', 'realm'],
 ['rule', 'realm'],
 ['rule', 'king'],
 ['rule', 'queen'],
 ['realm', 'rule'],
 ['realm', 'king'],
 ['prince', 'strong'],
 ['prince', 'man'],
 ['strong', 'man'],
 ['strong', 'prince'],
 ['man', 'strong'],
 ['man', 'prince'],
 ['princess', 'beautiful'],
 ['p

In [30]:
all_text

['future',
 'king',
 'prince',
 'daughter',
 'princess',
 'son',
 'prince',
 'only',
 'man',
 'can',
 'king',
 'only',
 'woman',
 'can',
 'queen',
 'princess',
 'queen',
 'queen',
 'king',
 'rule',
 'realm',
 'prince',
 'strong',
 'man',
 'princess',
 'beautiful',
 'woman',
 'royal',
 'family',
 'king',
 'queen',
 'their',
 'children',
 'prince',
 'only',
 'boy',
 'now',
 'boy',
 'man']

In [31]:
def create_unique_word_dict(text:list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    return unique_word_dict 

In [45]:
words = list(set(texts.split()))

AttributeError: 'list' object has no attribute 'split'

In [44]:
words

['The prince is a strong man',
 'Queen and king rule the realm',
 'Only a woman can be a queen',
 'The future king is the prince',
 'A boy will be a man',
 'Only a man can be a king',
 'Prince is only a boy now',
 'The princess is a beautiful woman',
 'The princess will be a queen',
 'Son is the prince',
 'Daughter is the princess',
 'The royal family is the king and queen and their children']

In [40]:
create_unique_word_dict(word_lists)

TypeError: unhashable type: 'list'

# Word2Vec in Python with Gensim Library

In [46]:
! pip install beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
! pip install lxml

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
import bs4 as bs
import urllib.request
import re
import nltk

scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scraped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [50]:
# Cleaning the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [56]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)
word2vec

<gensim.models.word2vec.Word2Vec at 0x1eceb222310>

In [62]:
vocabulary = word2vec.wv.key_to_index
print(vocabulary)

{'ai': 0, 'intelligence': 1, 'artificial': 2, 'learning': 3, 'human': 4, 'used': 5, 'research': 6, 'machine': 7, 'use': 8, 'many': 9, 'problems': 10, 'networks': 11, 'also': 12, 'intelligent': 13, 'data': 14, 'systems': 15, 'agent': 16, 'knowledge': 17, 'search': 18, 'world': 19, 'researchers': 20, 'field': 21, 'information': 22, 'neural': 23, 'algorithms': 24, 'computer': 25, 'general': 26, 'logic': 27, 'may': 28, 'symbolic': 29, 'machines': 30, 'would': 31, 'states': 32, 'system': 33, 'mind': 34, 'problem': 35, 'cyber': 36, 'example': 37, 'goal': 38, 'reasoning': 39, 'recognition': 40, 'one': 41, 'using': 42, 'could': 43, 'applications': 44, 'solve': 45, 'risk': 46, 'humans': 47, 'approaches': 48, 'diplomacy': 49, 'goals': 50, 'however': 51, 'technology': 52, 'decision': 53, 'specific': 54, 'computing': 55, 'theory': 56, 'since': 57, 'scientific': 58, 'developed': 59, 'optimization': 60, 'cybersecurity': 61, 'ability': 62, 'language': 63, 'program': 64, 'security': 65, 'mathematical'

In [57]:
v1 = word2vec.wv['artificial']

In [63]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('ai', 0.613765299320221),
 ('used', 0.535531222820282),
 ('artificial', 0.5345665216445923),
 ('search', 0.5322884321212769),
 ('information', 0.4942713975906372),
 ('logic', 0.4893563985824585),
 ('learning', 0.4892669916152954),
 ('networks', 0.48703572154045105),
 ('knowledge', 0.4859626889228821),
 ('use', 0.4855497479438782)]