In [287]:
"""
pip install wikipedia

import wikipedia
from tqdm import tqdm

# Set the language of Wikipedia to English
wikipedia.set_lang('en')

# Fetch Wikipedia articles
articles = []
num_texts = 120

for title in tqdm(wikipedia.random(pages=num_texts), total=num_texts):
    try:
        page = wikipedia.page(title)
        article_content = page.content
        articles.append(article_content)
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.HTTPTimeoutError):
        # Handle exceptions by skipping the current article
        continue

# File path to save the text file
file_path = 'output.txt'

with open(file_path, 'wt', encoding='utf-8') as f:
    for article in articles:
        f.write(article + '\n')

"""

"\npip install wikipedia\n\nimport wikipedia\nfrom tqdm import tqdm\n\n# Set the language of Wikipedia to English\nwikipedia.set_lang('en')\n\n# Fetch Wikipedia articles\narticles = []\nnum_texts = 120\n\nfor title in tqdm(wikipedia.random(pages=num_texts), total=num_texts):\n    try:\n        page = wikipedia.page(title)\n        article_content = page.content\n        articles.append(article_content)\n    except (wikipedia.exceptions.PageError,\n            wikipedia.exceptions.DisambiguationError,\n            wikipedia.exceptions.HTTPTimeoutError):\n        # Handle exceptions by skipping the current article\n        continue\n\n# File path to save the text file\nfile_path = 'output.txt'\n\nwith open(file_path, 'wt', encoding='utf-8') as f:\n    for article in articles:\n        f.write(article + '\n')\n        \n"

In [288]:
# Data exploration

file = open("output.txt",'r')
wiki_corpus = file.readlines()
file.close


print(wiki_corpus[3:5])
print(len(wiki_corpus))

['Bouthaina Shaaban, Syrian politician\n', 'Bouthayna Shaya, Syrian actress and voice actress\n']
3579


In [289]:
for i in range(len(wiki_corpus)):
  wiki_corpus[i] = wiki_corpus[i].lower().replace('\n', '')

print(wiki_corpus[3:5])
print(len(wiki_corpus))

['bouthaina shaaban, syrian politician', 'bouthayna shaya, syrian actress and voice actress']
3579


In [290]:
# Creating tokens

tokens = []

for sentence in wiki_corpus:
  temp = []
  for word in sentence.split():
    temp.append(word)
  tokens.append(temp)

print(tokens[3:5])
print(len(tokens))

[['bouthaina', 'shaaban,', 'syrian', 'politician'], ['bouthayna', 'shaya,', 'syrian', 'actress', 'and', 'voice', 'actress']]
3579


In [291]:
# Finding unique words

vocabulary = []

for sentence in tokens:
  vocabulary.extend(sentence)

print(vocabulary[:5])
print(len(vocabulary))

vocabulary = list(set(vocabulary))
print(vocabulary[:5])
print(len(vocabulary))

['boutheina', '(also', 'spelled', 'bothayna', 'or']
62544
['shirvanshahs', 'ohio', 'regular', 'contributor)', 'lead']
15652


In [292]:
# Sentencepiece

import sentencepiece as spm
import torchtext

In [293]:
spm.SentencePieceTrainer.Train('--input=output.txt --model_prefix=model_1 --vocab_size=10000')
sp = spm.SentencePieceProcessor()
sp.load('model_1.model')

sp.get_piece_size()

10000

In [294]:
print(sp.encode_as_pieces('Eli lilly!'))
print(sp.encode_as_ids('Eli lilly!'))

['▁E', 'li', '▁li', 'lly', '!']
[74, 1610, 1714, 1599, 976]


In [295]:
print(sp.decode_pieces(['▁E', 'li', '▁li', 'lly', '!']))
print(sp.decode_ids([74, 1610, 1714, 1599, 976]))

Eli lilly!
Eli lilly!


In [296]:
# Embedding using gensim in word2vec format

from gensim.models import Word2Vec

def tokenize_and_embed(filepath):

  with open(filepath, 'r') as f:
    text = f.read()

  tokens = sp.encode_as_pieces(text)

  model = Word2Vec(sentences = [tokens], vector_size = 50, min_count = 1)
  word2vec_embedding = model.wv

  return word2vec_embedding

embeddings = tokenize_and_embed("/content/output.txt")

In [297]:
print(f"The length of the embedding is {len(embeddings)}")

The length of the embedding is 9746


In [298]:
embeddings.index_to_key[:5]

[',', '▁the', '.', 's', '▁']

In [299]:
for word in embeddings.index_to_key[:5]:
  embedding = embeddings[word]
  print(f"{word}")
  print(f"Embedding: {embedding}")
  print(f"length is {len(embedding)}")

,
Embedding: [-0.15821798  0.13665175  0.10172564 -0.21292017 -0.17330092 -0.08970051
  0.30831704  0.16143093 -0.51108974 -0.06177684  0.08528311 -0.28096092
  0.21434954 -0.1873156  -0.03638269  0.0653494   0.42317116  0.19100742
 -0.46234775 -0.4033026   0.15554841  0.31748298  0.52794516 -0.07680685
  0.6703945   0.23705672 -0.03625747 -0.02997481 -0.40711364 -0.05873359
 -0.14257386 -0.09305119 -0.10717768 -0.17465073 -0.07319134 -0.03081069
  0.42560765 -0.01629002  0.18698648 -0.29209745  0.14180495 -0.0508297
 -0.24794997  0.12888551  0.3618283  -0.25009656  0.0471344  -0.1930906
  0.00328653  0.3401396 ]
length is 50
▁the
Embedding: [-0.21542375  0.17808495  0.10409442 -0.28904685 -0.17329203 -0.10410284
  0.37617794  0.16310264 -0.6295749  -0.04979402  0.0826259  -0.34410292
  0.27059838 -0.2641184  -0.03553003  0.08953472  0.51768506  0.24261105
 -0.55695647 -0.47323278  0.18559986  0.39946407  0.6351292  -0.11207359
  0.82369816  0.30535495 -0.02723609 -0.05178813 -0.492112

In [300]:
embeddings['the'], embeddings['the'].shape

(array([-1.4755122e-02,  1.6703237e-02, -6.7306925e-03,  7.0618093e-03,
        -1.6353590e-02, -9.0560876e-03, -5.8977380e-03,  1.9603815e-02,
        -2.2777053e-02, -2.5047460e-03,  1.8867176e-02, -1.8022401e-05,
         1.0996536e-02,  1.1844702e-02, -7.6020239e-03, -7.6250988e-03,
        -9.6891355e-03, -5.6149662e-03, -6.5517779e-03,  1.2277971e-02,
        -2.3040778e-03,  8.9241695e-03,  1.3468783e-02,  1.0891717e-02,
         4.8256754e-03,  9.4104540e-03,  1.2818528e-02, -1.5557145e-02,
         6.3929046e-03, -1.9859388e-02, -5.3406367e-03, -7.4768914e-03,
         4.9003791e-03,  1.9406274e-04, -2.5476601e-03,  7.0284400e-03,
         2.0419199e-02,  1.1902789e-02, -1.5950717e-02, -9.4479499e-03,
        -3.9276760e-03,  1.7385030e-02,  3.8404425e-03,  5.1047145e-03,
         1.7041249e-02, -8.6543644e-03, -7.6797637e-03, -1.5518800e-02,
        -4.7737965e-03,  1.3970410e-02], dtype=float32),
 (50,))

In [301]:
# Importing our new data

with open('Royal_data.txt', 'r') as f:
  new_data = f.readlines()
  f.close

print(len(new_data))
print(new_data)

11
['The future king is the prince\n', 'Daughter is the princess\n', 'Son is the prince\n', 'Only a man can be a king\n', 'Only a woman can be a queen\n', 'The princess will be a queen\n', 'The prince is a strong man\n', 'The princess is a beautiful woman\n', 'Prince is only a boy now\n', 'Prince will be king\n', 'A boy will be a man']


In [302]:
import numpy as np

# Define a function to handle word embeddings
def get_embedding(word, embeddings):
  # Check if the word exists in the vocabulary
  if word in embeddings:
    return embeddings[word]
  else:
    # Handle unknown words (e.g., average vector, random vector)
    return np.zeros(embeddings.vector_size)  # Zero vector for unknown words



# Create a list to store word embeddings for the new data
new_data_embeddings = []

for text in new_data:
  # Preprocess the text (e.g., tokenization, lowercase)
  text = text.lower()  # Simple example, adjust pre-processing as needed
  tokens = sp.encode_as_pieces(text)
  print(tokens)

  # Get word embeddings for each token and average them
  text_embedding = np.mean([get_embedding(token, embeddings) for token in tokens], axis=0)
  new_data_embeddings.append(text_embedding)

new_data_embeddings[0:2], len(new_data_embeddings)


['▁the', '▁fu', 'ture', '▁king', '▁is', '▁the', '▁pr', 'ince']
['▁daughter', '▁is', '▁the', '▁pr', 'ince', 's', 's']
['▁son', '▁is', '▁the', '▁pr', 'ince']
['▁on', 'ly', '▁a', '▁man', '▁can', '▁be', '▁a', '▁king']
['▁on', 'ly', '▁a', '▁woman', '▁can', '▁be', '▁a', '▁quee', 'n']
['▁the', '▁pr', 'ince', 's', 's', '▁will', '▁be', '▁a', '▁quee', 'n']
['▁the', '▁pr', 'ince', '▁is', '▁a', '▁strong', '▁man']
['▁the', '▁pr', 'ince', 's', 's', '▁is', '▁a', '▁be', 'au', 'ti', 'ful', '▁woman']
['▁pr', 'ince', '▁is', '▁on', 'ly', '▁a', '▁boy', '▁now']
['▁pr', 'ince', '▁will', '▁be', '▁king']
['▁a', '▁boy', '▁will', '▁be', '▁a', '▁man']


([array([-0.0584887 ,  0.04671758,  0.03322815, -0.07456634, -0.04393948,
         -0.02910887,  0.10412631,  0.04553461, -0.17704867, -0.01801847,
          0.01598099, -0.09317356,  0.06912085, -0.07342655, -0.01406843,
          0.02503313,  0.14242955,  0.06450056, -0.15973845, -0.1305278 ,
          0.04967967,  0.10948216,  0.17884652, -0.03406755,  0.2243917 ,
          0.08994204, -0.01137875, -0.00373624, -0.14020863, -0.01685864,
         -0.0509486 , -0.03558093, -0.0393089 , -0.0502714 , -0.02532208,
         -0.00863226,  0.13969   ,  0.00076205,  0.0510409 , -0.09248054,
          0.05609575, -0.02401601, -0.07151686,  0.05032519,  0.13016705,
         -0.08505597,  0.0214482 , -0.07697316, -0.00141675,  0.10806104],
        dtype=float32),
  array([-0.0685121 ,  0.05638577,  0.04459536, -0.10180177, -0.06281793,
         -0.02869456,  0.14473526,  0.06641027, -0.2310745 , -0.02058047,
          0.02353842, -0.11906494,  0.09626527, -0.10088223, -0.00923041,
          0.0

In [303]:
embeddings['man']

array([-0.00341826,  0.02465613,  0.01783841,  0.00158785,  0.00278857,
       -0.00297462,  0.03290349,  0.00417481, -0.05106325,  0.00656121,
        0.01420123, -0.00950183, -0.00237943,  0.00171774,  0.00599508,
        0.0227341 ,  0.03991064,  0.02076855, -0.03000442, -0.02447276,
        0.01256079,  0.0155857 ,  0.0142398 ,  0.0017586 ,  0.03260358,
        0.01494514,  0.0056525 , -0.01759565, -0.02613164,  0.01469611,
       -0.00548983, -0.02610622, -0.00783861,  0.00857327,  0.00539587,
        0.0167386 ,  0.0360363 ,  0.00428325, -0.00484205, -0.02013864,
        0.01644858, -0.01868846, -0.02313053,  0.02348558,  0.0209784 ,
       -0.00457763, -0.00214687, -0.03169463, -0.00497517,  0.03055826],
      dtype=float32)

In [304]:
(embeddings['is'] + embeddings['the'])/2

array([-0.0192496 ,  0.0011788 , -0.00426882,  0.00085883, -0.00931476,
       -0.00205181,  0.00669172,  0.02083366, -0.02579372, -0.00726645,
        0.01578971, -0.01387489,  0.01279379,  0.0034175 , -0.00806058,
       -0.00173574,  0.00635048,  0.00307376, -0.00514222, -0.00770317,
        0.00826695,  0.00581983,  0.02437011,  0.01272547,  0.00440492,
        0.0163493 ,  0.00917638, -0.00492582,  0.0035567 , -0.00474111,
       -0.00312569, -0.00783431,  0.00193981,  0.00578733, -0.00444977,
       -0.00586052,  0.01603447, -0.00117432, -0.01285248, -0.01830493,
       -0.00253329,  0.00117634, -0.00668852,  0.01145128,  0.02095903,
       -0.00140911,  0.00468783, -0.007789  , -0.0011566 ,  0.01466139],
      dtype=float32)

In [305]:
dummy = np.mean([embeddings['is'], embeddings['the']], axis =0)
dummy

array([-0.0192496 ,  0.0011788 , -0.00426882,  0.00085883, -0.00931476,
       -0.00205181,  0.00669172,  0.02083366, -0.02579372, -0.00726645,
        0.01578971, -0.01387489,  0.01279379,  0.0034175 , -0.00806058,
       -0.00173574,  0.00635048,  0.00307376, -0.00514222, -0.00770317,
        0.00826695,  0.00581983,  0.02437011,  0.01272547,  0.00440492,
        0.0163493 ,  0.00917638, -0.00492582,  0.0035567 , -0.00474111,
       -0.00312569, -0.00783431,  0.00193981,  0.00578733, -0.00444977,
       -0.00586052,  0.01603447, -0.00117432, -0.01285248, -0.01830493,
       -0.00253329,  0.00117634, -0.00668852,  0.01145128,  0.02095903,
       -0.00140911,  0.00468783, -0.007789  , -0.0011566 ,  0.01466139],
      dtype=float32)

In [306]:
"""# ['daughter', 'is', 'the', 'princess']

embeddings['daughter'], embeddings['princess']"""

"# ['daughter', 'is', 'the', 'princess']\n\nembeddings['daughter'], embeddings['princess']"

In [307]:
import torch
import torch.nn as nn

y = [100, 30, 120, 200, 50, 45, 150, 300, 5, 500, 350]


new_data_embeddings_tensor = torch.tensor(new_data_embeddings)
new_data_embeddings_tensor = new_data_embeddings_tensor.float()

y_tensor = torch.tensor(y)  # Replace y with your target values
y_tensor = y_tensor.float()

In [308]:

# Define model parameters
input_size = embeddings.vector_size
print(input_size)

n_hidden_units = 128

# Model architecture (without class)
model = nn.Sequential(
    nn.Linear(input_size, n_hidden_units),
    nn.ReLU(),
    nn.Linear(n_hidden_units, 1)
)

criterion = nn.MSELoss()  # Mean squared error loss
optimizer = torch.optim.Adam(model.parameters())


for epoch in range(1000):  # Train for 10 epochs
  optimizer.zero_grad()  # Clear gradients before each epoch
  outputs = model(new_data_embeddings_tensor)
  loss = criterion(outputs, y_tensor)
  loss.backward()  # Backpropagation
  optimizer.step()  # Update model parameters
  if (epoch % 100 == 0):
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


50
Epoch 1, Loss: 50470.0742
Epoch 101, Loss: 49468.2227
Epoch 201, Loss: 46607.7305
Epoch 301, Loss: 42046.6016
Epoch 401, Loss: 36541.7031
Epoch 501, Loss: 31339.1328
Epoch 601, Loss: 27413.3457
Epoch 701, Loss: 24970.6094
Epoch 801, Loss: 23806.9883
Epoch 901, Loss: 23397.8906


In [309]:
# Extract weights from the first and second linear layers
first_layer_weights = model[0].weight.data  # Weights from first linear layer
second_layer_weights = model[2].weight.data  # Weights from second linear layer

# Extract biases (if needed)
first_layer_bias = model[0].bias.data  # Bias from first linear layer
second_layer_bias = model[2].bias.data  # Bias from second linear layer

# You can access weights and biases for further analysis or saving
first_layer_weights.shape, first_layer_bias.shape

(torch.Size([128, 50]), torch.Size([128]))

In [310]:
first_layer_weights

tensor([[ 0.0040,  0.0962,  0.1092,  ...,  0.0219, -0.1181, -0.0226],
        [-0.1333,  0.1343, -0.1181,  ..., -0.1010, -0.0138, -0.0042],
        [ 0.0835, -0.0508, -0.0057,  ..., -0.1232,  0.1342, -0.0280],
        ...,
        [-0.0934, -0.0017,  0.0751,  ...,  0.1391,  0.1658, -0.1121],
        [-0.0565, -0.0181, -0.0867,  ..., -0.0624,  0.0751,  0.0682],
        [-1.1039,  0.9849,  0.9954,  ..., -1.0973, -1.0629,  0.8952]])

In [311]:
# Importing our prediction data

with open('prediction.txt', 'r') as f:
  pred_data = f.readlines()
  f.close

print(len(pred_data))
print(pred_data)

1
['prince is a king']


In [312]:
embeddings['king']

# ['the', 'future', 'king', 'is', 'the', 'prince']

array([-4.4298125e-05,  3.9429478e-03, -1.5459769e-02, -8.9199580e-03,
       -1.6907291e-02,  1.2838637e-03,  1.3465955e-02,  1.6838247e-02,
       -2.8736066e-02, -1.7014154e-06, -8.6970851e-03, -1.7211254e-03,
       -1.4359734e-02,  1.0985725e-02,  1.4949586e-03,  1.9903922e-02,
       -1.4082628e-04, -1.1637617e-02,  5.6815159e-04, -2.1725811e-02,
       -9.4473055e-03, -1.0368950e-02, -4.9052117e-03, -7.8403106e-04,
       -2.2173332e-04,  6.5942556e-03, -1.6602093e-02,  1.7075479e-02,
       -8.6540859e-03, -1.4339705e-02, -1.9728344e-02, -2.0861762e-02,
        1.3077459e-03,  1.5472771e-03, -1.1016614e-02, -1.9121943e-03,
        2.2075087e-02,  1.0621300e-02,  1.6670739e-02, -1.0366590e-02,
        1.7508181e-02,  6.0665756e-03, -2.1607546e-02, -1.2380616e-02,
       -8.6373296e-03, -5.4553207e-03, -3.5631363e-03,  4.3178229e-03,
        1.6214946e-02,  1.2907632e-02], dtype=float32)

In [313]:
# Define a function to handle word embeddings
def get_embedding(word, embeddings):
  # Check if the word exists in the vocabulary
  if word in embeddings:
    return embeddings[word]
  else:
    # Handle unknown words (e.g., average vector, random vector)
    return np.zeros(embeddings.vector_size)  # Zero vector for unknown words



# Create a list to store word embeddings for the new data
pred_data_embeddings = []

for text in pred_data:
  # Preprocess the text (e.g., tokenization, lowercase)
  tokens = text.lower().split()  # Simple example, adjust pre-processing as needed
  print(tokens)

  # Get word embeddings for each token and average them
  text_embedding = np.mean([get_embedding(token, embeddings) for token in tokens], axis=0)
  new_data_embeddings.append(text_embedding)

pred_data_embeddings, len(pred_data_embeddings)

['prince', 'is', 'a', 'king']


([], 0)

In [314]:
pred_embeddings_tensor = torch.tensor(embeddings['man']).float()  # Convert unseen data embeddings to tensor
predictions = model(pred_embeddings_tensor)
predictions

tensor([57.9188], grad_fn=<ViewBackward0>)