In [None]:
"""pip install wikipedia

import wikipedia
from tqdm import tqdm

# Set the language of Wikipedia to English
wikipedia.set_lang('en')

# Fetch Wikipedia articles
articles = []
num_texts = 300

for title in tqdm(wikipedia.random(pages=num_texts), total=num_texts):
    try:
        page = wikipedia.page(title)
        article_content = page.content
        articles.append(article_content)
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.HTTPTimeoutError):
        # Handle exceptions by skipping the current article
        continue

# File path to save the text file
file_path = 'wiki.txt'

with open(file_path, 'wt', encoding='utf-8') as f:
    for article in articles:
        f.write(article + '\n')"""

"\npip install wikipedia\n\nimport wikipedia\nfrom tqdm import tqdm\n\n# Set the language of Wikipedia to English\nwikipedia.set_lang('en')\n\n# Fetch Wikipedia articles\narticles = []\nnum_texts = 120\n\nfor title in tqdm(wikipedia.random(pages=num_texts), total=num_texts):\n    try:\n        page = wikipedia.page(title)\n        article_content = page.content\n        articles.append(article_content)\n    except (wikipedia.exceptions.PageError,\n            wikipedia.exceptions.DisambiguationError,\n            wikipedia.exceptions.HTTPTimeoutError):\n        # Handle exceptions by skipping the current article\n        continue\n\n# File path to save the text file\nfile_path = 'output.txt'\n\nwith open(file_path, 'wt', encoding='utf-8') as f:\n    for article in articles:\n        f.write(article + '\n')\n\n"

In [1]:
# Data exploration

file = open("wiki.txt",'r')
wiki_corpus = file.readlines()
file.close


print(wiki_corpus[3:5])
print(len(wiki_corpus))

['\n', '== Teams ==\n']
8733


In [2]:
for i in range(len(wiki_corpus)):
  wiki_corpus[i] = wiki_corpus[i].lower().replace('\n', '')

print(wiki_corpus[3:5])
print(len(wiki_corpus))

['', '== teams ==']
8733


In [5]:
# Creating tokens

tokens = []

for sentence in wiki_corpus:
  temp = []
  for word in sentence.split():
    temp.append(word)
  tokens.append(temp)

print(tokens[1:2])
print(len(tokens))

[['sporting', 'cp', 'was', 'the', 'defending', 'champion', 'which', 'was', 'swept', 'by', 'fc', 'porto', 'in', 'semifinals.', 'sl', 'benfica', 'won', 'their', '28th', 'title', 'earning', 'their', 'first', 'title', 'since', 'the', '2016–17', 'season.']]
8733


In [6]:
# Finding unique words

vocabulary = []

for sentence in tokens:
  vocabulary.extend(sentence)

print(vocabulary[:5])
print(len(vocabulary))

vocabulary = list(set(vocabulary))
print(vocabulary[:5])
print(len(vocabulary))

['the', '2021–22', 'lpb', 'season,', 'also']
155535
['meteoritics,', 'sylvester', '1892', 'compact.', 'laboratory']
30877


In [7]:
# Sentencepiece

import sentencepiece as spm
import torchtext

In [11]:
spm.SentencePieceTrainer.Train('--input=wiki.txt --model_prefix=model_1 --vocab_size=19128 --model_type=unigram --model_type=word')
sp = spm.SentencePieceProcessor()
sp.load('model_1.model')

sp.get_piece_size()

19128

In [12]:
print(sp.encode_as_pieces('Eli lilly!'))
print(sp.encode_as_ids('Eli lilly!'))

['▁Eli', '▁li', 'lly', '!']
[13784, 3448, 3665, 4279]


In [14]:
print(sp.decode_pieces(['▁E', 'li', '▁li', 'lly', '!']))
print(sp.decode_ids([13784, 3448, 3665, 4279]))

Eli lilly!
Eli lilly!


In [22]:
# Embedding using gensim in word2vec format

from gensim.models import Word2Vec

embedding_size = 50

def tokenize_and_embed(filepath):

  with open(filepath, 'r') as f:
    text = f.read()

  tokens = sp.encode_as_pieces(text)

  model = Word2Vec(sentences = [tokens], vector_size = embedding_size, min_count = 1)
  word2vec_embedding = model.wv

  return word2vec_embedding

embeddings = tokenize_and_embed("/content/wiki.txt")

In [23]:
print(f"The length of the embedding is {len(embeddings)}")

The length of the embedding is 18226


In [24]:
embeddings.index_to_key[:5]

['▁the', ',', '.', '▁of', 's']

In [27]:
embeddings['the'], embeddings['the'].shape

(array([-0.01297385, -0.0159397 , -0.00861603,  0.00244697,  0.00082133,
         0.00169984,  0.01824684,  0.0250407 , -0.01656317, -0.0167652 ,
         0.00916714, -0.02567785,  0.00847883, -0.00036265, -0.01262266,
         0.00767779,  0.00897919,  0.00812245,  0.00831348, -0.01753588,
         0.01318977,  0.00182098,  0.02804287,  0.01833987, -0.01720589,
         0.01633936,  0.0042914 ,  0.01137361,  0.0150646 ,  0.01569795,
         0.00540742, -0.00703834,  0.00440182,  0.01540083, -0.00791423,
        -0.01825288,  0.00142883, -0.01398616, -0.01356268, -0.02004848,
        -0.00055353, -0.01210959, -0.0124638 ,  0.01803497,  0.02043474,
         0.01426068,  0.01370634,  0.00523269,  0.00370597,  0.0099492 ],
       dtype=float32),
 (50,))

In [29]:
embeddings.vector_size

50

In [36]:
import numpy as np

missing_embedding = []
properly_embedded = []

# Define a function to handle word embeddings
def get_embedding(word, embeddings):
  # Check if the word exists in the vocabulary
  if word in embeddings:
    properly_embedded.append(word)
    return embeddings[word]
  else:
    missing_embedding.append(word)
    #print(f"### MISSING EMBEDDING for {word} ###")
    # Handle unknown words (e.g., average vector, random vector)
    return np.zeros(embeddings.vector_size)  # Zero vector for unknown words

# Preprocess text columns (adjust as needed)
def preprocess_text(text):
  text = text.lower().replace('\n', '')  # Basic example
  token = sp.encode_as_pieces(text)
  """print(token)
  print(len(token))"""
  return token


# Sample data assuming your database has title, comment, and target columns
data = [
    ("Great movie!", "A must-watch!", 5),
    ("Not impressed", "Disappointing story", 2),
    ("Entertaining film", "Good visuals", 4),
]


# Preprocess text columns (adjust as needed)
def preprocess_text(text):
  text = text.lower().replace('\n', '')  # Basic example
  token = sp.encode_as_pieces(text)
  return token

# Create embedding lists for title and comment columns
title_embeddings, comment_embeddings, y = [], [], []


for title, comment, g_truth in data:
  title_embedding = np.mean([get_embedding(word, embeddings) for word in preprocess_text(title)], axis=0)
  comment_embedding = np.mean([get_embedding(word, embeddings) for word in preprocess_text(comment)], axis=0)

  title_embeddings.append(title_embedding)
  comment_embeddings.append(comment_embedding)
  y.append(g_truth)

# Concatenate embeddings from both columns
all_embeddings = np.concatenate((np.array(title_embeddings), np.array(comment_embeddings)), axis=1)

print(f"The missing_embedding are {len(missing_embedding)}")
print(f"The properly_embedded are {len(properly_embedded)}")

per = (len(missing_embedding) + len(properly_embedded)) / 100

print(f"Missing embedding for {len(missing_embedding)/per}%")
print(f"embedded for {len(properly_embedded)/per}%")

len(all_embeddings), all_embeddings[0], all_embeddings[0].shape, y

The missing_embedding are 0
The properly_embedded are 22
Missing embedding for 0.0%
embedded for 100.0%


(3,
 array([-4.7609378e-03, -1.1473961e-02,  1.5484076e-04,  8.7474883e-03,
        -1.9001651e-04,  8.8349897e-03, -4.4296482e-03,  5.4279593e-04,
        -3.3310328e-03,  2.1902062e-03,  7.1072769e-03, -1.2451406e-02,
         1.3924822e-02, -2.7603291e-03, -5.5631590e-03, -8.3397878e-03,
         1.0284283e-03, -1.2832959e-02, -1.2419790e-02, -3.8788675e-03,
         5.9220293e-03,  1.0127478e-02,  6.8716556e-03, -1.1852175e-02,
         9.5995981e-04,  9.2426511e-03, -6.9633205e-03, -1.5719045e-03,
         4.1475389e-03,  1.8295813e-02, -6.1703660e-04,  1.2126143e-02,
         6.9709085e-03,  7.9943817e-03, -9.7190039e-03, -5.7595414e-03,
        -9.0738414e-03,  2.6517652e-03,  5.6178519e-03,  4.3914802e-03,
         2.9988913e-04, -5.2399994e-03,  1.4261864e-03, -1.6963322e-03,
         1.5030630e-02,  9.5872739e-03,  5.3392150e-03, -7.0011611e-03,
         7.9210615e-05, -2.2304307e-04,  2.5743328e-02,  7.0502707e-03,
        -1.7679697e-02, -6.8297754e-03, -9.2087174e-03, -2.1

In [39]:
import torch
import torch.nn as nn

new_data_embeddings_tensor = torch.tensor(all_embeddings)
new_data_embeddings_tensor = new_data_embeddings_tensor.float()

y_tensor = torch.tensor(y)  # Replace y with your target values
y_tensor = y_tensor.float()

features = 2

# Define model parameters
input_size = embedding_size * features
print(input_size)

n_hidden_units = 1000
n_hidden_units_1 = 1000

epoch = 1000

# Model architecture (without class)
model = nn.Sequential(
                    nn.Linear(input_size, n_hidden_units),
                    nn.ReLU(),
                    nn.Linear(n_hidden_units, n_hidden_units_1),
                    nn.ReLU(),
                    nn.Linear(n_hidden_units_1, 1)
                      )

criterion = nn.MSELoss()  # Mean squared error loss
optimizer = torch.optim.Adam(model.parameters())


for epoch in range(epoch):  # Train for 10 epochs
  optimizer.zero_grad()  # Clear gradients before each epoch
  outputs = model(new_data_embeddings_tensor)
  loss = criterion(outputs, y_tensor)
  loss.backward()  # Backpropagation
  optimizer.step()  # Update model parameters
  if (epoch % 100 == 0):
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


100
Epoch 1, Loss: 14.9650
Epoch 101, Loss: 1.5562
Epoch 201, Loss: 1.5556
Epoch 301, Loss: 1.5556
Epoch 401, Loss: 1.5556
Epoch 501, Loss: 1.5556
Epoch 601, Loss: 1.5556
Epoch 701, Loss: 1.5556
Epoch 801, Loss: 1.5556
Epoch 901, Loss: 1.5556


In [40]:
"""from google.colab import drive
drive.mount('/content/drive')"""


Mounted at /content/drive


In [47]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/hacknews.csv")
df.head()

  df = pd.read_csv("/content/drive/MyDrive/hacknews.csv")


Unnamed: 0,id,by,type,title,url,score,time,text,parent,descendants,poll,parts,kids
0,187281,phlitesoft,comment,,,,1210567000.0,Just get into your router config and block sit...,187132.0,,,,
1,187294,LogicHoleFlaw,comment,,,,1210568000.0,The payment gateway software I've interfaced w...,187275.0,,,,[187307]
2,187301,babo,comment,,,,1210569000.0,"An open office environment, where others shoul...",187132.0,,,,
3,187381,mstefff,comment,,,,1210578000.0,Oh yea..<p>I'm currently using Google/Yahoo mo...,187362.0,,,,
4,187428,austinwells1984,story,Unique Wedding Favors,http://www.uniquewedingfavors.com,1.0,1210585000.0,"Unique Weding Favors offers elegant, inexpensi...",,2.0,,,"[187433, 187512, 187430]"


In [48]:
columns_to_keep = ['title', 'text', 'score']
df.dropna(subset = columns_to_keep, inplace=True)

data = df[columns_to_keep]
data.count()

title    27717
text     27717
score    27717
dtype: int64

In [49]:
data

Unnamed: 0,title,text,score
4,Unique Wedding Favors,"Unique Weding Favors offers elegant, inexpensi...",1.0
42,Ask HN: Free (Quality) Financial/Stock Content...,"Hey,<p>Quick question. I've been searching end...",2.0
47,Google Is A Malware Site (Says Yahoo),some Yahoo search results that point to Google...,1.0
80,Java Web Framework Tournament 2008,"Found this today, really really liked it thoug...",2.0
168,Ask PG: What Marketing/PR strategies do you su...,How do you help YC companies to create maximum...,30.0
...,...,...,...
813913,Livestock Shelters | Mare Motels | Barn Kits,"Mare motels, open air barn kits, paddock shelt...",1.0
813918,Ask YC: Cybersquatting,Hello.<p>Could someone please tell me if it is...,1.0
813970,Carribean vacations,"All inclusive Carribean travel, vacations, cru...",1.0
813971,"Ask YC: SMS Application, any help?",I am building an application that will need to...,2.0


In [50]:
data.describe()

Unnamed: 0,score
count,27717.0
mean,5.441895
std,13.71889
min,0.0
25%,1.0
50%,1.0
75%,4.0
max,928.0


In [52]:
data[data["score"] >5]

Unnamed: 0,title,text,score
168,Ask PG: What Marketing/PR strategies do you su...,How do you help YC companies to create maximum...,30.0
397,Ask YC: Writing an open source software or usi...,Recently I got lucky to be in a situation wher...,6.0
529,An FAQ for Hacker News?,If there was an FAQ section that had threads t...,25.0
534,Ask YC: Have you built a good website that nob...,Did you ever launch a project that deserved po...,103.0
978,Ask YC: PHP6?,How do fellow YC'ers feel about the direction ...,8.0
...,...,...,...
813578,Proving that ideas are not worth a lot.,We always argue on the relative worth of ideas...,15.0
813722,"FloatingTime, a Google App Engine application ...",Sign up now and I'll let you in on the private...,9.0
813785,YC: Don't forget to call your mother.,It's mothers day.,37.0
813810,Ask YC: Textile vs Markdown,"I'm sick of writing html tags. It is a bore, e...",21.0


In [54]:
# Specify the output CSV file name
csv_file_path = "hacker_news_final.csv"

# Export the DataFrame to a CSV file with 'to_csv'
data.to_csv(csv_file_path, index=False)  # Optionally exclude the index column


In [165]:
data.title[0:10]

AttributeError: 'list' object has no attribute 'title'

In [55]:
import csv

# Define the path to your CSV file
csv_file_path = "/content/hacker_news_final.csv"

# Create an empty list to store the processed data
data = []

# Open the CSV file in read mode
with open(csv_file_path, 'r', newline='') as csvfile:
  # Create a CSV reader object
  reader = csv.reader(csvfile)

  # Skip the header row (if present)
  next(reader, None)  # Assuming the first row is a header

  # Iterate through rows in the CSV file
  for row in reader:
    # Extract title, comment, and target values
    title = row[0]
    comment = row[1]
    target = float(row[2])  # Assuming the target is a numerical value

    # Append a tuple with the extracted data to the data list
    data.append((title, comment, target))

# Now you have the data in the desired format:

print(data[2:3])



In [170]:
features = 2
missing_embedding = []
properly_embedded = []

# Define a function to handle word embeddings
def get_embedding(word, embeddings):
  # Check if the word exists in the vocabulary
  if word in embeddings:
    properly_embedded.append(word)
    return embeddings[word]
  else:
    missing_embedding.append(word)
    #print(f"### MISSING EMBEDDING for {word} ###")
    # Handle unknown words (e.g., average vector, random vector)
    return np.zeros(embeddings.vector_size)  # Zero vector for unknown words

# Preprocess text columns (adjust as needed)
def preprocess_text(text):
  text = text.lower().replace('\n', '')  # Basic example
  token = sp.encode_as_pieces(text)
  """print(token)
  print(len(token))"""
  return token

# Create embedding lists for title and comment columns
title_embeddings, comment_embeddings, all_embeddings, y = [], [], [], []

count = 1

for title, comment, g_truth in data[:20000]:
  title_embedding = np.mean([get_embedding(word, embeddings) for word in preprocess_text(title)], axis=0)
  """if count ==1:
    print(title_embedding)"""
  comment_embedding = np.mean([get_embedding(word, embeddings) for word in preprocess_text(comment)], axis=0)
  """if count ==1:
    print(comment_embedding)"""

  # Concatenate embeddings from both columns
  all_embedding = (title_embedding + comment_embedding)/features
  """if count ==1:
    print(all_embedding)
    print(all_embedding.shape)
    count += 1"""

  title_embeddings.append(title_embedding)
  comment_embeddings.append(comment_embedding)
  all_embeddings.append(all_embedding)
  y.append(g_truth)

print(f"The missing_embedding are {len(missing_embedding)}")
print(f"The properly_embedded are {len(properly_embedded)}")
per = (len(missing_embedding) + len(properly_embedded)) / 100
print(f"tot is {len(missing_embedding) + len(properly_embedded)}")

print(f"\nMissing embedding for {len(missing_embedding)/per}%")
print(f"embedded for {len(properly_embedded)/per}%\n")

len(all_embeddings), all_embeddings[0], all_embeddings[0].shape, len(y)

The missing_embedding are 57570
The properly_embedded are 2296160
tot is 2353730

Missing embedding for 2.4459050103452817%
embedded for 97.55409498965471%



(20000,
 array([ 0.05130332,  0.02413601, -0.04160256, -0.00094028, -0.01981839,
        -0.04589368,  0.08215793,  0.06880328, -0.04613211, -0.05924923,
        -0.00958673, -0.07314713,  0.02026604, -0.02269147, -0.04436878,
         0.05188602,  0.00957028,  0.02992623, -0.03854349, -0.0284401 ,
        -0.00724261,  0.08765827,  0.09634409,  0.00695292,  0.01640814,
         0.01192177, -0.01417532,  0.03326701,  0.00048219,  0.03800485,
         0.01852639, -0.02140378, -0.00110776, -0.00941067, -0.03359887,
        -0.00681732,  0.03508276,  0.00087298,  0.02708487, -0.01673411,
         0.05427176,  0.00716442, -0.02759383,  0.04717784,  0.07627361,
         0.00523061, -0.00907822, -0.02076351,  0.00572577,  0.04256276],
       dtype=float32),
 (50,),
 20000)

In [173]:
tot_words = missing_embedding + properly_embedded
len(tot_words), len(set(tot_words))

(2353730, 10978)

In [169]:
missing_embedding = set(missing_embedding)
properly_embedded = set(properly_embedded)


print(len(missing_embedding) + len(properly_embedded))
len(missing_embedding), len(properly_embedded)

10978


(746, 10232)

In [150]:
new_data_embeddings_tensor = torch.tensor(all_embeddings)
new_data_embeddings_tensor = new_data_embeddings_tensor.float()

y_tensor = torch.tensor(y)
y_tensor = y_tensor.unsqueeze(1)
y_tensor = y_tensor.float()

new_data_embeddings_tensor[0], y_tensor.shape, new_data_embeddings_tensor[0].shape, torch.mean(y_tensor)

(tensor([ 0.0513,  0.0241, -0.0416, -0.0009, -0.0198, -0.0459,  0.0822,  0.0688,
         -0.0461, -0.0592, -0.0096, -0.0731,  0.0203, -0.0227, -0.0444,  0.0519,
          0.0096,  0.0299, -0.0385, -0.0284, -0.0072,  0.0877,  0.0963,  0.0070,
          0.0164,  0.0119, -0.0142,  0.0333,  0.0005,  0.0380,  0.0185, -0.0214,
         -0.0011, -0.0094, -0.0336, -0.0068,  0.0351,  0.0009,  0.0271, -0.0167,
          0.0543,  0.0072, -0.0276,  0.0472,  0.0763,  0.0052, -0.0091, -0.0208,
          0.0057,  0.0426]),
 torch.Size([20000, 1]),
 torch.Size([50]),
 tensor(5.0665))

In [156]:
# Define model parameters
input_size = embedding_size
n_hidden_units = 100
n_hidden_units_1 = 200
epoch = 1000

# Model architecture (without class)
model = nn.Sequential(
                    nn.Linear(input_size, n_hidden_units),
                    nn.ReLU(),
                    nn.Linear(n_hidden_units, n_hidden_units_1),
                    nn.ReLU(),
                    nn.Linear(n_hidden_units_1, 1)
                      )

criterion = nn.MSELoss()  # Mean squared error loss
optimizer = torch.optim.Adam(model.parameters())


for epoch in range(epoch):  # Train for 10 epochs
  outputs = model(new_data_embeddings_tensor)
  loss = criterion(outputs, y_tensor)
  loss.backward()  # Backpropagation
  optimizer.step()  # Update model parameters
  optimizer.zero_grad()  # Clear gradients before each epoch
  if (epoch % 100 == 0):
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 231.748046875
Epoch 101, Loss: 205.7823486328125
Epoch 201, Loss: 205.6969757080078
Epoch 301, Loss: 205.57899475097656
Epoch 401, Loss: 205.2915496826172
Epoch 501, Loss: 204.84620666503906
Epoch 601, Loss: 203.6278076171875
Epoch 701, Loss: 200.77561950683594
Epoch 801, Loss: 198.28860473632812
Epoch 901, Loss: 197.32887268066406


In [157]:
"""# Importing our prediction data

with open('/content/prediction_final', 'r') as f:
  pred_data = f.readlines()
  f.close

print(len(pred_data))
print(pred_data)"""

"# Importing our prediction data\n\nwith open('/content/prediction_final', 'r') as f:\n  pred_data = f.readlines()\n  f.close\n\nprint(len(pred_data))\nprint(pred_data)"

In [161]:
pred_data = [('mario movie is a hit com movie in recent time')]

# Create embedding lists for title and comment columns
title_embeddings = []

for title in pred_data:
  print(title)

  title_embedding = np.mean([get_embedding(word, embeddings) for word in preprocess_text(title)], axis=0)
  print(title_embedding)

pred_data_embeddings = title_embedding

len(pred_data_embeddings), pred_data_embeddings[0].shape

pred_embeddings_tensor = torch.tensor(pred_data_embeddings).float()  # Convert unseen data embeddings to tensor
predictions = model(pred_embeddings_tensor)
predictions

mario movie is a hit com movie in recent time
[ 0.03409271  0.02231258 -0.02609445 -0.00032081 -0.01069341 -0.03544645
  0.06711969  0.05510368 -0.04005481 -0.04916792 -0.00725382 -0.05749001
  0.01771778 -0.01846448 -0.03721418  0.03690248  0.00767441  0.01811466
 -0.02304493 -0.01797671 -0.0003578   0.06914311  0.07990763  0.01004995
  0.00505029  0.0119848  -0.02319112  0.02851867  0.00102214  0.02782373
  0.02052049 -0.01440495  0.0047368  -0.00313675 -0.02858497 -0.00411864
  0.02565012 -0.00168543  0.01742792 -0.01234486  0.04604072  0.00618614
 -0.02045999  0.04120018  0.06398307  0.00479702 -0.00739243 -0.01303452
  0.00868049  0.03041896]


tensor([5.0861], grad_fn=<ViewBackward0>)