In [1]:
import pandas as pd

titles = [l.strip() for l in open("hackernews").readlines()]
df = pd.DataFrame(titles).drop_duplicates()
df = pd.DataFrame(df[0].tolist(), columns=["title"])
df

Unnamed: 0,title
0,The first Oxide rack being prepared for custom...
1,"Build your own Docker with Linux namespaces, c..."
2,Make your programs run faster by better using ...
3,An open source web-based flashcard studying sy...
4,Millions of GitHub repos likely vulnerable to ...
...,...
1820,U.S. Feds Seized Nearly $1B in Bitcoin from Wa...
1821,Facebook was used as a proxy by web scraping bots
1822,How is Visual Basic still ranked #6 programmin...
1823,Confessions of a voter fraud: I was a master a...


In [2]:
# df = pd.read_csv("data.csv").drop_duplicates()
# df = df[["id","title"]]
# df

In [3]:
import re
from nltk.corpus import stopwords
stopwords_en = stopwords.words('english')

def preprocess(text:str):

    # remove the url at the end
    text = text[:text.find('(')]

    # split text
    text = text.lower()
    words = text.split()
    ret = []
    for word in words:
        # strip whitespaces
        word = word.strip()
        # remove stopwords
        if word in stopwords_en:
            continue
        # remove words len<3
        if len(word) < 3:
            continue

        # regex
        word = re.sub(r'[0-9]+', '', word)
        word = re.sub(r'\W', '', word)
        ret.append(word)
    return ' '.join(ret)

df["text"] = df["title"].apply(preprocess)
df

Unnamed: 0,title,text
0,The first Oxide rack being prepared for custom...,first oxide rack prepared customer shipment
1,"Build your own Docker with Linux namespaces, c...",build docker linux namespaces cgroups chroot
2,Make your programs run faster by better using ...,make programs run faster better using data cache
3,An open source web-based flashcard studying sy...,open source webbased flashcard studying system
4,Millions of GitHub repos likely vulnerable to ...,millions github repos likely vulnerable repoja...
...,...,...
1820,U.S. Feds Seized Nearly $1B in Bitcoin from Wa...,us feds seized nearly b bitcoin wallet linked ...
1821,Facebook was used as a proxy by web scraping bots,facebook used proxy web scraping bot
1822,How is Visual Basic still ranked #6 programmin...,visual basic still ranked programming language
1823,Confessions of a voter fraud: I was a master a...,confessions voter fraud master fixing mailin b...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer()
vectorized = tfidf.fit_transform(df["text"].tolist())
vectorized.resize((len(df["text"]),300))

In [5]:
X = ["Excellence is a habit, but so is failure (awesomekling.github.io)"]

test_vec = tfidf.fit_transform(X)
test_vec.resize((len(X),300))
# print(test_vec[0,294])
print(test_vec)
print(test_vec.shape)

  (0, 6)	0.2886751345948129
  (0, 4)	0.2886751345948129
  (0, 0)	0.2886751345948129
  (0, 3)	0.2886751345948129
  (0, 8)	0.2886751345948129
  (0, 1)	0.2886751345948129
  (0, 5)	0.2886751345948129
  (0, 7)	0.5773502691896258
  (0, 2)	0.2886751345948129
(1, 300)


In [6]:
# print(vectorized.shape, test_vec.shape)
cos_sim = cosine_similarity(vectorized,test_vec)
for i in range(test_vec.shape[0]):
    df["similarity"] = [x[0] for x in cos_sim]

top_k = (df.sort_values(by="similarity", ascending=False)
            .head(10)
            .query("similarity>0")
        )
print(X)
display(top_k["title"].head(3).tolist())

['Excellence is a habit, but so is failure (awesomekling.github.io)']


['Students have to jump through absurd hoops to use exam monitoring software',
 'Ableton Live 11',
 'Twitter may slow down users’ ability to ‘like’ tweets containing misinformation']