In [1]:
import pandas as pd

titles = [l.strip() for l in open("hackernews").readlines()]
df = pd.DataFrame(titles).drop_duplicates()
df = pd.DataFrame(df[0].tolist(), columns=["title"])
df

Unnamed: 0,title
0,The first Oxide rack being prepared for custom...
1,"Build your own Docker with Linux namespaces, c..."
2,Make your programs run faster by better using ...
3,An open source web-based flashcard studying sy...
4,Millions of GitHub repos likely vulnerable to ...
...,...
192,Why SQLite is so great for the edge (turso.tech)
193,Apple releasing segmentation/pose for humans a...
194,Undirected SS Shortest Paths with Positive Int...
195,US urged to reveal UFO evidence after claim th...


In [2]:
# df = pd.read_csv("data.csv").drop_duplicates()
# df = df[["id","title"]]
# df

In [3]:
from nltk.corpus import stopwords
stopwords_en = stopwords.words('english')

def preprocess(text:str):

    # remove the url at the end
    text = text[:text.find('(')]

    # split text
    text = text.lower()
    words = text.split()
    ret = []
    for word in words:
        # strip whitespaces
        word = word.strip()
        # remove numbers
        if type(word) == int:
            continue
        # remove stopwords
        if word in stopwords_en:
            continue
        # remove words len<3
        if len(word) < 3:
            continue
        ret.append(word)
    return ' '.join(ret)

df["text"] = df["title"].apply(preprocess)
df

Unnamed: 0,title,text
0,The first Oxide rack being prepared for custom...,first oxide rack prepared customer shipment
1,"Build your own Docker with Linux namespaces, c...","build docker linux namespaces, cgroups, chroot"
2,Make your programs run faster by better using ...,make programs run faster better using data cache
3,An open source web-based flashcard studying sy...,open source web-based flashcard studying system
4,Millions of GitHub repos likely vulnerable to ...,millions github repos likely vulnerable repoja...
...,...,...
192,Why SQLite is so great for the edge (turso.tech),sqlite great edge
193,Apple releasing segmentation/pose for humans a...,apple releasing segmentation/pose humans anima...
194,Undirected SS Shortest Paths with Positive Int...,undirected shortest paths positive integer wei...
195,US urged to reveal UFO evidence after claim th...,urged reveal ufo evidence claim intact alien v...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer()
vectorized = tfidf.fit_transform(df["text"].tolist())
vectorized.resize((len(df["text"]),300))

In [5]:
X = ["GPT-4 API General Availability (openai.com)"]
test_vec = tfidf.fit_transform(X)
test_vec.resize((len(X),300))
print(test_vec.shape)

(1, 300)


In [6]:
print(vectorized.shape, test_vec.shape)
cos = cosine_similarity(vectorized,test_vec, dense_output=False)
simi = pd.DataFrame([prob for prob in cos.toarray()], columns=["prob"])

top_k = simi.sort_values(by="prob", ascending=False).head(10)
# print([i for i in top_k.index])
# top_k["text"] = [df.iloc[i, "text"] for i in top_k.index]

print(X)
print()
for i in top_k.index:
    print(top_k.loc[i].prob, df.iloc[i].title)

(197, 300) (1, 300)
['GPT-4 API General Availability (openai.com)']

0.4082482904638631 The State of HTTP in 2022 (cloudflare.com)
0.2886751345948129 Show HN: Word2vec Algorithm in ~100sloc with NumPy (github.com/josephsboyle)
0.2886751345948129 The tiny corp raised $5.1M (geohot.github.io)
0.2886751345948129 Whistleblower drops 100 GB of Tesla secrets to German news site (jalopnik.com)
0.2548640335598648 ChatGPT simulates 1987 BBS System (sharegpt.com)
0.20412414523193154 50 years in filesystems: 1984 BSD FFS (koehntopp.info)
0.0 The first Oxide rack being prepared for customer shipment (hachyderm.io)
0.0 Writing summaries is more important than reading more books (andreasfragner.com)
0.0 Using ChatGPT for home automation (atomic14.com)
0.0 VSCode-WASM: Implement a first version of a WebShell (github.com/microsoft)
