In [2]:
import pandas as pd
import numpy as np
import json

# Load the labeled data

We are using the **kern export format** here, which is a simple json that can be read from pandas directly. 

If you're using a csv from an Excel export, just modify this code here!

In [17]:
path = "labeled data\datascienceweekly_labeled.json"
original_df = pd.read_json(path)

In [18]:
original_df = original_df.drop(columns=["__Interesting__WEAK_SUPERVISION", "__Interesting__WEAK_SUPERVISION__confidence"])

In [19]:
original_df.head()

Unnamed: 0,newsletter,date,topic,headline,body,__Interesting__MANUAL
0,datascienceweekly,2022-04-28 23:21:02+00:00,Editor Picks,Beyond interpretability: developing a language...,"AI will continue becoming more complex, bigger...",yes
1,datascienceweekly,2022-04-21 22:34:45+00:00,Data Science Articles & Videos,Bad ML Abstractions I (Generative vs Discrimin...,This post is part of a series on bad abstracti...,no
2,datascienceweekly,2022-03-31 23:12:31+00:00,Data Science Articles & Videos,Exploring Plain Vision Transformer Backbones f...,"We explore the plain, non-hierarchical Vision ...",yes
3,datascienceweekly,2022-03-31 23:12:31+00:00,Data Science Articles & Videos,Are we being too harsh on junior candidates? [...,As part of our hiring process for ML Engineers...,yes
4,datascienceweekly,2022-03-25 02:50:51+00:00,Data Science Articles & Videos,Sentiment Analysis on News Headlines: Classic ...,An explanatory guide to develop a binary class...,no


# Data Pre-Processing

In [1]:
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# required for WordNetLemmatizer
nltk.download('omw-1.4')

# required for... well stopwords
nltk.download('stopwords')

text_cols = ["topic", "headline", "body"]
df = original_df.copy(deep=True)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Moe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Expand Abbreviations

In [None]:
# TODO

## Lower Case

In [6]:
for col in text_cols:
    df[col] = df[col].apply(lambda x: x.lower())

## Remove Punctuiation

In [7]:
for col in text_cols:
    df[col] = df[col].apply(lambda x: "".join([char for char in x if char not in string.punctuation]))

## Tokenize

In [8]:
for col in text_cols:
    df[col] = df[col].apply(lambda x: word_tokenize(x))

In [9]:
stop_words = stopwords.words('english')

for col in text_cols:
    df[col] = df[col].apply(lambda x: [word for word in x if word not in stop_words])

## Lemmaziting

In [10]:
wnl = WordNetLemmatizer()

for col in text_cols:
    df[col] = df[col].apply(lambda x: [wnl.lemmatize(word) for word in x])

## Merging the texts

In [11]:
# TODO could be done easier with df.apply
for i, row in df.iterrows():
    df.loc[i,"full_text"] = " ".join(row["topic"] + row["headline"] + row["body"])

In [12]:
df.head()

Unnamed: 0,newsletter,date,topic,headline,body,__Interesting__MANUAL,full_text
0,datascienceweekly,2022-04-28 23:21:02+00:00,"[editor, pick]","[beyond, interpretability, developing, languag...","[ai, continue, becoming, complex, bigger, smar...",yes,editor pick beyond interpretability developing...
1,datascienceweekly,2022-04-21 22:34:45+00:00,"[data, science, article, video]","[bad, ml, abstraction, generative, v, discrimi...","[post, part, series, bad, abstraction, machine...",no,data science article video bad ml abstraction ...
2,datascienceweekly,2022-03-31 23:12:31+00:00,"[data, science, article, video]","[exploring, plain, vision, transformer, backbo...","[explore, plain, nonhierarchical, vision, tran...",yes,data science article video exploring plain vis...
3,datascienceweekly,2022-03-31 23:12:31+00:00,"[data, science, article, video]","[harsh, junior, candidate, reddit, discussion]","[part, hiring, process, ml, engineer, looking,...",yes,data science article video harsh junior candid...
4,datascienceweekly,2022-03-25 02:50:51+00:00,"[data, science, article, video]","[sentiment, analysis, news, headline, classic,...","[explanatory, guide, develop, binary, classifi...",no,data science article video sentiment analysis ...


# Vectorizing the text

In [13]:
vectorizer = CountVectorizer().fit(df['full_text'])

In [14]:
transformed_matrix = vectorizer.transform(df['full_text'])

In [15]:
cosine_sim = cosine_similarity(transformed_matrix, transformed_matrix)

In [25]:
def recommend_from_headline(df, headline, cosine_sim = cosine_sim):
    print("looking for similar articles to:\n", headline)
    recommended_articles = []
    idx = df[df["headline"] == headline].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indices = list(score_series.iloc[1:11].index)
    
    for i in top_10_indices:
        recommended_articles.append(list(df['headline'])[i])
        
    return recommended_articles

In [26]:
recommend_from_headline(original_df, original_df.iloc[0].headline)

looking for similar articles to:  Beyond interpretability: developing a language to shape our relationships with AI


['What’s wrong with “explainable A.I.”',
 'On NYT Magazine on AI: Resist the Urge to be Impressed',
 'Announcing the 2022 AI Index Report',
 'The Weird and Wonderful World of AI Art',
 "What's next for AlphaFold and the AI protein-folding revolution",
 'Expert opinion: Regulating AI in Europe',
 'Andrew Ng: Unbiggen AI',
 'Anatomy of an AI System',
 'How Does AI Improve Human Decision-Making? Evidence from the AI-Powered Go Program',
 "ICLR Conference's First Blogpost Track Experiment was a great success with 20 accepted posts"]

In [42]:
def recommend_from_vector(original_df, vec1, transformed_matrix = transformed_matrix):
    recommended_articles = []
    similarity_matrix = cosine_similarity(vec1, transformed_matrix)
    scores = pd.Series(similarity_matrix[0]).sort_values(ascending = False)
    top_10_indices = list(scores.iloc[1:11].index)
    for i in top_10_indices:
        recommended_articles.append(list(original_df['headline'])[i])
    return recommended_articles

In [56]:
# average the "interesting" vector
interesting_idxs = original_df[original_df["__Interesting__MANUAL"] == "yes"].index

# average the "uninteresting" vector
uninteresting_idxs = original_df[original_df["__Interesting__MANUAL"] == "no"].index

In [57]:
recommend_from_vector(original_df, transformed_matrix[uninteresting_idxs].mean(axis=0))



['Data Science at Shopify',
 'Probability Distributions To Be Aware Of For Data Science (With Code)',
 'Faking It: How to Simulate Complex Data Generation Processes in R, Tidyverse Edition',
 'A visual introduction to machine learning',
 'Everything gets a package? Yes, everything gets a package.',
 'Data Observability vs. Data Testing: Everything You Need to Know',
 'How to Structure a Data Science Project for Readability and Transparency',
 'Data salaries at FAANG companies in 2022',
 'Hungryroot is looking for a Data Scientist to join our growing Data Team. As a Data Scientist, you will work closely with other Data Scientists and Data Engineers to develop various Machine Learning models that power Hungryroot and it’s AI functions. These models include traditional forecasting models, as well as more industry-specific optimization challenges.',
 'Hungryroot is looking for a Data Scientist to join our growing Data Team. As a Data Scientist, you will work closely with other Data Scienti