In [1]:
import pandas as pd
import numpy as np
import json

# Load the labeled data

We are using the **kern export format** here, which is a simple JSON file that pandas can read directly.

If you're using a CSV from an Excel export instead, just modify the loading code below!

In [12]:
# Location of the labeled export, relative to the notebook's working directory.
# A forward slash is used as separator: it works on Windows, Linux and macOS,
# whereas the previous "\d" was an invalid string escape sequence.
path = "labeled data/datascienceweekly_labeled.json"
df = pd.read_json(path)

In [13]:
# Discard the weak-supervision label and its confidence score.
weak_supervision_cols = [
    "__Interesting__WEAK_SUPERVISION",
    "__Interesting__WEAK_SUPERVISION__confidence",
]
df = df.drop(columns=weak_supervision_cols)

In [14]:
# Preview the first rows of the loaded frame after dropping the weak-supervision columns
df.head()

Unnamed: 0,newsletter,date,topic,headline,body,__Interesting__MANUAL
0,datascienceweekly,2022-04-28 23:21:02+00:00,Editor Picks,Beyond interpretability: developing a language...,"AI will continue becoming more complex, bigger...",yes
1,datascienceweekly,2022-04-21 22:34:45+00:00,Data Science Articles & Videos,Bad ML Abstractions I (Generative vs Discrimin...,This post is part of a series on bad abstracti...,no
2,datascienceweekly,2022-03-31 23:12:31+00:00,Data Science Articles & Videos,Exploring Plain Vision Transformer Backbones f...,"We explore the plain, non-hierarchical Vision ...",yes
3,datascienceweekly,2022-03-31 23:12:31+00:00,Data Science Articles & Videos,Are we being too harsh on junior candidates? [...,As part of our hiring process for ML Engineers...,yes
4,datascienceweekly,2022-03-25 02:50:51+00:00,Data Science Articles & Videos,Sentiment Analysis on News Headlines: Classic ...,An explanatory guide to develop a binary class...,no


# Data Pre-Processing

In [41]:
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# NLTK data packages used by the pre-processing cells below.
# Packages that are already present are reported as up-to-date, so this cell
# can safely be re-run.

# required for WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# required for word_tokenize
# (newer NLTK releases look for 'punkt_tab', older ones for 'punkt')
nltk.download('punkt')
nltk.download('punkt_tab')

# required for the English stop-word list
nltk.download('stopwords')

# Text columns that every pre-processing step is applied to
text_cols = ["topic", "headline", "body"]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Moe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Free-text columns; each pre-processing step below loops over these.
text_cols = [
    "topic",
    "headline",
    "body",
]

## Expand Abbreviations

In [None]:
# TODO

## Lower Case

In [15]:
# Normalise the case of every text column so tokens compare case-insensitively.
for column in text_cols:
    df[column] = df[column].map(lambda text: text.lower())

## Remove Punctuation

In [16]:
# Translation table that deletes every character listed in string.punctuation;
# built once and reused for all columns.
punctuation_table = str.maketrans("", "", string.punctuation)

for column in text_cols:
    df[column] = df[column].apply(lambda text: text.translate(punctuation_table))

## Tokenize

In [19]:
# Replace each text with its list of word tokens.
for column in text_cols:
    df[column] = df[column].apply(word_tokenize)

In [26]:
# Remove English stop words from every token list.
#
# Punctuation was stripped from the texts in an earlier step, so contractions
# show up as "dont" / "youre" rather than "don't" / "you're". The stop-word list
# is therefore extended with punctuation-free variants of its entries; without
# them those entries could never match a token.
# NOTE(review): a few stripped forms may coincide with ordinary words, depending
# on the installed stop-word list - confirm this is acceptable for the data.
stop_words = stopwords.words('english')
_stop_word_punct_table = str.maketrans("", "", string.punctuation)

# A set gives constant-time membership tests (the plain list is scanned per token).
stop_word_set = set(stop_words) | {
    word.translate(_stop_word_punct_table) for word in stop_words
}

for col in text_cols:
    df[col] = df[col].apply(
        lambda tokens: [word for word in tokens if word not in stop_word_set]
    )

## Lemmatizing

In [31]:
# Reduce every token to its WordNet lemma (e.g. "picks" -> "pick").
wnl = WordNetLemmatizer()

for column in text_cols:
    df[column] = df[column].apply(lambda tokens: list(map(wnl.lemmatize, tokens)))

## Merging the texts

In [None]:
# Concatenate the token lists of topic, headline and body (in that order) and
# join them into one space-separated string per row.
# Built with a single column assignment instead of iterrows() plus a per-cell
# df.loc write, which is slow and leaves loop variables behind in the kernel.
df["full_text"] = [
    " ".join(topic + headline + body)
    for topic, headline, body in zip(df["topic"], df["headline"], df["body"])
]

In [45]:
# Preview the tokenized text columns next to the merged full_text column
df.head()

Unnamed: 0,newsletter,date,topic,headline,body,__Interesting__MANUAL,full_text
0,datascienceweekly,2022-04-28 23:21:02+00:00,"[editor, pick]","[beyond, interpretability, developing, languag...","[ai, continue, becoming, complex, bigger, smar...",yes,editor pick beyond interpretability developing...
1,datascienceweekly,2022-04-21 22:34:45+00:00,"[data, science, article, video]","[bad, ml, abstraction, generative, v, discrimi...","[post, part, series, bad, abstraction, machine...",no,data science article video bad ml abstraction ...
2,datascienceweekly,2022-03-31 23:12:31+00:00,"[data, science, article, video]","[exploring, plain, vision, transformer, backbo...","[explore, plain, nonhierarchical, vision, tran...",yes,data science article video exploring plain vis...
3,datascienceweekly,2022-03-31 23:12:31+00:00,"[data, science, article, video]","[harsh, junior, candidate, reddit, discussion]","[part, hiring, process, ml, engineer, looking,...",yes,data science article video harsh junior candid...
4,datascienceweekly,2022-03-25 02:50:51+00:00,"[data, science, article, video]","[sentiment, analysis, news, headline, classic,...","[explanatory, guide, develop, binary, classifi...",no,data science article video sentiment analysis ...


# Vectorizing the text

In [48]:
# Create a count vectorizer and learn its vocabulary from the merged texts.
vectorizer = CountVectorizer()
vectorizer = vectorizer.fit(df["full_text"])

In [53]:
# Encode every merged text as token counts using the fitted vectorizer.
transformed_matrix = vectorizer.transform(df["full_text"])