In [93]:
from joblib import Parallel, delayed, cpu_count

import psycopg2

import pandas as pd
import pandas.io.sql as sqlio

import funcy as fp

import re

from bs4 import BeautifulSoup

from textacy.preprocess import preprocess_text, replace_numbers, replace_phone_numbers, replace_urls

import nltk
from nltk.stem import WordNetLemmatizer 

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from multiprocessing import Pool
import numpy as np

import pattern3

import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, LdaMulticore, TfidfModel
from gensim.models.wrappers import LdaMallet
from gensim.similarities import Similarity, MatrixSimilarity, SparseMatrixSimilarity

# import pyLDAvis.gensim
import matplotlib.pyplot as plt

from gensim.test.utils import get_tmpfile
from gensim.matutils import hellinger
from gensim.utils import to_utf8, tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import strip_tags, preprocess_string, remove_stopwords, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_numeric, strip_non_alphanum

import pickle

In [2]:
%%time

conn = psycopg2.connect("host=localhost dbname=postgres user=postgres")

contents_df = sqlio.read_sql_query("SELECT id, all_text, article_content FROM bq_all LIMIT 20000", conn, index_col='id')

CPU times: user 660 ms, sys: 1.04 s, total: 1.7 s
Wall time: 10 s


In [4]:
%%time

contents_df["all_text"] += contents_df["article_content"]

CPU times: user 211 ms, sys: 156 ms, total: 366 ms
Wall time: 372 ms


In [12]:
test_series = contents_df["all_text"].head(10000)

In [13]:
def iter_series(series):
    extracted = 0
    for doc in series:
        yield doc
        extracted += 1
        
def pool_iter_series(series):
    pool = Pool()

    series = iter_series(series)

    pool.close()
    pool.join()
    
    return series

In [14]:
# Define function to escape HTML characters, if any:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Define function to replace contractions in string of text, if any:
def clean_text(text):
    text = strip_html(text)
    return replace_numbers(replace_urls(preprocess_text(text, fix_unicode=True, no_accents=True, no_contractions=True, lowercase=True, no_punct=True, no_currency_symbols=True), replace_with=' '), replace_with='')

def tokenize_text(text):
    return list(tokenize(clean_text(text.replace('/n', ' ')).replace('.com', ' ').replace('.org', ' ').replace('.net', ' ')))

In [15]:
STOP_WORDS = list(STOP_WORDS)
STOP_WORDS.append('http')
STOP_WORDS.append('www')

def lemmatize_words(words_list):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(wnl.lemmatize(wnl.lemmatize(word, 'a'), 'v'), 'n') for word in words_list if len(word) >= 3]

In [16]:
def pool_imap(function, generator):
    pool = Pool()

    result_generator = pool.imap(function, generator)

    pool.close()
    pool.join()
    
    return result_generator

In [17]:
def get_phrases(doc_stream):
    return [word for word in trigram[bigram[doc_stream]] if len(word) >= 3]

In [18]:
def save_pickle(file_name, object_to_pickle):
    with open(file_name, 'wb') as f:
        pickle.dump(object_to_pickle, f)
        
def load_pickle(file_name):
    with open(file_name, 'rb') as f:
        unpickled_object = pickle.load(f)
    return unpickled_object

In [19]:
def do_preprocess_step(iterable_input, function, file_name):
    docs = pool_iter_series(iterable_input)
    processed_docs = pool_imap(function, docs)
    docs_list = list(processed_docs)
    save_pickle(file_name, docs_list)
    docs_list = load_pickle(file_name)
    return docs_list

In [20]:
def get_bow(tokens):
    return dct.doc2bow(tokens)

In [28]:
def train_and_save_gensim_model(model_type_str, corpus, file_name='model_300.model', num_topics=None):
    if model_type_str == "tfidf":
        model = TfidfModel(corpus=corpus, id2word=dct)
    elif model_type_str == "lsi":
        model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dct)
    elif model_type_str == "lda":
        model = LdaModel(corpus=corpus, alpha='auto', num_topics=num_topics, id2word=dct)
    elif model_type_str == "hdp":
        model = HdpModel(corpus=corpus, id2word=dct)
    model.save(file_name)
    return model

In [29]:
def transform_tfidf(bow):
    return tfidf_model[bow]

In [284]:
def train_and_save_indexer(corpus, file_name='model_100_indexer.model'):
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp, corpus=corpus, num_features=len(dct), num_best=6)
    indexer.save(file_name)
    return indexer

## CLEAN AND TOKENIZE TEXTS

In [30]:
%%time
tokens_list = do_preprocess_step(test_series, tokenize_text, 'tokens_list.pkl')

CPU times: user 21.6 s, sys: 8.36 s, total: 29.9 s
Wall time: 6min 37s


In [31]:
# %%time
# contents = pool_iter_series(contents_df["all_text"])

In [32]:
# %%time
# tokens = pool_imap(tokenize_text, contents)

In [33]:
# %%time
# tokens_list = list(tokens)

In [34]:
# %%time
# save_pickle('tokens_list.pkl', tokens_list)

In [35]:
# %%time
# tokens_list = load_pickle('tokens_list.pkl')

## LEMMATIZE TEXTS

In [36]:
%%time
lemmatized_list = do_preprocess_step(tokens_list, lemmatize_words, 'lemmatized_list.pkl')

CPU times: user 19.1 s, sys: 7.75 s, total: 26.9 s
Wall time: 2min 48s


In [37]:
# %%time
# tokens = pool_iter_series(tokens_list)

In [38]:
# %%time
# lemmatized = pool_imap(lemmatize_words, tokens)

In [39]:
# %%time
# lemmatized_list = list(lemmatized)

## DETECT PHRASES

In [42]:
%%time

phrases = Phrases(lemmatized_list)
bigram = Phraser(phrases)
trigram = Phrases(bigram[lemmatized_list])

CPU times: user 3min 12s, sys: 845 ms, total: 3min 13s
Wall time: 3min 17s


In [43]:
%%time

final_tokens_list = do_preprocess_step(lemmatized_list, get_phrases, 'final_tokens_list.pkl')



CPU times: user 21.5 s, sys: 12.4 s, total: 33.9 s
Wall time: 2min 22s


## CREATE DICTIONARY (ID2WORD)

In [44]:
final_tokens = pool_iter_series(final_tokens_list)

In [45]:
# Train dictionary:

dct = Dictionary(final_tokens)

In [46]:
# Filter dictionary (valid terms in bag of words/corpus):

dct.filter_extremes(no_below=10, no_above=0.3, keep_n=None)

In [47]:
dct.save('dct.dict')

In [48]:
dct = dct.load('dct.dict')

## CONVERT TEXT TO BAG-OF-WORDS (TOKENS TO ID)

In [50]:
%%time

final_tokens = pool_iter_series(final_tokens_list)

CPU times: user 10.8 ms, sys: 121 ms, total: 132 ms
Wall time: 264 ms


In [52]:
%%time

token2id = pool_imap(get_bow, final_tokens)

CPU times: user 9.45 s, sys: 22.1 s, total: 31.6 s
Wall time: 49.1 s


In [53]:
%%time

gensim.corpora.MmCorpus.serialize('corpus_bow.mm', token2id)

CPU times: user 8.19 s, sys: 450 ms, total: 8.64 s
Wall time: 9.01 s


In [54]:
%%time

mm_corpus = gensim.corpora.MmCorpus('corpus_bow.mm')
print(mm_corpus)

MmCorpus(10000 documents, 34192 features, 3803772 non-zero entries)
CPU times: user 3.45 ms, sys: 7.22 ms, total: 10.7 ms
Wall time: 14.4 ms


## PERFORM TFIDF TRANSFORMATION

In [55]:
%%time
tfidf_model = train_and_save_gensim_model("tfidf", mm_corpus, "tfidf.model")

CPU times: user 4.4 s, sys: 73.6 ms, total: 4.47 s
Wall time: 4.7 s


In [56]:
%%time

tfidf_corpus = pool_imap(transform_tfidf, mm_corpus)

CPU times: user 8.06 s, sys: 1.5 s, total: 9.56 s
Wall time: 18.4 s


In [57]:
%%time

gensim.corpora.MmCorpus.serialize('corpus_tfidf.mm', tfidf_corpus)

CPU times: user 16 s, sys: 206 ms, total: 16.2 s
Wall time: 19.6 s


In [58]:
%%time

print(tfidf_corpus)

MmCorpus(10000 documents, 34192 features, 3803772 non-zero entries)
CPU times: user 3.22 ms, sys: 8.78 ms, total: 12 ms
Wall time: 15.2 ms


## TRAIN TOPIC MODELS/DIMENSIONALITY REDUCERS

In [59]:
%%time

lsimodel_500 = train_and_save_gensim_model("lsi", tfidf_corpus, 'lsimodel_500.model', num_topics=500)

CPU times: user 45.8 s, sys: 2.39 s, total: 48.2 s
Wall time: 49.2 s


In [60]:
%%time

lsimodel_300 = train_and_save_gensim_model("lsi", tfidf_corpus, 'lsimodel_300.model', num_topics=300)

CPU times: user 29.5 s, sys: 1.15 s, total: 30.7 s
Wall time: 34.9 s


In [61]:
%%time

ldamodel_100 = train_and_save_gensim_model("lsi", tfidf_corpus, 'ldamodel_100.model', num_topics=100)

CPU times: user 17.7 s, sys: 738 ms, total: 18.4 s
Wall time: 20.1 s


In [62]:
%%time

ldamodel_50 = train_and_save_gensim_model("lsi", tfidf_corpus, 'ldamodel_50.model', num_topics=50)

CPU times: user 18.8 s, sys: 14.1 s, total: 32.9 s
Wall time: 41 s


In [64]:
%%time

hdpmodel = train_and_save_gensim_model("hdp", tfidf_corpus, 'hdpmodel.model')

CPU times: user 5min 40s, sys: 36.3 s, total: 6min 17s
Wall time: 5min


## CONVERT TOKENS TO TOPIC DISTRIBUTIONS

In [65]:
%%time

lsi_500_corpus = lsimodel_500[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lsi_500_corpus.mm', lsi_500_corpus)

CPU times: user 33.4 s, sys: 793 ms, total: 34.2 s
Wall time: 38.7 s


In [66]:
%%time

lsi_300_corpus = lsimodel_300[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lsi_300_corpus.mm', lsi_300_corpus)

CPU times: user 17.8 s, sys: 383 ms, total: 18.2 s
Wall time: 18.7 s


In [67]:
%%time

lda_100_corpus = ldamodel_100[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lda_100_corpus.mm', lda_100_corpus)

CPU times: user 10.3 s, sys: 318 ms, total: 10.6 s
Wall time: 12.2 s


In [68]:
%%time

lda_50_corpus = ldamodel_50[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lda_50_corpus.mm', lda_50_corpus)

CPU times: user 8.43 s, sys: 243 ms, total: 8.67 s
Wall time: 9.63 s


In [69]:
%%time

hdp_corpus = hdpmodel[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('hdp_corpus.mm', hdp_corpus)

CPU times: user 49 s, sys: 14.3 s, total: 1min 3s
Wall time: 47.7 s


## BUILT SIMILARITY INDICES FOR THE VARIOUS MODELS

In [290]:
%%time

lsi_500_indexer = train_and_save_indexer(lsi_500_corpus, 'lsi_500_indexer.model')

CPU times: user 24.6 s, sys: 4.25 s, total: 28.8 s
Wall time: 30.1 s


In [291]:
%%time

lsi_300_indexer = train_and_save_indexer(lsi_300_corpus, 'lsi_300_indexer.model')

CPU times: user 17.7 s, sys: 618 ms, total: 18.3 s
Wall time: 18.8 s


In [292]:
%%time

lda_50_indexer = train_and_save_indexer(lda_50_corpus, 'lda_50_indexer.model')

CPU times: user 12.2 s, sys: 4.88 s, total: 17 s
Wall time: 18.3 s


In [293]:
%%time

lda_100_indexer = train_and_save_indexer(lda_100_corpus, 'lda_100_indexer.model')

CPU times: user 11.4 s, sys: 344 ms, total: 11.8 s
Wall time: 12 s


In [294]:
%%time

hdp_indexer = train_and_save_indexer(hdp_corpus)

CPU times: user 52.4 s, sys: 3.52 s, total: 55.9 s
Wall time: 32.3 s


## RETRIEVE RECS FOR STORIES

In [305]:
def get_series_index(story_id):
    story_ids_map_dict = {story_id: series_index for series_index, story_id in enumerate(test_series.index)}
    return story_ids_map_dict[story_id]

def get_story_id(series_index):
    return test_series.index[series_index]

In [306]:
def get_sim_ids(story_id, corpus, indexer):
    series_index = get_series_index(story_id)
    vec = corpus[series_index]
    sims = indexer[vec]
    sim_indices = [sim_index for sim_index, sim_score in sims]
    return sim_indices

def fetch_story_titles_from_pgsql(story_id):
    conn = psycopg2.connect("host=localhost dbname=postgres user=postgres")
    cur = conn.cursor()
    cur.execute("SELECT title FROM bq_all WHERE id={};".format(story_id))
    title = cur.fetchone()[0]
    conn.commit
    return title

In [335]:
%%time

corpus_indexer_list= [
    ('lsi_500', lsi_500_corpus, lsi_500_indexer),
    ('lsi_300', lsi_500_corpus, lsi_500_indexer),
    ('lda_100', lda_100_corpus, lda_100_indexer),
    ('lda_50', lda_50_corpus, lda_50_indexer),
    ('hdp', hdp_corpus, hdp_indexer)
     ]

random_story_ids = np.random.choice(story_ids_list, size=10, replace=False)

given_stories = []
model_recs_dicts = []

for story_id in random_story_ids:
    model_recs = {}
    for model_name, corpus, indexer in corpus_indexer_list:
        sim_indices = get_sim_ids(story_id, corpus, indexer)
        rec_titles_list = [fetch_story_titles_from_pgsql(get_story_id(each_id)) for each_id in sim_indices]
        given_story_str = rec_titles_list[0]
        given_stories.append(given_story_str)
        rec_stories_list = rec_titles_list[1:]
        model_recs[model_name] = rec_stories_list
    model_recs_dicts.append(model_recs)    

CPU times: user 877 ms, sys: 264 ms, total: 1.14 s
Wall time: 3.47 s


In [336]:
%%time

story_recs_dataframes = [pd.DataFrame(model_recs_dicts[index]) for index in range(10)]

CPU times: user 12.8 ms, sys: 1.17 ms, total: 14 ms
Wall time: 13.9 ms


In [339]:
given_stories = given_stories[0::5]

## EVALUATE RECOMMENDATIONS

In [329]:
pd.set_option('max_colwidth', 200)

In [340]:
print("Given story: {}".format(given_stories[0]))
story_recs_dataframes[0]

Given story: The Art of Code Review: A Dropbox Story


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Read The Ultralight Startup by Jason Baptiste free for 24 hours,Show HN: Harp Platform now available to the public,Free Python ebook: Bayesian Methods for Hackers,Book Notes: Smart Choices – a Practical Guide to Making Better Life Decisions,Book Notes: Smart Choices – a Practical Guide to Making Better Life Decisions
1,Lizard Kids: A Long Trail of Fail,Dropbox blocks API access for Boxopus over piracy concerns,Python Machine Learning,Dropbox blocks API access for Boxopus over piracy concerns,Dropbox blocks API access for Boxopus over piracy concerns
2,Show HN: Wander – Travel by budget,Prototypes vs. MVPs,"Show HN: Hello Python, my book about learning python, has just been released",The Coming Dropbox Apocalypse,The Coming Dropbox Apocalypse
3,Growl notifications for GitHub updates,My experience of runnning Google Ads for Dropbox referrals,Python data tools just keep getting better,Show HN: Harp Platform now available to the public,Show HN: Harp Platform now available to the public
4,Ask HN: Idea March,Dropbox' Public/ folders will be phased out soon,The Python Standard Library - Where Modules Go To Die,Dropbox' Public/ folders will be phased out soon,Dropbox' Public/ folders will be phased out soon


In [341]:
print("Given story: {}".format(given_stories[1]))
story_recs_dataframes[1]

Given story: Bitcoin's value is decentralization


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,Inside the Bitcoin economy,Bitcoin Miners Signal Revolt Amid Sluggish Blockchain,The Big Book of Bitcoin - An introduction,The Big Book of Bitcoin - An introduction
1,Shark Tank Statistics,China bans new Bitcoin deposits,China bans new Bitcoin deposits,Inside the Bitcoin economy,Inside the Bitcoin economy
2,Firestr: encrypted P2P chat and Lua application platform,Bitcoins Fail Currency Test in Scandinavia’s Richest Nation,Inside the Bitcoin economy,Bitcoins Fail Currency Test in Scandinavia’s Richest Nation,Bitcoins Fail Currency Test in Scandinavia’s Richest Nation
3,Ask HN: Google warned me that a state organized hacking group targeted me,The Target Value for Bitcoin is $100K to $1M,Bitcoins Fail Currency Test in Scandinavia’s Richest Nation,Petitioning SIX Interbank Clearing: Include a symbol for Bitcoin in ISO 4217,Petitioning SIX Interbank Clearing: Include a symbol for Bitcoin in ISO 4217
4,Ask HN: Please critique my personal digital security strategy,Winklevoss Twins Aim to Take Bitcoin Mainstream with a Regulated Exchange,Bitcoin drops 50% overnight as China’s Biggest BTC exchange stops Yuan deposits,Bitcoin Miners Signal Revolt Amid Sluggish Blockchain,Bitcoin Miners Signal Revolt Amid Sluggish Blockchain


In [343]:
print("Given story: {}".format(given_stories[2]))
story_recs_dataframes[2]

Given story: Bjarne Stroustrup Discusses C++


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,C++ is a hack,C++ is a hack,"C++ Internals: STL Vector, Part I","C++ Internals: STL Vector, Part I"
1,Shark Tank Statistics,"Popular Myths about C++, Part 2",Why artificially limit your code to C?,Implementing a dynamically sized array in C,Implementing a dynamically sized array in C
2,Firestr: encrypted P2P chat and Lua application platform,Object-oriented techniques in C,Mu: making programs easier to understand in the large,"Popular Myths about C++, Part 2","Popular Myths about C++, Part 2"
3,Ask HN: Google warned me that a state organized hacking group targeted me,Mu: making programs easier to understand in the large,Loop Exits and Structured Programming: Reopening the Debate (1995) [pdf],"Covariance, Contravariance, and Super Type Constraints","Covariance, Contravariance, and Super Type Constraints"
4,Ask HN: Please critique my personal digital security strategy,"Go-internals: Chapter 2, “Interfaces” released","Popular Myths about C++, Part 2","Performance: SIMD, Vectorization and Performance Tuning [video]","Performance: SIMD, Vectorization and Performance Tuning [video]"


In [344]:
print("Given story: {}".format(given_stories[3]))
story_recs_dataframes[3]

Given story: The Moderately Enthusiastic Programmer


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,Ask HN: What do you do when your entire being opposes the task at hand?,Ask HN: What do you do when your entire being opposes the task at hand?,I don’t want to be a software developer anymore,I don’t want to be a software developer anymore
1,Shark Tank Statistics,On Funemployment (and My Next Job),Why your programmers just want to code,Why your programmers just want to code,Why your programmers just want to code
2,Firestr: encrypted P2P chat and Lua application platform,Practical tips for writing inclusive job ads (2016),On Funemployment (and My Next Job),Practical tips for writing inclusive job ads (2016),Practical tips for writing inclusive job ads (2016)
3,Ask HN: Google warned me that a state organized hacking group targeted me,Old Geek,Only code at work? That doesn’t make you a worse programmer,I didn’t fall in love with coding,I didn’t fall in love with coding
4,Ask HN: Please critique my personal digital security strategy,About the Penny Arcade Job Posting,Ask HN: Started a new job and their existing code sucks. What to do?,"The ""Spanish Theory"" of software project management","The ""Spanish Theory"" of software project management"


In [345]:
print("Given story: {}".format(given_stories[4]))
story_recs_dataframes[4]

Given story: Wget Arbitrary Commands Execution


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,RPGFS: Crossing an RPG with a Unix Filesystem,RPGFS: Crossing an RPG with a Unix Filesystem,cURL 7.50.2 released,cURL 7.50.2 released
1,Shark Tank Statistics,Interesting commands for the Linux shell,Show HN: View and diff files in ZFS snapshots,RPGFS: Crossing an RPG with a Unix Filesystem,RPGFS: Crossing an RPG with a Unix Filesystem
2,Firestr: encrypted P2P chat and Lua application platform,Javascript RAR reader,Zip Bomb,Show HN: immut.io – an immutable blob store,Show HN: immut.io – an immutable blob store
3,Ask HN: Google warned me that a state organized hacking group targeted me,Entr(1) – Run tests whenever files change,Entr(1) – Run tests whenever files change,A collection of Unix terminal/console/curses tools,A collection of Unix terminal/console/curses tools
4,Ask HN: Please critique my personal digital security strategy,Rob Pike: the origin of dotfiles,Show HN: YouTransfer – Self-hosted file sharing,"HTTPie is a command line HTTP client, a user-friendly cURL replacement","HTTPie is a command line HTTP client, a user-friendly cURL replacement"


In [346]:
print("Given story: {}".format(given_stories[5]))
story_recs_dataframes[5]

Given story: Google DeepMind AI destroys human expert in lip reading competition


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,Google’s DeepMind Achieves Speech-Generation Breakthrough,Yale researchers reconstruct facial images locked in a viewer’s mind,Google’s DeepMind Achieves Speech-Generation Breakthrough,Google’s DeepMind Achieves Speech-Generation Breakthrough
1,Shark Tank Statistics,Yale researchers reconstruct facial images locked in a viewer’s mind,Google & Stanford create a digital brain that learns to identify a human face,Generating Factoid Questions with RNNs: The 30M Factoid Question-Answer Corpus,Generating Factoid Questions with RNNs: The 30M Factoid Question-Answer Corpus
2,Firestr: encrypted P2P chat and Lua application platform,Deep Voice: Real-Time Neural Text-To-Speech,Deep Voice: Real-Time Neural Text-To-Speech,Making a Video Course – Behind the Scenes,Making a Video Course – Behind the Scenes
3,Ask HN: Google warned me that a state organized hacking group targeted me,Generating Factoid Questions with RNNs: The 30M Factoid Question-Answer Corpus,Google’s DeepMind Achieves Speech-Generation Breakthrough,Neural nets more accurate than humans at detecting sexual orientation in images,Neural nets more accurate than humans at detecting sexual orientation in images
4,Ask HN: Please critique my personal digital security strategy,Amazon Transcribe - Automatic speech recognition,Explainable Artificial Intelligence (XAI) Darpa Funding,Amazon Transcribe - Automatic speech recognition,Amazon Transcribe - Automatic speech recognition


In [347]:
print("Given story: {}".format(given_stories[6]))
story_recs_dataframes[6]

Given story: Jupyter, Mathematica, and the Future of the Research Paper


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,JupyterLab is ready for users,SpaCy: Industrial-strength NLP with Python and Cython,Celebrating Mathematica’s First Quarter Century,Celebrating Mathematica’s First Quarter Century
1,Shark Tank Statistics,Celebrating Mathematica’s First Quarter Century,The Making of SciPy 1.0,JupyterLab is ready for users,JupyterLab is ready for users
2,Firestr: encrypted P2P chat and Lua application platform,The Making of SciPy 1.0,JupyterLab is ready for users,Wolfram Language,Wolfram Language
3,Ask HN: Google warned me that a state organized hacking group targeted me,Starting data analysis with R: Things I wish I'd been told,Celebrating Mathematica’s First Quarter Century,An inside look at Stephen Wolfram's computational paradigm,An inside look at Stephen Wolfram's computational paradigm
4,Ask HN: Please critique my personal digital security strategy,How to Read a Research Paper [pdf],You shouldn’t use a spreadsheet for important work,I Wrote a Book to Teach the Wolfram Language,I Wrote a Book to Teach the Wolfram Language


In [348]:
print("Given story: {}".format(given_stories[7]))
story_recs_dataframes[7]

Given story: Procedural Modelling of Buildings [pdf]


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Crawling BitTorrent DHTs for Fun and Profit (2010) [pdf],Arxiv Sanity Preserver,Arxiv Sanity Preserver,How to Read a Research Paper [pdf],How to Read a Research Paper [pdf]
1,Django 1.4 release candidate 2,Best Papers vs. Top Cited Papers in Computer Science,How to Read a Research Paper [pdf],Best Papers vs. Top Cited Papers in Computer Science,Best Papers vs. Top Cited Papers in Computer Science
2,"Show HN: BOX'NGO, an online buying and selling platform for students",Great Works in Programming Languages,How did Andrew Ng become so good at Machine Learning?,Arxiv Sanity Preserver,Arxiv Sanity Preserver
3,"Remote Jobs, but WHERE?",How to Read a Research Paper [pdf],Great Works in Programming Languages,The Growing Impact of Old Scientific Papers,The Growing Impact of Old Scientific Papers
4,"Ask HN: Just made Director, now what?",The Growing Impact of Old Scientific Papers,Best Papers vs. Top Cited Papers in Computer Science,Great Works in Programming Languages,Great Works in Programming Languages


In [349]:
print("Given story: {}".format(given_stories[8]))
story_recs_dataframes[8]

Given story: 40,000-year-old bracelet made by extinct human species found (2015)


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,Another person that discovered evolution besides Darwin,Another person that discovered evolution besides Darwin,Thigh Bone Suggests Ancient and Modern Humans Overlapped,Thigh Bone Suggests Ancient and Modern Humans Overlapped
1,Shark Tank Statistics,"Pom-pom crabs fight over tiny anemones, which they hold like boxing gloves","Pom-pom crabs fight over tiny anemones, which they hold like boxing gloves","In Neanderthal DNA, Signs of a Mysterious Human Migration","In Neanderthal DNA, Signs of a Mysterious Human Migration"
2,Firestr: encrypted P2P chat and Lua application platform,Greenland shark found to be at least 272 years old,"To Lions, Zebras Are Mostly Gray",A golden age of ancient DNA science begins,A golden age of ancient DNA science begins
3,Ask HN: Google warned me that a state organized hacking group targeted me,"All by Itself, the Humble Sweet Potato Colonized the World: Study",Stanford researcher declares that the sixth mass extinction is here,"Culture, mathematical models, and Neandertal extinction","Culture, mathematical models, and Neandertal extinction"
4,Ask HN: Please critique my personal digital security strategy,How and why did humans domesticate animals?,Survival of the Friendliest,What Happened to the Hominids Who May Have Been Smarter Than Us?,What Happened to the Hominids Who May Have Been Smarter Than Us?


In [350]:
print("Given story: {}".format(given_stories[9]))
story_recs_dataframes[9]

Given story: Schools Are Slow to Learn That Sleep Deprivation Hits Teenagers Hardest


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Programming Language Analysis,Why some people respond to stress by falling asleep,How I got into MIT when I was 14,The myth of the eight-hour sleep,The myth of the eight-hour sleep
1,Shark Tank Statistics,"Never mind talent: Practice, practice, practice",How Would You Do on the New SAT?,"Productive on six hours of sleep? You’re deluding yourself, expert says","Productive on six hours of sleep? You’re deluding yourself, expert says"
2,Firestr: encrypted P2P chat and Lua application platform,Existential Depression in Gifted Children and Adults (2009) [pdf],Ask HN: Allowance for kids?,When you have to wake up earlier than usual,When you have to wake up earlier than usual
3,Ask HN: Google warned me that a state organized hacking group targeted me,"Can 10,000 hours of practice make you an expert?",Existential Depression in Gifted Children and Adults (2009) [pdf],Ask HN: How many hours of sleep are you getting?,Ask HN: How many hours of sleep are you getting?
4,Ask HN: Please critique my personal digital security strategy,When you have to wake up earlier than usual,How Does Your Kindergarten Classroom Affect Your Earnings?,"The Uberwomen Who Beat Sleep: Origin of the ""Uberman"" polyphasic sleep schedule","The Uberwomen Who Beat Sleep: Origin of the ""Uberman"" polyphasic sleep schedule"
