In [1]:
from joblib import Parallel, delayed, cpu_count

import psycopg2

import pandas as pd
import pandas.io.sql as sqlio

import funcy as fp

import re

from bs4 import BeautifulSoup

from textacy.preprocess import preprocess_text, replace_numbers, replace_phone_numbers, replace_urls

import nltk
from nltk.stem import WordNetLemmatizer 

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from multiprocessing import Pool
import numpy as np

import pattern3

import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, LdaMulticore, TfidfModel
from gensim.models.wrappers import LdaMallet
from gensim.similarities import Similarity, MatrixSimilarity, SparseMatrixSimilarity

# import pyLDAvis.gensim
import matplotlib.pyplot as plt

from gensim.test.utils import get_tmpfile
from gensim.matutils import hellinger
from gensim.utils import to_utf8, tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import strip_tags, preprocess_string, remove_stopwords, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_numeric, strip_non_alphanum

import pickle

In [2]:
%%time

conn = psycopg2.connect("host=localhost dbname=postgres user=postgres")

contents_df = sqlio.read_sql_query("SELECT id, all_text, article_content FROM bq_all WHERE all_text IS NOT NULL AND all_text != '' AND article_content IS NOT NULL AND article_content != '' ORDER BY score DESC, story_time DESC LIMIT 25000", conn, index_col='id')

CPU times: user 1.22 s, sys: 1.63 s, total: 2.86 s
Wall time: 21.2 s


In [3]:
%%time

contents_df["all_text"] += contents_df["article_content"]

CPU times: user 503 ms, sys: 427 ms, total: 930 ms
Wall time: 930 ms


In [4]:
test_series = contents_df["all_text"]

In [5]:
def iter_series(series):
    extracted = 0
    for doc in series:
        yield doc
        extracted += 1
        
def pool_iter_series(series):
    pool = Pool()

    series = iter_series(series)

    pool.close()
    pool.join()
    
    return series

In [6]:
# Define function to escape HTML characters, if any:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Define function to replace contractions in string of text, if any:
def clean_text(text):
    text = strip_html(text)
    return replace_numbers(replace_urls(preprocess_text(text, fix_unicode=True, no_accents=True, no_contractions=True, lowercase=True, no_punct=True, no_currency_symbols=True), replace_with=' '), replace_with='')

def tokenize_text(text):
    return list(tokenize(clean_text(text.replace('/n', ' ')).replace('.com', ' ').replace('.org', ' ').replace('.net', ' ')))

In [7]:
STOP_WORDS = list(STOP_WORDS)
STOP_WORDS.append('http')
STOP_WORDS.append('www')

def lemmatize_words(words_list):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(wnl.lemmatize(wnl.lemmatize(word, 'a'), 'v'), 'n') for word in words_list if len(word) >= 3]

In [8]:
def pool_imap(function, generator):
    pool = Pool()

    result_generator = pool.imap(function, generator)

    pool.close()
    pool.join()
    
    return result_generator

In [9]:
def get_phrases(doc_stream):
    return [word for word in trigram[bigram[doc_stream]] if len(word) >= 3]

In [10]:
def save_pickle(file_name, object_to_pickle):
    with open(file_name, 'wb') as f:
        pickle.dump(object_to_pickle, f)
        
def load_pickle(file_name):
    with open(file_name, 'rb') as f:
        unpickled_object = pickle.load(f)
    return unpickled_object

In [11]:
def do_preprocess_step(iterable_input, function, file_name):
    docs = pool_iter_series(iterable_input)
    processed_docs = pool_imap(function, docs)
    docs_list = list(processed_docs)
    save_pickle(file_name, docs_list)
    docs_list = load_pickle(file_name)
    return docs_list

In [12]:
def get_bow(tokens):
    return dct.doc2bow(tokens)

In [13]:
def train_and_save_gensim_model(model_type_str, corpus, file_name='model_300.model', num_topics=None):
    if model_type_str == "tfidf":
        model = TfidfModel(corpus=corpus, id2word=dct)
    elif model_type_str == "lsi":
        model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dct)
    elif model_type_str == "lda":
        model = LdaModel(corpus=corpus, alpha='auto', num_topics=num_topics, id2word=dct)
    elif model_type_str == "hdp":
        model = HdpModel(corpus=corpus, id2word=dct)
    model.save(file_name)
    return model

In [14]:
def transform_tfidf(bow):
    return tfidf_model[bow]

In [15]:
def train_and_save_indexer(corpus, file_name='model_100_indexer.model'):
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp, corpus=corpus, num_features=len(dct), num_best=6)
    indexer.save(file_name)
    return indexer

## CLEAN AND TOKENIZE TEXTS

In [16]:
%%time
tokens_list = do_preprocess_step(test_series, tokenize_text, 'tokens_list.pkl')

CPU times: user 2min 10s, sys: 2min 45s, total: 4min 56s
Wall time: 25min 50s


## LEMMATIZE TEXTS

In [17]:
%%time
lemmatized_list = do_preprocess_step(tokens_list, lemmatize_words, 'lemmatized_list.pkl')

CPU times: user 2min 32s, sys: 3min 57s, total: 6min 30s
Wall time: 24min 39s


## DETECT PHRASES

In [18]:
%%time

phrases = Phrases(lemmatized_list)
bigram = Phraser(phrases)
trigram = Phrases(bigram[lemmatized_list])

CPU times: user 13min 34s, sys: 27.7 s, total: 14min 2s
Wall time: 14min 17s


In [19]:
%%time

final_tokens_list = do_preprocess_step(lemmatized_list, get_phrases, 'final_tokens_list.pkl')



CPU times: user 2min 20s, sys: 4min 32s, total: 6min 52s
Wall time: 18min 47s


## CREATE DICTIONARY (ID2WORD)

In [20]:
%%time

final_tokens = pool_iter_series(final_tokens_list)

CPU times: user 10.8 ms, sys: 2.18 s, total: 2.19 s
Wall time: 2.66 s


In [21]:
%%time

# Train dictionary:

dct = Dictionary(final_tokens)

CPU times: user 1min 10s, sys: 9.45 s, total: 1min 19s
Wall time: 1min 22s


In [22]:
%%time

# Filter dictionary (valid terms in bag of words/corpus):

dct.filter_extremes(no_below=10, no_above=0.3, keep_n=None)

CPU times: user 1.33 s, sys: 928 ms, total: 2.25 s
Wall time: 2.32 s


In [23]:
%%time

dct.save('dct.dict')

CPU times: user 33.7 ms, sys: 24.8 ms, total: 58.4 ms
Wall time: 93.3 ms


In [24]:
%%time

dct = dct.load('dct.dict')

CPU times: user 44.8 ms, sys: 19.6 ms, total: 64.4 ms
Wall time: 79.3 ms


## CONVERT TEXT TO BAG-OF-WORDS (TOKENS TO ID)

In [25]:
%%time

final_tokens = pool_iter_series(final_tokens_list)

CPU times: user 10.2 ms, sys: 2.17 s, total: 2.18 s
Wall time: 2.34 s


In [26]:
%%time

token2id = pool_imap(get_bow, final_tokens)

CPU times: user 40 s, sys: 2min 6s, total: 2min 46s
Wall time: 4min 59s


In [27]:
%%time

gensim.corpora.MmCorpus.serialize('corpus_bow.mm', token2id)

CPU times: user 21.1 s, sys: 2.63 s, total: 23.7 s
Wall time: 25 s


In [28]:
%%time

mm_corpus = gensim.corpora.MmCorpus('corpus_bow.mm')
print(mm_corpus)

MmCorpus(25000 documents, 86410 features, 14003923 non-zero entries)
CPU times: user 6 ms, sys: 12.8 ms, total: 18.8 ms
Wall time: 36.1 ms


## PERFORM TFIDF TRANSFORMATION

In [29]:
%%time
tfidf_model = train_and_save_gensim_model("tfidf", mm_corpus, "tfidf.model")

CPU times: user 11.4 s, sys: 178 ms, total: 11.5 s
Wall time: 11.7 s


In [30]:
%%time

tfidf_corpus = pool_imap(transform_tfidf, mm_corpus)

CPU times: user 43.6 s, sys: 1min 38s, total: 2min 22s
Wall time: 3min 55s


In [31]:
%%time

gensim.corpora.MmCorpus.serialize('corpus_tfidf.mm', tfidf_corpus)

CPU times: user 34.7 s, sys: 2.71 s, total: 37.4 s
Wall time: 38.3 s


In [32]:
%%time

tfidf_corpus = gensim.corpora.MmCorpus('corpus_tfidf.mm')
print(tfidf_corpus)

MmCorpus(25000 documents, 86410 features, 14003923 non-zero entries)
CPU times: user 6.23 ms, sys: 15.6 ms, total: 21.8 ms
Wall time: 50.8 ms


## TRAIN TOPIC MODELS/DIMENSIONALITY REDUCERS

In [33]:
%%time

lsimodel_500 = train_and_save_gensim_model("lsi", tfidf_corpus, 'lsimodel_500.model', num_topics=500)

CPU times: user 2min 48s, sys: 1min 42s, total: 4min 31s
Wall time: 5min 19s


In [34]:
%%time

lsimodel_300 = train_and_save_gensim_model("lsi", tfidf_corpus, 'lsimodel_300.model', num_topics=300)

CPU times: user 1min 32s, sys: 4.84 s, total: 1min 37s
Wall time: 1min 17s


In [35]:
%%time

ldamodel_100 = train_and_save_gensim_model("lsi", tfidf_corpus, 'ldamodel_100.model', num_topics=100)

CPU times: user 51 s, sys: 2.25 s, total: 53.3 s
Wall time: 45.2 s


In [36]:
%%time

ldamodel_50 = train_and_save_gensim_model("lsi", tfidf_corpus, 'ldamodel_50.model', num_topics=50)

CPU times: user 1min 1s, sys: 1min 32s, total: 2min 34s
Wall time: 3min 50s


In [37]:
%%time

hdpmodel = train_and_save_gensim_model("hdp", tfidf_corpus, 'hdpmodel.model')

CPU times: user 14min 21s, sys: 10min 25s, total: 24min 46s
Wall time: 27min 53s


## CONVERT TOKENS TO TOPIC DISTRIBUTIONS

In [38]:
%%time

lsi_500_corpus = lsimodel_500[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lsi_500_corpus.mm', lsi_500_corpus)

CPU times: user 1min 13s, sys: 1min 25s, total: 2min 38s
Wall time: 3min 56s


In [39]:
%%time

lsi_300_corpus = lsimodel_300[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lsi_300_corpus.mm', lsi_300_corpus)

CPU times: user 41.3 s, sys: 1.09 s, total: 42.4 s
Wall time: 43.2 s


In [40]:
%%time

lda_100_corpus = ldamodel_100[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lda_100_corpus.mm', lda_100_corpus)

CPU times: user 25.2 s, sys: 553 ms, total: 25.7 s
Wall time: 25.9 s


In [41]:
%%time

lda_50_corpus = ldamodel_50[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('lda_50_corpus.mm', lda_50_corpus)

CPU times: user 38.3 s, sys: 1min 24s, total: 2min 3s
Wall time: 3min 22s


In [42]:
%%time

hdp_corpus = hdpmodel[tfidf_corpus]
gensim.corpora.MmCorpus.serialize('hdp_corpus.mm', hdp_corpus)

CPU times: user 6min 48s, sys: 9min 53s, total: 16min 42s
Wall time: 23min 25s


## BUILT SIMILARITY INDICES FOR THE VARIOUS MODELS

In [43]:
%%time

lsi_500_indexer = train_and_save_indexer(lsi_500_corpus, 'lsi_500_indexer.model')

CPU times: user 1min 40s, sys: 2min 56s, total: 4min 36s
Wall time: 7min 28s


In [44]:
%%time

lsi_300_indexer = train_and_save_indexer(lsi_300_corpus, 'lsi_300_indexer.model')

CPU times: user 1min 7s, sys: 1min 27s, total: 2min 35s
Wall time: 3min 57s


In [45]:
%%time

lda_50_indexer = train_and_save_indexer(lda_50_corpus, 'lda_50_indexer.model')

CPU times: user 46.2 s, sys: 1min 26s, total: 2min 12s
Wall time: 3min 34s


In [46]:
%%time

lda_100_indexer = train_and_save_indexer(lda_100_corpus, 'lda_100_indexer.model')

CPU times: user 51.4 s, sys: 1min 26s, total: 2min 17s
Wall time: 3min 39s


In [47]:
%%time

hdp_indexer = train_and_save_indexer(hdp_corpus)

CPU times: user 7min 27s, sys: 11min 24s, total: 18min 52s
Wall time: 26min 57s


## RETRIEVE RECS FOR STORIES

In [48]:
def get_series_index(story_id):
    story_ids_map_dict = {story_id: series_index for series_index, story_id in enumerate(test_series.index)}
    return story_ids_map_dict[story_id]

def get_story_id(series_index):
    return test_series.index[series_index]

In [49]:
def get_sim_ids(story_id, corpus, indexer):
    series_index = get_series_index(story_id)
    vec = corpus[series_index]
    sims = indexer[vec]
    sim_indices = [sim_index for sim_index, sim_score in sims]
    return sim_indices

def fetch_story_titles_from_pgsql(story_id):
    conn = psycopg2.connect("host=localhost dbname=postgres user=postgres")
    cur = conn.cursor()
    cur.execute("SELECT title FROM bq_all WHERE id={};".format(story_id))
    title = cur.fetchone()[0]
    conn.commit
    return title

In [53]:
story_ids_list = test_series.index.values

In [55]:
%%time

story_ids_list

corpus_indexer_list= [
    ('lsi_500', lsi_500_corpus, lsi_500_indexer),
    ('lsi_300', lsi_500_corpus, lsi_500_indexer),
    ('lda_100', lda_100_corpus, lda_100_indexer),
    ('lda_50', lda_50_corpus, lda_50_indexer),
    ('hdp', hdp_corpus, hdp_indexer)
     ]

random_story_ids = np.random.choice(story_ids_list, size=10, replace=False)

given_stories = []
model_recs_dicts = []

for story_id in random_story_ids:
    model_recs = {}
    for model_name, corpus, indexer in corpus_indexer_list:
        sim_indices = get_sim_ids(story_id, corpus, indexer)
        rec_titles_list = [fetch_story_titles_from_pgsql(get_story_id(each_id)) for each_id in sim_indices]
        given_story_str = rec_titles_list[0]
        given_stories.append(given_story_str)
        rec_stories_list = rec_titles_list[1:]
        model_recs[model_name] = rec_stories_list
    model_recs_dicts.append(model_recs)    

CPU times: user 3.46 s, sys: 889 ms, total: 4.35 s
Wall time: 6.85 s


In [69]:
random_story_ids

array([ 6755722,  6936672,  8204007, 16485975,  6101970, 15724609,
       15326648, 10539100,  3640830, 11291076])

In [56]:
%%time

story_recs_dataframes = [pd.DataFrame(model_recs_dicts[index]) for index in range(10)]

CPU times: user 53.7 ms, sys: 15.7 ms, total: 69.4 ms
Wall time: 55.7 ms


In [57]:
given_stories = given_stories[0::5]

## EVALUATE RECOMMENDATIONS

In [58]:
pd.set_option('max_colwidth', 200)

In [59]:
print("Given story: {}".format(given_stories[0]))
story_recs_dataframes[0]

Given story: Visual Cryptography


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Q – Initiative to build commercially available universal quantum computers,Image Dithering: Eleven Algorithms and Source Code,Image Dithering: Eleven Algorithms and Source Code,"In CSS, “px” is not an angular measurement and it is not non-linear","In CSS, “px” is not an angular measurement and it is not non-linear"
1,Grand Theft Auto: San Andreas Streaming Deer Cam,What every coder should know about gamma,3D scatterplot of an image,Pixel-fitting – how antialiasing can ruin your logos and icons,Pixel-fitting – how antialiasing can ruin your logos and icons
2,Startup NASA,"Pillow-SIMD – Fast, production-ready image resize for x86",Dither stabilisation,Rotating Images,Rotating Images
3,Google Earth Pro is now free,Dither stabilisation,Accurate CRT Simulation,Voronoi Diagrams on the GPU,Voronoi Diagrams on the GPU
4,Facebook Changed Everyone's Email to Facebook.com; Here's How to Fix It,Beware of Transparent Pixels,What every coder should know about gamma,Easy Scalable Text Rendering on the GPU,Easy Scalable Text Rendering on the GPU


In [60]:
print("Given story: {}".format(given_stories[1]))
story_recs_dataframes[1]

Given story: I wrote the Anarchist Cookbook in 1969. Now I see its premise as flawed


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,The FBI Can Now Legally Hack Everywhere Around the World,Harvard and the Making of the Unabomber (2000),Harvard and the Making of the Unabomber (2000),Most violence in the world is motivated by moral sentiments,Most violence in the world is motivated by moral sentiments
1,"Level3 is without peer, now what to do?",US bans students from “blacklisted” countries from getting access to Coursera,The Corrosion of High School Debate,"In Wake of Charlie Hebdo Attack, Let’s Not Sacrifice Even More Rights","In Wake of Charlie Hebdo Attack, Let’s Not Sacrifice Even More Rights"
2,China orders Bitcoin exchanges in capital city to close,Most violence in the world is motivated by moral sentiments,US bans students from “blacklisted” countries from getting access to Coursera,Harvard and the Making of the Unabomber (2000),Harvard and the Making of the Unabomber (2000)
3,How kids in a low-income country use laptops: lessons from Madagascar,Our Newfound Fear of Risk,Noam Chomsky Joins Faculty at University of Arizona,My new favorite book of all time,My new favorite book of all time
4,Toys ‘R’ Us Plans Bankruptcy Filing Amid Debt Struggle,"The Day Mandela Was Arrested, With A Little Help From the CIA",In College and Hiding from Scary Ideas,Mission Creep: When Everything Is Terrorism,Mission Creep: When Everything Is Terrorism


In [61]:
print("Given story: {}".format(given_stories[2]))
story_recs_dataframes[2]

Given story: fork() can fail


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Plasma Mobile – Turns your phone into a fully open hacking device,A subsystem to restrict programs into a “reduced feature operating model”,Strace – My Favourite Secret Weapon (2011),Why Ruby App Servers Break on MacOS High Sierra,Why Ruby App Servers Break on MacOS High Sierra
1,Main is usually a function – when is it not?,Unraveling rm: what happens when you run it?,Kill init by touching a bunch of files,Writing a Unix Shell,Writing a Unix Shell
2,FFS SSL,The Collapse of the Unix Philosophy,Apt: please make the moo reproducible,The Collapse of the Unix Philosophy,The Collapse of the Unix Philosophy
3,Bugs found in GCC with the help of PVS-Studio,Linux Local Privilege Escalation via SUID /proc/pid/mem Write,How setting the TZ environment variable avoids thousands of system calls,A subsystem to restrict programs into a “reduced feature operating model”,A subsystem to restrict programs into a “reduced feature operating model”
4,10M Concurrent Websockets,How setting the TZ environment variable avoids thousands of system calls,The Collapse of the Unix Philosophy,The tale of aux.c,The tale of aux.c


In [62]:
print("Given story: {}".format(given_stories[3]))
story_recs_dataframes[3]

Given story: Spotify Form F-1


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Ideas,Spotify is gaining leverage over record labels,Spotify is gaining leverage over record labels,Spotify is gaining leverage over record labels,Spotify is gaining leverage over record labels
1,iFixit App Pulled from Apple’s Store,"Pandora to Buy Rdio Assets for $75M, Rdio Files Ch.11, Will Shutter Service","Pandora to Buy Rdio Assets for $75M, Rdio Files Ch.11, Will Shutter Service",Spotify files for its IPO,Spotify files for its IPO
2,Dead Coins – A list of dead cryptocurrencies,Spotify’s Discover Weekly: How machine learning finds new music,"Spotify opens on NYSE, valuing company at almost $30B","Spotify opens on NYSE, valuing company at almost $30B","Spotify opens on NYSE, valuing company at almost $30B"
3,What I’ve learned in 5 years of running a SaaS,Why Spotify Pays So Little,What I've been up to for the past year,Lessons from Spotify,Lessons from Spotify
4,The One-Person Product,Koel: A personal music streaming server,Why Spotify Pays So Little,Spotify raises $1B in debt with devilish terms to fight Apple Music,Spotify raises $1B in debt with devilish terms to fight Apple Music


In [63]:
print("Given story: {}".format(given_stories[4]))
story_recs_dataframes[4]

Given story: Light completely stopped for a record-breaking minute


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Distributed Consensus – How Shared Data Is Stored,Transfer of atomic mass with a photon solves the momentum paradox of light,Researchers quantum teleport particle of light six kilometres,Scientists unveil new form of matter: Time crystals,Scientists unveil new form of matter: Time crystals
1,Deny PRISM - Generate a PRISM involvement denial statement for your company,Scientists unveil new form of matter: Time crystals,Scientists unveil new form of matter: Time crystals,Physicists Create World’s First Time Crystal,Physicists Create World’s First Time Crystal
2,Paralyzed Man Uses Brain Implant to Type Eight Words per Minute,People can sense single photons,"Loophole-free Bell test ‘Spooky action at a distance’, no cheating","MIT discovers a new state of matter, a new kind of magnetism","MIT discovers a new state of matter, a new kind of magnetism"
3,Congress has added CISA to the federal budget bill,'Zeno effect' verified: Atoms won't move while you watch,Magnetic wormhole created for first time,ALPHA observes light spectrum of antimatter for first time,ALPHA observes light spectrum of antimatter for first time
4,Sweden 'a close partner' in NSA surveillance,"Loophole-free Bell test ‘Spooky action at a distance’, no cheating",EmDrive study officially published,Superconductivity without cooling?,Superconductivity without cooling?


In [64]:
print("Given story: {}".format(given_stories[5]))
story_recs_dataframes[5]

Given story: $3 Trillion in Forgotten Debt


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Galaxy Magazine: Free Texts,Fed Ends Zero-Rate Era,Bank of Canada increases overnight rate target to 1 per cent,Croatia just canceled the debts of its poorest citizens,Croatia just canceled the debts of its poorest citizens
1,The Copyright Office Belongs in a Library,The Era of Very Low Inflation and Interest Rates May Be Near an End,Left with Nothing,Anthony Bourdain on not having debt,Anthony Bourdain on not having debt
2,Comcast accused of cutting competitor’s wires to put it out of business,Bank of Canada increases overnight rate target to 1 per cent,Anthony Bourdain on not having debt,Why Don't People Manage Debt Better?,Why Don't People Manage Debt Better?
3,AlphaGo Beats Lee Sedol in Final Game,Finance is Not the Economy,The Era of Very Low Inflation and Interest Rates May Be Near an End,That Debt from 1720? Britain’s Payment Is Coming,That Debt from 1720? Britain’s Payment Is Coming
4,Reasons not to use Facebook,That Debt from 1720? Britain’s Payment Is Coming,Fed Ends Zero-Rate Era,Occupy Wall Street activists buy $15m of Americans' personal debt,Occupy Wall Street activists buy $15m of Americans' personal debt


In [65]:
print("Given story: {}".format(given_stories[6]))
story_recs_dataframes[6]

Given story: Windows 3.1 All Over Again


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,Qt binding for Go with support for all major operating systems,Pony – High Performance Actor Programming,Gotchas from Two Years with Node,The Unix Philosophy and Elixir as an Alternative to Go,The Unix Philosophy and Elixir as an Alternative to Go
1,"ORWL – The first open source, physically secure computer",Erlang Scheduler Details and Why They Matter,The Way of the Gopher: Making the Switch from Node.js to Golang,How Discord Scaled Elixir to 5M Concurrent Users,How Discord Scaled Elixir to 5M Concurrent Users
2,Benjamin Button Reviews the New MacBook Pro,The Unix Philosophy and Elixir as an Alternative to Go,"Beyond 10,000 Lines: Lessons Learned from a Large Phoenix Project",Comparing Elixir and Go,Comparing Elixir and Go
3,A Prettier JavaScript Formatter,Why Erlang Matters,Elixir 1.5 released,Stuff Goes Bad: Erlang in Anger (2016),Stuff Goes Bad: Erlang in Anger (2016)
4,Linux page table isolation is not needed on AMD processors,How I Start: Erlang,Elixir web development 101: collaborative todolist with realtime updates,What's all this fuss about Erlang? (2007),What's all this fuss about Erlang? (2007)


In [66]:
print("Given story: {}".format(given_stories[7]))
story_recs_dataframes[7]

Given story: Large Companies Game H-1B Visa Program


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,The Madness of Airline Élite Status,New H1-B Visa bill doubles the salary requirements to $130K/yr,New H1-B Visa bill doubles the salary requirements to $130K/yr,Gaming the H-1B system for good,Gaming the H-1B system for good
1,"Federal Prosecutors, in a Policy Shift, Cite Warrantless Wiretaps as Evidence",Gaming the H-1B system for good,H-1B Visas: U.S. Lawmaker Re-Introduces Bill to Tighten Rules,New H1-B Visa bill doubles the salary requirements to $130K/yr,New H1-B Visa bill doubles the salary requirements to $130K/yr
2,Books I read this year,H-1B Visas: U.S. Lawmaker Re-Introduces Bill to Tighten Rules,Gaming the H-1B system for good,Show HN: A histogram of salaries given to H1B workers in the software industry,Show HN: A histogram of salaries given to H1B workers in the software industry
3,Inside the NSA's War on Internet Security,Show HN: A histogram of salaries given to H1B workers in the software industry,Show HN: A histogram of salaries given to H1B workers in the software industry,H-1B Visas: U.S. Lawmaker Re-Introduces Bill to Tighten Rules,H-1B Visas: U.S. Lawmaker Re-Introduces Bill to Tighten Rules
4,U.S. Election Agency Breached by Hackers After November Vote,Judge sends two to prison for 7 years for H-1B fraud,IBM: The cost difference is too great for business not to look for H1B workers,"Immigration is about talent, not costs","Immigration is about talent, not costs"


In [67]:
print("Given story: {}".format(given_stories[8]))
story_recs_dataframes[8]

Given story: Hacker Monthly: best of the Internet, printed out, and it’s turning a profit


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,The Music Industry Shouldn't Be Able to Cut Off Your Internet Access,Making Your Writing Work Harder For You,Award-Winning Nautilus Enters Rough Waters,Award-Winning Nautilus Enters Rough Waters,Award-Winning Nautilus Enters Rough Waters
1,Japanese Vending Machines at Night Juxtaposed with a Wintry Hokkaido Landscape,O’Reilly Media Has Lost Its Soul,Making Your Writing Work Harder For You,Show HN: The first issue of Compelling Science Fiction,Show HN: The first issue of Compelling Science Fiction
2,"The Huge, Unseen Operation Behind the Accuracy of Google Maps",Anatomy of a Hoax (Sony Nexus phone),Anatomy of a Hoax (Sony Nexus phone),A Letter from the Publisher of Nautilus,A Letter from the Publisher of Nautilus
3,How an unsigned rapper changed music,Dear Al-Jazeera: Why Steal Our Code?,Medium releases Memberships,Medium releases Memberships,Medium releases Memberships
4,The Invention of the AeroPress,"Farewell, Dr. Dobb's","The Oatmeal Fights Legal Threat, Raises $20,000 in an Hour","I Sold The Magazine, Too","I Sold The Magazine, Too"


In [68]:
print("Given story: {}".format(given_stories[9]))
story_recs_dataframes[9]

Given story: Handful of Biologists Went Rogue and Published Directly to Internet


Unnamed: 0,hdp,lda_100,lda_50,lsi_300,lsi_500
0,U.S. Supreme Court Curbs Excessive Design Patent Damages,The Growing Impact of Old Scientific Papers,The Growing Impact of Old Scientific Papers,Library-managed 'arXiv' spreads scientific advances rapidly and worldwide,Library-managed 'arXiv' spreads scientific advances rapidly and worldwide
1,“Startup” asks internship applicant to build their app before phone screen,Let’s make peer review scientific,Let’s make peer review scientific,Distill: a modern machine learning journal,Distill: a modern machine learning journal
2,Programming Sucks,How to read and understand a scientific paper: a guide for non-scientists,The Arsenic DNA paper exposes flaws in peer review,Should All Research Papers Be Free?,Should All Research Papers Be Free?
3,How Apple Makes the Watch,The Arsenic DNA paper exposes flaws in peer review,How to read and understand a scientific paper: a guide for non-scientists,The Growing Impact of Old Scientific Papers,The Growing Impact of Old Scientific Papers
4,Website Impounded,Should All Research Papers Be Free?,Why I published in a predatory journal,The Arsenic DNA paper exposes flaws in peer review,The Arsenic DNA paper exposes flaws in peer review


In [75]:
save_pickle('story_ids_list', story_ids_list)

In [76]:
story_ids_list

array([16582136, 11116274, 13682022, ...,  4443296,  4436063,  4333774])

In [77]:
story_ids_list[-5:]

array([4562838, 4477119, 4443296, 4436063, 4333774])