In [1]:
from joblib import Parallel, delayed, cpu_count

import psycopg2

import pandas as pd
import pandas.io.sql as sqlio

import funcy as fp

from multiprocessing import Pool
import numpy as np

import pattern3

import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, LdaMulticore, TfidfModel
from gensim.models.wrappers import LdaMallet
from gensim.similarities import Similarity, MatrixSimilarity, SparseMatrixSimilarity

# import pyLDAvis.gensim
import matplotlib.pyplot as plt

from gensim.test.utils import get_tmpfile
from gensim.matutils import hellinger
from gensim.utils import to_utf8, tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import strip_tags, preprocess_string, remove_stopwords, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_numeric, strip_non_alphanum

import pickle

In [2]:
def save_pickle(file_name, object_to_pickle):
    """Pickle ``object_to_pickle`` and write it to the file at ``file_name``.

    The target is opened in binary write mode, so any existing file at
    that path is overwritten. Returns ``None``.
    """
    with open(file_name, 'wb') as sink:
        # Pickler(...).dump is the object form of pickle.dump(obj, file).
        pickle.Pickler(sink).dump(object_to_pickle)
        
def load_pickle(file_name):
    """Open ``file_name`` in binary mode and return the object pickled in it.

    NOTE: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(file_name, 'rb') as source:
        return pickle.Unpickler(source).load()

In [3]:
%%time

# Select id + commenters for the 25,000 rows of bq_all that rank first by
# score (ties broken by most recent story_time), keeping only rows whose
# all_text and article_content are both non-NULL and non-empty.
# The adjacent literals concatenate to a single-line SQL statement.
COMMENTERS_QUERY = (
    "SELECT id, commenters FROM bq_all "
    "WHERE all_text IS NOT NULL AND all_text != '' "
    "AND article_content IS NOT NULL AND article_content != '' "
    "ORDER BY score DESC, story_time DESC LIMIT 25000"
)

conn = psycopg2.connect("host=localhost dbname=postgres user=postgres")
try:
    commenters_df = sqlio.read_sql_query(COMMENTERS_QUERY, conn, index_col='id')
finally:
    # The connection is only needed for this one query; close it explicitly
    # so the server-side session is not leaked for the life of the kernel.
    conn.close()

CPU times: user 41.5 ms, sys: 40 ms, total: 81.5 ms
Wall time: 1.18 s


In [4]:
# Transform string of commenters into list:
def listify_commenters(commenters_str):
    """Split a ``", "``-separated string of commenter names into a list.

    Parameters
    ----------
    commenters_str : str or None
        Commenter names joined by ``", "``. ``None`` (e.g. a SQL NULL) is
        accepted and treated as "no commenters".

    Returns
    -------
    list of str
        The individual names. ``None`` yields ``[]``. An empty string
        yields ``[""]`` (plain ``str.split`` behaviour); that is kept
        as-is because a later cell strips ``""`` from the list of unique
        commenters.
    """
    if commenters_str is None:
        # Without this guard a NULL commenters value raises AttributeError.
        return []
    return commenters_str.split(", ")

In [5]:
%%time

# Replace each story's ", "-joined commenters string with a list of names.
# NOTE: this overwrites the column in place, so the cell is meant to run
# once per freshly loaded commenters_df.
raw_commenters = commenters_df["commenters"]
commenters_df["commenters"] = raw_commenters.map(listify_commenters)

CPU times: user 166 ms, sys: 33.5 ms, total: 200 ms
Wall time: 199 ms


In [6]:
# Collect every distinct commenter name that appears in any story.
# The set removes duplicates, so the order of the resulting list is arbitrary.
all_commenters = list({
    name
    for story_commenters in commenters_df["commenters"].values
    for name in story_commenters
})

In [7]:
# Drop the empty-string placeholder produced by stories whose commenters
# string was empty ("".split(", ") == [""]).
# A filter is used instead of list.remove("") so the cell neither raises
# ValueError when "" is absent nor fails when it is re-run.
all_commenters = [name for name in all_commenters if name != ""]

In [8]:
# Convert list of commenters for each story to "bag of commenters" (boc):
dct = Dictionary(commenters_df["commenters"].values)

# Stories with an empty commenters string contribute the token "" (see
# listify_commenters). It is not a real commenter, so remove it from the
# vocabulary; otherwise stories with no commenters would share a feature.
if "" in dct.token2id:
    dct.filter_tokens(bad_ids=[dct.token2id[""]])

# Prune very rare / very common commenters using gensim's default thresholds.
dct.filter_extremes()


def get_boc(tokens):
    """Return the bag-of-commenters vector for one story.

    Parameters
    ----------
    tokens : list of str
        Commenter names for a single story.

    Returns
    -------
    list of (int, int)
        ``dct.doc2bow(tokens)``: (commenter id, count) pairs for the names
        present in the module-level dictionary ``dct``.
    """
    return dct.doc2bow(tokens)


commenters_df["boc"] = commenters_df["commenters"].apply(get_boc)

In [9]:
# Pull the per-story bag-of-commenters vectors out of the frame as an array
# (one element per story, in commenters_df row order).
commenters_ary = commenters_df.loc[:, "boc"].values

In [10]:
# Fit an LSI model on the "bag of commenters" corpus to reduce its
# dimensionality to a fixed number of latent topics.
lsi_topic_count = 300
commenters_dimrec_model = LsiModel(
    corpus=commenters_ary,
    id2word=dct,
    num_topics=lsi_topic_count,
)

In [11]:
# Apply the fitted LSI model to the whole bag-of-commenters corpus; the
# result is the dimensionality-reduced representation of every story.
commenters_dimrec_ary = commenters_dimrec_model[commenters_ary]

In [12]:
# Persist the LSI-transformed corpus to disk in Matrix Market format.
gensim.corpora.MmCorpus.serialize(
    fname='commenters_corpus.mm',
    corpus=commenters_dimrec_ary,
)

In [14]:
# Rebind the name to the on-disk Matrix Market corpus written above, so the
# following cells read the serialized copy rather than the in-memory one.
commenters_dimrec_ary = gensim.corpora.MmCorpus(fname='commenters_corpus.mm')

In [17]:
# Build a similarity index over the LSI-transformed stories and save it.
# NOTE(review): the index shards are written under a temp-file prefix while
# the index object is saved to the working directory; confirm the shards
# outlive the temp directory if the saved index is reloaded later.
index_temp = get_tmpfile("index")
commenters_indexer = Similarity(
    output_prefix=index_temp,
    corpus=commenters_dimrec_ary,
    # The indexed vectors live in LSI topic space, so their dimensionality is
    # the number of LSI topics, not the size of the commenter vocabulary.
    num_features=commenters_dimrec_model.num_topics,
    # Presumably 6 = the query story itself plus 5 recommendations (the
    # commented-out cells below treat the first hit as the given story).
    num_best=6,
)
commenters_indexer.save('commenters_indexer.model')

In [None]:
# def get_series_index(story_id):
#     story_ids_map_dict = {story_id: series_index for series_index, story_id in enumerate(story_ids_list)}
#     return story_ids_map_dict[story_id]

# def get_story_id(series_index):
#     return story_ids_list[series_index]

In [None]:
# def get_sim_ids(story_id, corpus, indexer):
#     series_index = get_series_index(story_id)
#     vec = corpus[series_index]
#     sims = indexer[vec]
#     sim_indices = [sim_index for sim_index, sim_score in sims]
#     return sim_indices

# def fetch_story_titles_from_pgsql(story_id):
#     conn = psycopg2.connect("host=localhost dbname=postgres user=postgres")
#     cur = conn.cursor()
#     cur.execute("SELECT title FROM bq_all WHERE id={};".format(story_id))
#     title = cur.fetchone()[0]
#     conn.commit
#     return title

In [None]:
# story_ids_list = load_pickle('story_ids_list')

# random_story_ids = np.random.choice(story_ids_list, size=10, replace=False)

# recs_dict = {}

# for story_id in random_story_ids:
#     sim_indices = get_sim_ids(story_id, commenters_dimrec_ary, commenters_indexer)
#     rec_titles_list = [fetch_story_titles_from_pgsql(get_story_id(each_id)) for each_id in sim_indices]
#     given_story_str = rec_titles_list[0]
#     rec_stories_list = rec_titles_list[1:]
#     recs_dict[given_story_str] = rec_stories_list

In [None]:
# pd.DataFrame(recs_dict)