In [2]:
import pandas as pd
import numpy as np
import time
import redis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class ContentRecommender(object):
    """Content-based item recommender backed by Redis sorted sets.

    ``train`` reads a CSV with ``id`` and ``content`` columns, builds TF-IDF
    vectors from ``content``, computes all pairwise similarities and stores,
    for every item, its most similar items in a Redis sorted set named
    ``SIMKEY % item_id`` (member = similar item's id, score = similarity).
    ``predict`` reads the best-scoring members of such a set back.
    """

    # Redis key template for the sorted set holding an item's neighbours.
    SIMKEY = 'p:smlr:%s'

    # Maximum number of neighbours stored per item.
    NUM_SIMILAR = 98

    def __init__(self):
        self._r = redis.StrictRedis.from_url("redis://localhost:6379")

    def predict(self, item_id, num):
        """Return up to ``num`` ids of the items most similar to ``item_id``.

        Args:
            item_id: id of the item whose neighbours are wanted.
            num: maximum number of neighbours to return.

        Returns:
            A list of member ids, best match first, exactly as the Redis
            client returns them (``bytes`` in the notebook run shown below).
            Empty when ``num`` is not positive or the item is unknown.
        """
        # ZRANGE's end index is inclusive and negative values count from the
        # tail, so num == 0 would turn into "0..-1" and return the whole set.
        if num <= 0:
            return []
        return self._r.zrange(self.SIMKEY % item_id, 0, num - 1, withscores=False, desc=True)

    def train(self, csv_path):
        """Load the CSV at ``csv_path`` and (re)build the similarity sets.

        Prints the wall-clock time taken by each of the two phases.
        """
        start = time.time()
        data_frame = self._ingest_training_data(csv_path)
        print("Training data ingested in %s seconds." % (time.time() - start))

        start = time.time()
        self._train_engine(data_frame)
        print("Engine trained in %s seconds." % (time.time() - start))

    def _ingest_training_data(self, csv_path):
        """Read the training CSV and return it as a DataFrame."""
        return pd.read_csv(csv_path)

    def _train_engine(self, data_frame):
        """Vectorise ``data_frame['content']`` and store each item's neighbours.

        Args:
            data_frame: frame with a ``content`` column (text) and an ``id``
                column (the member ids written to Redis).
        """
        # min_df=1 keeps every term, exactly like the former min_df=0, but the
        # integer 0 is rejected by parameter validation in recent scikit-learn.
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
        tfidf_matrix = tf.fit_transform(data_frame['content'])

        # TfidfVectorizer L2-normalises rows by default, so the plain dot
        # product computed by linear_kernel equals the cosine similarity.
        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

        # tolist() yields native Python values; numpy scalars are not accepted
        # as command arguments by redis-py 3+.
        self._store_similarities(data_frame['id'].tolist(), cosine_similarities)

    def _store_similarities(self, item_ids, similarities):
        """Write the top neighbours of every item to Redis.

        Args:
            item_ids: sequence of ids; ``item_ids[k]`` labels row/column ``k``
                of ``similarities``.
            similarities: square 2-D array of pairwise similarity scores.
        """
        for pos, item_id in enumerate(item_ids):
            scores = similarities[pos]
            # One extra candidate so that NUM_SIMILAR remain once the item
            # itself has been filtered out.
            candidates = scores.argsort()[::-1][:self.NUM_SIMILAR + 1]
            # Exclude the item by position rather than assuming it sorts
            # first: ties or an all-zero row can put another item ahead of it.
            neighbours = [i for i in candidates if i != pos][:self.NUM_SIMILAR]
            if not neighbours:
                # ZADD with no score/member pairs is a Redis error.
                continue

            flattened = []
            for i in neighbours:
                flattened.append(float(scores[i]))
                flattened.append(item_ids[i])
            # Send the raw command: the zadd() helper changed its signature
            # between redis-py 2.x and 3.x, while "ZADD key score member ..."
            # is accepted by both.
            self._r.execute_command('ZADD', self.SIMKEY % item_id, *flattened)


In [5]:
# Create the recommender and train it on the squawks CSV.
rec = ContentRecommender()
rec.train(csv_path="./squawks.csv")

Training data ingested in 0.008157968521118164 seconds.
Engine trained in 7.538815259933472 seconds.


In [10]:
# pandas is already imported in the first cell; repeating the import is harmless.
import pandas as pd
# Load the same CSV that was used for training so the recommended ids can be
# looked up in the cells below.
squawks = pd.read_csv("./squawks.csv")

In [12]:
# Ask for the 2 items most similar to the item with id 3.
rec.predict(item_id=3, num=2)

[b'1392', b'1654']

In [20]:
# Show the row whose id is 1392.
squawks.loc[squawks["id"] == 1392]

Unnamed: 0,id,content,user_id,created_at,updated_at,likes_count,flags_count
1374,1392,FACEBOOK IS BUYING WHATSAPP FOR $16 BILLION —...,39,2014-02-19 22:32:34,2014-02-19 22:32:34,0,0


In [22]:
# Show the row whose id is 1654.
squawks.loc[squawks["id"] == 1654]

Unnamed: 0,id,content,user_id,created_at,updated_at,likes_count,flags_count
1637,1654,FACEBOOK IS BUYING MESSAGING APP WHATSAPP FOR ...,44,2014-02-20 00:01:30,2014-02-20 00:01:30,0,0


In [23]:
# Show the row whose id is 3.
squawks.loc[squawks["id"] == 3]

Unnamed: 0,id,content,user_id,created_at,updated_at,likes_count,flags_count
2,3,APPARENTLY FACEBOOK ARE BUYING WHATSAPP FOR $1...,1,2014-02-19 22:40:58,2014-02-19 22:40:58,0,0
