In [1]:
import numpy as np
import pandas as pd
!pip install spacy
from spacy.en import English
import re
!pip install redis
import redis
from spacy.en import STOP_WORDS



In [2]:
news_data = pd.read_csv('uci-news-aggregator.csv')

In [3]:
news_data.shape

(422419, 8)

In [4]:
news_data.head(3)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550


In [5]:
nlp = English()

This is just a function to lightly clean the text from our data

In [14]:
def cleaner(text):
    text = re.sub('<.{0,3}>', '', text)
    text = re.sub('[\W]',' ',text)
    text = re.sub('\d','',text)
    text = re.sub('\s+',' ', text)
    text = ' '.join([i.lemma_ for i in nlp(text) if i.orth_ not in STOP_WORDS])
    return text

Let's run that function on the title column

In [7]:
news_data['clean_title'] = news_data['TITLE'].apply(cleaner)

These next two cells will each take a little while to run. 

In [13]:
title_vecs = news_data['clean_title'].apply(lambda x: nlp(x).vector).values

In [15]:
title_vecs = np.array([nlp(i).vector for i in news_data['clean_title']])

Now that our data is clean-ish and we have vectorized the titles, we can try running some models

In [16]:
from sklearn.neighbors import NearestNeighbors

In [17]:
nn = NearestNeighbors(n_neighbors=5)

Fit the model to our data

In [18]:
nn.fit(title_vecs)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

Here's one way to get the distance and indices of title ids

In [20]:
distance, indices = nn.kneighbors(nlp('sports').vector.reshape(1, -1))

In [21]:
distance

array([[ 2.24176616,  2.24176616,  4.33849967,  4.59863702,  4.6591042 ]])

In [22]:
indices

array([[119580, 212774,  29006,  31646, 109849]])

Print out of these indices.

In [31]:
news_data.ix[indices[0]]

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP,clean_title
119580,119916,Sports Other Sports,http://www.communitypress.ca/2014/04/14/ultima...,Community Press,e,dLnYW2W4bSaEflMQZ1viwZnDokmeM,www.communitypress.ca,1397594837262,sports other sports
212774,213220,Sports Other Sports,http://www.owensoundsuntimes.com/2014/05/16/ww...,Owen Sound Sun Times,b,dQ4Thx_EFXNgPHMYd8x6fhgp40PWM,www.owensoundsuntimes.com,1400381540095,sports other sports
29006,29007,Sports FYI,http://www.tulsaworld.com/sportsextra/sports-f...,Tulsa World,b,dR39a35kcuso-9M3bXDszPvdbxY1M,www.tulsaworld.com,1395503794366,sport fyi
31646,31647,Sports Media First Tweets,http://awfulannouncing.com/2014/first-tweets-b...,Awful Announcing,t,dGTARuAEeau1j-MEAoB-nfS4uaERM,awfulannouncing.com,1395513581918,sports media first tweets
109849,110046,Wide World of Sports TV partners,http://wwos.ninemsn.com.au/article.aspx\?id=88...,Ninemsn,t,diZ1QfAptk5FAnMMQEdCq5RInWgrM,wwos.ninemsn.com.au,1397455154189,wide world sports tv partner


Here's another way, using a function: 
return the most similar article to our topic. Also, I am only returning the columns from the dataframe that I think are relevant for our purposes here

In [32]:
def most_similar(search):
    _ , indices = nn.kneighbors(nlp(cleaner(search)).vector.reshape(1, -1))
    return news_data.ix[indices[0]][['ID','TITLE','clean_title','CATEGORY']]

Then, calculate the cosine similarity betwen two vectors. The lower the numer is to 0, the closer the two vectors are to one another, while the closer to 1.0 that the cosine similarity is, the more likely it is that the two vectors are orthognal to one another.

In [25]:
from scipy.spatial.distance import cosine

In [26]:
most_similar('fashion')

Unnamed: 0,ID,TITLE,clean_title,CATEGORY
270155,270601,Fashion tragedies abound at the 'Fashion Oscars',fashion tragedy abound fashion oscars,e
110551,110748,Celebrity festival fashion Coachella 2014 style,celebrity festival fashion coachella style,e
62226,62227,Fashion Watch on Vogue,fashion watch vogue,e
269921,270367,Best Dressed at the 2014 CFDA Fashion Awards,best dress cfda fashion awards,e
23030,23031,Futuristic fashion: Dress like a Divergent,futuristic fashion dress like divergent,e


In [34]:
cosine(title_vecs[270155], title_vecs[110551])

0.18658516804635905

In [27]:
most_similar('food')

Unnamed: 0,ID,TITLE,clean_title,CATEGORY
100224,100421,Diet Food 1,diet food,m
382537,382997,Organic food is still not more nutritious than...,organic food nutritious conventional food,m
104281,104478,Food Poisoning Risk Higher in Restaurant Food,food poisoning risk higher restaurant food,m
100219,100416,Backlash against diet food,backlash diet food,m
382523,382983,Organic foods may be healthier,organic food healthy,m


In [42]:
cosine(title_vecs[10024], title_vecs[382537])

0.56166861587272865

Now this is my attempt to make this 'deployable,' using Redis. Insert the IP address of your EC2 instance below, set to the variable name "redis_ip." I'm terminating the EC2 instance we used on Thursday (2017-08-03) because I think there's something going on with the memory.

In [44]:
redis_ip = '52.27.124.192'

r = redis.StrictRedis(redis_ip)

Pickle our model and store it in redis

In [45]:
import pickle

In [46]:
model = pickle.dumps(nn)

In [None]:
r.set('model', model)

Pickle our dataframe also, so we can query it

In [48]:
data = pickle.dumps(news_data)

In [None]:
r.set('dataframe', data)

Use Redis to Load our pipeline:

In [50]:
pipe = r.pipeline()

In [49]:
for key, value in news_data[['TITLE']][:4].to_dict()['TITLE'].items():
    print(key, '\t', value)

0 	 Fed official says weak data caused by weather, should not slow taper
1 	 Fed's Charles Plosser sees high bar for change in pace of tapering
2 	 US open: Stocks fall after Fed official hints at accelerated tapering
3 	 Fed risks falling 'behind the curve', Charles Plosser says


In [51]:
from argparse import ArgumentParser

parser = ArgumentParser()

parser.add_argument('-q', '--query', type=str, help='Text to query')

_StoreAction(option_strings=['-q', '--query'], dest='query', nargs=None, const=None, default=None, type=<class 'str'>, choices=None, help='Text to query', metavar=None)

So now putting all the pieces together in one function:

In [None]:
def make_prediction(title):
    #     Vectorize the input
    vec = nlp(title).vector

    #     Load the model
    model = pickle.loads(r.get('model'))

    #     Find nearest neighbors
    distances , indices = model.kneighbors(vec.reshape(1,-1))

    #     get the indices
    indices = indices[0]

    #     create a pipeline
    pipe = r.pipeline()

    #     query for indices (lookup)
    for index in indices:
        pipe.get(bytes(str(index), 'utf-8'))
    return list(zip(pipe.execute(), distances[0]))

In [None]:
make_prediction('sports')