In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
import  sklearn
from collections import Counter
stemmer = SnowballStemmer('english')
news = pd.read_csv('/home/linu/news_articles.csv')
news.head()
tokenizer = ToktokTokenizer()
stops = set(stopwords.words('english'))
from sklearn.decomposition import LatentDirichletAllocation

no_of_recommends = 5
n_topics = 8

news = news[['Article_Id','Title','Content']].dropna()
contents = news["Content"].tolist()
title = news['Title']
article_id = news['Article_Id']

In [2]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]',' ',document)
    tokens  = nltk.word_tokenize(document)
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])   #stemming the tokenized corpus
    return cleaned_article

cleaned_articles = list(map(clean_tokenize,contents))

In [3]:
article_vocab = { }

article_vocab = enumerate(cleaned_articles)

total_words = []

for i in range(0, len(cleaned_articles)):
    tokens = nltk.word_tokenize(cleaned_articles[i])

    for w in tokens:
        total_words.append(w)
counts = Counter(total_words)

vocab = {j:i for i,j in enumerate(counts.keys())}

stops_removed = [i for i in vocab.keys() if i not in stops]

final_vocab = {j:i for i,j in enumerate(stops_removed)}


tf_idf = TfidfVectorizer(vocabulary=final_vocab,min_df=1)

article_vocabulary = tf_idf.fit_transform(cleaned_articles)

lda = LatentDirichletAllocation(n_components=n_topics,max_iter=1,random_state=0)

Lda_articlemat = lda.fit_transform(article_vocabulary)

wordtokens_article = [word.split() for word in cleaned_articles] 

In [4]:
Lda_articlemat.shape            #topic modelling 4831 docs and 8 topics

(4831, 8)

In [197]:
existing_users = np.random.random_sample(size=(10000,8))  #we take any number of users

In [198]:
new_user = np.random.random_sample(size=(1,8))  #we take a user to recommend him according to collaborative filtering by using cosine similiarty

In [199]:
Similarity_Score = cosine_similarity(existing_users,new_user) #now we get our similar user score from our existing users and new user

In [200]:
Similarity_Score.shape

(10000, 1)

In [None]:
#now we pick top 5 similiar users out of existing users by reversing the Similarity_Score for top scores

In [240]:
top_similars_users = np.argsort(Similarity_Score,axis=0)[::-1][:5]  

In [237]:
top_similars   #here we get our top 5 similar users

array([[5680],
       [5126],
       [9204],
       [ 855],
       [4042]])

In [244]:
top_users= existing_users[top_similars]  #picking our top 5 similar user profiles out of existing users

In [247]:
top_users

array([[[0.8714688 , 0.09943635, 0.81975511, 0.90659792, 0.20167064,
         0.59874779, 0.13842781, 0.11568995]],

       [[0.53116645, 0.15099813, 0.38266618, 0.48156379, 0.0501611 ,
         0.22416667, 0.2408782 , 0.03423717]],

       [[0.72216707, 0.27146747, 0.88831756, 0.89256429, 0.04440116,
         0.38850883, 0.08461993, 0.30834674]],

       [[0.82336259, 0.03933566, 0.87710602, 0.90660293, 0.1628694 ,
         0.52365853, 0.12202507, 0.28610161]],

       [[0.55431594, 0.00759144, 0.52462022, 0.95735912, 0.16856865,
         0.17572198, 0.13916037, 0.23306737]]])

In [207]:
avg_user_profile = np.mean(top_users,axis=0)   # we take mean to get all the average or all the existing users

In [208]:
avg_user_profile.shape,Lda_articlemat.shape

((1, 8), (4831, 8))

In [209]:
sim_articles = cosine_similarity(avg_user_profile,Lda_articlemat) #now we find our similar articles according to our avg u.p.

In [249]:
sim_articles.reshape(4831,1)

array([[0.53361795],
       [0.12308587],
       [0.17435128],
       ...,
       [0.12501959],
       [0.15635202],
       [0.53543936]])

In [250]:
interested_articles = np.argsort(interested_articles)[::-1]  #we fetch the indexes of our top 5 similar articles according to user interest

In [252]:
interested_articles #so these are the indexes of our top preffered articles by cosine of new and existing user

array([[1102, 4211,  230, ...,  614, 4484, 1187]])

In [253]:
#news['Title'][interested_articles[0]]

In [263]:
for i in [interested_articles[0]] :
    print('Recommended-Articles :')
    print('\n')
    print(news['Title'][i][:20])

Recommended-Articles :


1102    Lenovo K4 Note open sale goes live on Amazon I...
4211    Modi Government May Save Rs 88 800 Crore This ...
230      Pathemari  release  Check out complete theatr...
3742    Narendra Modi Likely to Announce   70 000 Cror...
1776    Euro 2016  Wilmots worried about De Bruyne and...
331     US box office collection   Shankarabharanam  f...
3821    Nigerian army repels attack by suspected Boko ...
909     Update LG G2 with Android Marshmallow via Cyan...
439     Pre-Release Business  Will Akhil Akkineni s De...
2521    Al Qaeda Claims Responsibility for Charlie Heb...
3414    Nagaland  18 Arrested over Mob Lynching of Rap...
1069    Samsung Galaxy A5 Android Marshmallow update  ...
2414    Donald Trump last man standing after Kasich qu...
4260    Tension Continues in Assam Over Killing of Bus...
3627    Ranbir Kapoor and Katrina Kaif end their relat...
2749    NIA seeks help from Interpol in nabbing D-Comp...
2339    Photos from besieged Fallujah reveal pl

In [270]:
t = np.argsort(existing_users[0])

In [271]:
news['Title'][t]

5    Phillipines drug war  1 800 drug-related death...
2    US  South Korea begin joint military drill ami...
7    Dialogue crucial in finding permanent solution...
0    14 dead after bus falls into canal in Telangan...
3    Illegal construction in Bengaluru  Will my hou...
4    Punjab Gau Rakshak Dal chief held for assaulti...
6    Infosys shares likely to fall on Tuesday after...
1    Pratibha Tiwari molested on busy road   Saath ...
Name: Title, dtype: object