In [292]:
import numpy as np
import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
import  sklearn
from collections import Counter
stemmer = SnowballStemmer('english')
news = pd.read_csv('/home/linu/news_articles.csv')
news.head()
tokenizer = ToktokTokenizer()
stops = set(stopwords.words('english'))
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize

#File_path = '/home/linu/GoogleNews-vectors-negative300.bin'

no_of_recommends = 20
n_topics = 10

In [2]:
news = news[['Article_Id','Title','Content']].dropna()
contents = news["Content"].tolist()
title = news['Title']
article_id = news['Article_Id']

In [3]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]',' ',document)
    tokens  = nltk.word_tokenize(document)
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])   #stemming the tokenized corpus
    return cleaned_article

In [4]:
cleaned_articles = list(map(clean_tokenize,contents))

In [293]:
def Topic_Modeller(LDA_matrix):
    
    total_WordVocab = []
    for i in range(0,len(cleaned_articles)) :
        word_tokens = nltk.word_tokenize(cleaned_articles[i])
        for words in word_tokens :
            total_WordVocab.append(words)
        counts = Counter(total_WordVocab)

    vocab = {j:i for i,j in enumerate(counts.keys())}

    stops_removed = [word for word in vocab.keys() if word not in stops]

    Final_VocabDict = {j:i for i,j in enumerate(stops_removed)}
    
    Tfidf = TfidfVectorizer(min_df=1,vocabulary=Final_VocabDict)


    Tfidf_Matrix = Tfidf.fit_transform(cleaned_articles)

    Lda = LatentDirichletAllocation(n_components=n_topics,max_iter=1,random_state=0)
    
    Lda_articlemat = Lda.fit_transform(Tfidf_Matrix)

    return Lda_articlemat  

In [294]:
Lda_articlemat = Topic_Modeller(cleaned_articles)

In [295]:
Lda_articlemat.shape

(4831, 10)

In [8]:
wordtokens_article = [word.split() for word in cleaned_articles]

In [9]:
# a function to create user profiles on the base of articles read and time spent  for implementing content approach

def user_profiler(wordtokens,article_read,article_time):
    user_profile = []
    wordPer_second = 5
    

    for i in range(len(wordtokens)):                                        

        average_time = (len(wordtokens[i])/wordPer_second) #length of wordtokslist by wps gives us avg time to read the article
         
        user_interest_timevalue = article_time[i]/average_time  #article_times divide by avg times of each article                   
        
        Topic_weights = (article_read[i]*user_interest_timevalue)   #Ldamatrix[] * user_interest_time calculated                 
        
        user_profile.append(Topic_weights)                                      

    return sum(normalize(user_profile))                             


userProfile_One = user_profiler([wordtokens_article[600],wordtokens_article[99],wordtokens_article[120]],
                             [Lda_articlemat[600],Lda_articlemat[99],Lda_articlemat[120]],
                             [120,60,30])

In [10]:
userprofile_one =  np.array(userProfile_One)

In [11]:
userprofile_one.shape

(10,)

In [12]:
userprofile_one

array([0.06380809, 1.03563938, 0.06380688, 1.05161394, 0.06380264,
       0.0638015 , 1.03360147, 0.06380235, 0.06380346, 0.06380215])

In [13]:
def Content_Recommends_Calculator(user_profile,Lda_articlemat) :
    
    user_interested_articles = []
    
    contents_interest_score = []

    user_preffered_articles = cosine_similarity(userprofile_one.reshape(1,-1),Lda_articlemat)
    
    top_articles = np.sort(user_preffered_articles).flatten()[::-1][:10]
    
    user_interested_articles.append(top_articles)
    
    content_interest_score = (user_interested_articles[0] * 0.4)
    
    return content_interest_score


content_recommended = Content_Recommends_Calculator(userprofile_one,Lda_articlemat)

In [14]:
content_recommended

array([0.33218085, 0.33066672, 0.3303857 , 0.33036246, 0.32982406,
       0.32849261, 0.3283279 , 0.3282375 , 0.32813684, 0.32796289])

In [296]:
#For CF apporach
existing_users = np.random.random_sample(size=(1000,10))   #we take n existing users   
new_user = np.random.random_sample(size=(1,10))            #we take a single new user

In [16]:
def Collaborative_Recommends_Calculator(existing_usr,new_usr) :
    
    collaborative_interest_score = [ ]
    sorted_collaborative_interest = [ ]
    
    collaborative_interest_score = cosine_similarity(existing_users,new_user)
    
    sorted_collaborative_interest = np.argsort(collaborative_interest_score,axis=0)[::-1][:10]
    
    sorted_collaborative_indexes = existing_users[sorted_collaborative_interest]
    
    collab_interest = np.mean(sorted_collaborative_indexes.reshape(-1,10),axis=0)
    
    collaborative_interest_scores = collab_interest*0.3
    
    return collaborative_interest_scores

In [227]:
collaborative_recommended = Collaborative_Recommends_Calculator(existing_users,new_user)

In [228]:
collaborative_recommended

array([0.1401302 , 0.51475294, 0.15017204, 0.43437819, 0.44470077,
       0.24788337, 0.31189648, 0.27263806, 0.1999394 , 0.29468474])

In [287]:
def Trends(trending) :

    trends = np.mean(existing_users,axis=0)
    Trending_news = cosine_similarity(trends.reshape(1,10),Lda_articlemat)
    top= np.sort(Trending_news)[::-1][0][:10]
    return top

In [288]:
Trending_Articles = Trends(existing_users)*0.3

In [289]:
Trending_Articles 

array([0.09690846, 0.09766251, 0.09811722, 0.09881548, 0.09903137,
       0.09911932, 0.09926018, 0.09927689, 0.09928779, 0.09929183])

In [297]:
#30%content based,40% collab,30% from trends

def Hybrid_Calculator():
    
    hybrid_interests = np.add(content_recommended,collaborative_recommended,Trending_Articles) 
    
    similar_scores = cosine_similarity(hybrid_interests.reshape(1,10),Lda_articlemat)
    
    recommended_article_address =  np.argsort(similar_scores)[::-1]
    
    return recommended_article_address    

In [291]:
hybrid_recommend_indexes =  Hybrid_Calculator() #we get hyrid interest with our variations of 0.4 content based and 0.6 collab based

In [285]:
for articles in hybrid_recommend_indexes :
    
    print('Recommended-Articles :')
    
    print('\n')
    
    print(title[articles][:no_of_recommends])

Recommended-Articles :


1552    List of flag bearers for nations in the Rio Ol...
2149    Celebrity Rumours that Rocked the Internet in ...
2356    Celebrities Weddings that Cost More than your ...
737      Papanasam   Papanaasam  Movie Review Round-up...
1742                   Asian Games 2014  Day 9 Highlights
3925    Budget 2016  MPs want income tax exemption lim...
4661    Uber Rape Shame  Short Cuts Being Preferred to...
3978    Govt hopeful of implementing GST bill next yea...
2100    Most-Searched Celebrities 2014  Sunny Leone Be...
4538    Modi US Visit  Itinerary of Indian PM s trip t...
2370    Islamic State   Iranian Hulk  will fight Isis ...
2446    War on Terror Updates  Russia  kill 320 terror...
2780    Parliament winter session  PM Modi to address ...
4615    Tata Motors  Adani Ports  Axis Bank  L T lead ...
2229    Is there any steam left in IndiGo owner shares...
4545    Ex-Service Chiefs Urge PM to Implement OROP  2...
2122    Run Kerala Run  Sachin Tendulkar  Mohan