In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
import  sklearn
from collections import Counter
stemmer = SnowballStemmer('english')
news = pd.read_csv('/home/linu/news_articles.csv')
news.head()
tokenizer = ToktokTokenizer()
stops = set(stopwords.words('english'))
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize

#File_path = '/home/linu/GoogleNews-vectors-negative300.bin'

no_of_recommends = 20
n_topics = 8

In [2]:
news = news[['Article_Id','Title','Content']].dropna()
contents = news["Content"].tolist()
title = news['Title']
article_id = news['Article_Id']

In [3]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]',' ',document)
    tokens  = nltk.word_tokenize(document)
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])   #stemming the tokenized corpus
    return cleaned_article

In [4]:
cleaned_articles = list(map(clean_tokenize,contents))

In [5]:
def Topic_Modeller(LDA_matrix):
    
    total_WordVocab = []
    for i in range(0,len(cleaned_articles)) :
        word_tokens = nltk.word_tokenize(cleaned_articles[i])
        for words in word_tokens :
            total_WordVocab.append(words)
        counts = Counter(total_WordVocab)

    vocab = {j:i for i,j in enumerate(counts.keys())}

    stops_removed = [word for word in vocab.keys() if word not in stops]

    Final_VocabDict = {j:i for i,j in enumerate(stops_removed)}
    
    Tfidf = TfidfVectorizer(min_df=1,vocabulary=Final_VocabDict)


    Tfidf_Matrix = Tfidf.fit_transform(cleaned_articles)

    Lda = LatentDirichletAllocation(n_components=10,max_iter=1,random_state=0)
    
    Lda_articlemat = Lda.fit_transform(Tfidf_Matrix)

    return Lda_articlemat  

In [6]:
Lda_articlemat = Topic_Modeller(cleaned_articles)

In [7]:
Lda_articlemat.shape

(4831, 10)

In [8]:
wordtokens_article = [word.split() for word in cleaned_articles]

In [9]:
# a function to create user profiles on the base of articles read and time spent  for implementing content approach

def user_profiler(wordtokens,article_read,article_time):
    user_profile = []
    wordPer_second = 5
    

    for i in range(len(wordtokens)):                                        

        average_time = (len(wordtokens[i])/wordPer_second) #length of wordtokslist by wps gives us avg time to read the article
         
        user_interest_timevalue = article_time[i]/average_time  #article_times divide by avg times of each article                   
        
        user_profile_generate = (article_read[i]*user_interest_timevalue)   #Ldamatrix[] * user_interest_time calculated                 
        
        user_profile.append(user_profile_generate)                                      

    return sum(normalize(user_profile))                             


userProfile_One = user_profiler([wordtokens_article[600],wordtokens_article[99],wordtokens_article[120]],
                             [Lda_articlemat[600],Lda_articlemat[99],Lda_articlemat[120]],
                             [120,60,30])

In [10]:
userprofile_one =  np.array(userProfile_One)

In [11]:
userprofile_one.shape

(10,)

In [12]:
userprofile_one

array([0.06380809, 1.03563938, 0.06380688, 1.05161394, 0.06380264,
       0.0638015 , 1.03360147, 0.06380235, 0.06380346, 0.06380215])

In [13]:
def Content_Recommends_Calculator(user_profile,Lda_articlemat) :
    
    user_interested_articles = []
    
    contents_interest_score = []

    user_preffered_articles = cosine_similarity(userprofile_one.reshape(1,-1),Lda_articlemat)
    
    top_articles = np.sort(user_preffered_articles).flatten()[::-1][:10]
    
    user_interested_articles.append(top_articles)
    
    content_interest_score = (user_interested_articles[0] * 0.4)
    
    return content_interest_score


content_recommended = Content_Recommends_Calculator(userprofile_one,Lda_articlemat)

In [14]:
content_recommended

array([0.33218085, 0.33066672, 0.3303857 , 0.33036246, 0.32982406,
       0.32849261, 0.3283279 , 0.3282375 , 0.32813684, 0.32796289])

In [15]:
#For CF apporach
existing_users = np.random.random_sample(size=(1000,10))   #we take n existing users   
new_user = np.random.random_sample(size=(1,10))            #we take a single new user

In [16]:
def Collaborative_Recommends_Calculator(existing_usr,new_usr) :
    
    collaborative_interest_score = [ ]
    sorted_collaborative_interest = [ ]
    
    collaborative_interest_score = cosine_similarity(existing_users,new_user)
    
    sorted_collaborative_interest = np.argsort(collaborative_interest_score,axis=0)[::-1][:10]
    
    sorted_collaborative_indexes = existing_users[sorted_collaborative_interest]
    
    collab_interest = np.mean(sorted_collaborative_indexes.reshape(-1,10),axis=0)
    
    collaborative_interest_scores = collab_interest*0.6
    
    return collaborative_interest_scores

In [17]:
collab_recommended = Collaborative_Recommends_Calculator(existing_users,new_user)

In [18]:
collab_recommended

array([0.42538367, 0.12472748, 0.27917413, 0.52935857, 0.36303798,
       0.48767273, 0.50683032, 0.344337  , 0.17345567, 0.23187774])

In [26]:
def Hybrid_Calculator():
    
    hybrid_interests = np.add(content_recommended,collab_recommended)
    
    similar_scores = cosine_similarity(hybrid_interests.reshape(1,10),Lda_articlemat)
    
    recommended_article_address =  np.argsort(similar_scores)[::-1]
    
    return recommended_article_address    

In [27]:
hybrid_recommend_indexes =  Hybrid_Calculator() #we get hyrid interest with our variations of 0.4 content based and 0.6 collab based

In [28]:
for articles in hybrid_recommend_indexes :
    
    print('Recommended-Articles :')
    
    print('\n')
    
    print(title[articles][:no_of_recommends])

Recommended-Articles :


1813    2016 Premier Badminton League  Complete team s...
4495    Asian Shares Recover from Three-year Lows Whil...
383      Oru Vadakkan Selfie  Movie Review  A Hilariou...
1331    Apple CEO s likely agenda for multi-day India ...
3606    Shah Rukh Khan Beats Salman to Win  Sexiest Kh...
1937     OK Kanmani  Trailer  Train  Rain  Beach  Roma...
4115    Modi s favourability rating rises to 87   jobs...
973                          A case for twin-lens cameras
241     Rajamma at Yahoo review  Predictable  simple  ...
2640     Captain America  Civil War  spoilers  Spider-...
954     Update OnePlus One to Android Marshmallow via ...
2154     Ormayundo Ee Mugham  Critics Review Roundup  ...
3351    Opposition Unites to Demand Sushma Swaraj s Re...
2518    France police raid homes  vow it s  just the b...
4407    Start-up India  Japan s SoftBank to invest  10...
1599    India at Rio Olympics  Shiva Thapa  Manoj Kuma...
2130    Mohanlal Offers to Return National Game