In [153]:
import numpy as np
import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
import  sklearn
from collections import Counter
stemmer = SnowballStemmer('english')
news = pd.read_csv('/home/linu/news_articles.csv')
news.head()
tokenizer = ToktokTokenizer()
stops = set(stopwords.words('english'))
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize

#File_path = '/home/linu/GoogleNews-vectors-negative300.bin'

no_of_recommends = 20
n_topics = 8

In [77]:
news = news[['Article_Id','Title','Content']].dropna()
contents = news["Content"].tolist()
title = news['Title']
article_id = news['Article_Id']

In [60]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]',' ',document)
    tokens  = nltk.word_tokenize(document)
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])   #stemming the tokenized corpus
    return cleaned_article

In [61]:
cleaned_articles = list(map(clean_tokenize,contents))

In [159]:
def Topic_Modeller(LDA_matrix):
    
    total_WordVocab = []
    for i in range(0,len(cleaned_articles)) :
        word_tokens = nltk.word_tokenize(cleaned_articles[i])
        for words in word_tokens :
            total_WordVocab.append(words)
        counts = Counter(total_WordVocab)

    vocab = {j:i for i,j in enumerate(counts.keys())}

    stops_removed = [word for word in vocab.keys() if word not in stops]

    Final_VocabDict = {j:i for i,j in enumerate(stops_removed)}
    
    Tfidf = TfidfVectorizer(min_df=1,vocabulary=Final_VocabDict)


    Tfidf_Matrix = Tfidf.fit_transform(cleaned_articles)

    Lda = LatentDirichletAllocation(n_components=10,max_iter=1,random_state=0)
    
    Lda_articlemat = Lda.fit_transform(Tfidf_Matrix)

    return Lda_articlemat  

In [161]:
Lda_articlemat = Topic_Modeller(cleaned_articles)

In [124]:
Lda_articlemat.shape

(4831, 10)

In [125]:
wordtokens_article = [word.split() for word in cleaned_articles]

In [126]:
# a function to create user profiles on the base of articles read and time spent  for implementing content approach

def user_profiler(wordtokens,article_read,article_time):
    user_profile = []
    wordPer_second = 5
    

    for i in range(len(wordtokens)):                                        

        average_time = (len(wordtokens[i])/wordPer_second) #length of wordtokslist by wps gives us avg time to read the article
         
        user_interest_timevalue = article_time[i]/average_time  #article_times divide by avg times of each article                   
        
        user_profile_generate = (article_read[i]*user_interest_timevalue)   #Ldamatrix[] * user_interest_time calculated                 
        
        user_profile.append(user_profile_generate)                                      

    return sum(normalize(user_profile))                             


userProfile_One = user_profiler([wordtokens_article[600],wordtokens_article[99],wordtokens_article[120]],
                             [Lda_articlemat[600],Lda_articlemat[99],Lda_articlemat[120]],
                             [120,60,30])

In [127]:
userprofile_one =  np.array(userProfile_One)

In [128]:
userprofile_one.shape

(10,)

In [129]:
userprofile_one

array([0.06380809, 1.03563938, 0.06380688, 1.05161394, 0.06380264,
       0.0638015 , 1.03360147, 0.06380235, 0.06380346, 0.06380215])

In [130]:
def Content_Recommends_Calculator(user_profile,Lda_articlemat) :
    
    user_interested_articles = []
    
    contents_interest_score = []

    user_preffered_articles = cosine_similarity(userprofile_one.reshape(1,-1),Lda_articlemat)
    
    top_articles = np.sort(user_preffered_articles).flatten()[::-1][:10]
    
    user_interested_articles.append(top_articles)
    
    content_interest_score = (user_interested_articles[0] * 0.4)
    
    return content_interest_score


content_recommended = Content_Recommends_Calculator(userprofile_one,Lda_articlemat)

In [131]:
content_recommended

array([0.33218085, 0.33066672, 0.3303857 , 0.33036246, 0.32982406,
       0.32849261, 0.3283279 , 0.3282375 , 0.32813684, 0.32796289])

In [132]:
#For CF apporach
existing_users = np.random.random_sample(size=(1000,10))   #we take n existing users   
new_user = np.random.random_sample(size=(1,10))            #we take a single new user

In [146]:
def Collaborative_Recommends_Calculator(existing_usr,new_usr) :
    
    collaborative_interest_score = [ ]
    sorted_collaborative_interest = [ ]
    
    collaborative_interest_score = cosine_similarity(existing_users,new_user)
    
    sorted_collaborative_interest = np.argsort(collaborative_interest_score,axis=0)[::-1][:10]
    
    sorted_collaborative_indexes = existing_users[sorted_collaborative_interest]
    
    collab_interest = np.mean(sorted_collaborative_indexes.reshape(-1,10),axis=0)
    
    collaborative_interest_scores = collab_interest*0.6
    
    return collaborative_interest_scores

In [162]:
collab_recommended = Collaborative_Recommends_Calculator(existing_users,new_user)

In [163]:
collab_recommended

array([0.17969108, 0.29012024, 0.09446668, 0.51803656, 0.17033427,
       0.4189131 , 0.39565082, 0.1548523 , 0.19331029, 0.22634371])

In [150]:
def Hybrid_Calculator():
    
    hybrid_interests = np.add(content_recommended,collab_recommendeds)
    
    similar_scores = cosine_similarity(hybrid_interests.reshape(1,10),Lda_articlemat)
    
    recommended_article_address =  np.argsort(similar_scores)[::-1]
    
    return recommended_article_address    

In [157]:
hybrid_recommend_indexes =  Hybrid_Calculator() #we get hyrid interest with our variations of 0.4 content based and 0.6 collab based

In [164]:
for articles in hybrid_recommend_indexes :
    
    print('Recommended-Articles :')
    
    print('\n')
    
    print(title[articles][:no_of_recommends])

Recommended-Articles :


2778    Shashi Tharoor s Scalding Oxford Union Speech ...
1470    Top 10 Technology Innovations at CES 2015  Wea...
3274    Sensex slumps 250 points ahead of Bihar electi...
1559    Rio Olympics 2016 tennis preview  Djokovic vs ...
3418    Kashmir Conflict LIVE  Fresh Ceasefire Violati...
3913    US may lift 40-year-old ban on exporting crude...
2579    Human rights activists mount pressure on Indon...
3304    Why is  Soft State  India Not Taking Action Ag...
4369    India  US launch  ease of doing business  grou...
1977    ATP World Tour Finals results  Federer tops gr...
2811    Kashmir tense after Hizbul Mujahideen militant...
4744    Indian Defence Minister Clears Deck for Acquis...
1913    AIB Roast Controversy and Stringent Censor Boa...
2604    China cites  large differences  over non-NPT s...
2194    PVP  Dil Raju to Remake  Bangalore Days  with ...
4468    Rupee closes at 68 30  near all-time low of 68...
2357    Nobody knows the system better than me 