In [3]:
import numpy as np
import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
import  sklearn
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Normalizer

# Tunable settings used by the cells below.
no_of_recommends = 5   # how many articles to recommend per user profile
n_topics = 8           # number of LDA topics

stemmer = SnowballStemmer('english')
tokenizer = ToktokTokenizer()
stops = set(stopwords.words('english'))

# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
news = pd.read_csv('/home/linu/news_articles.csv')

# Keep only the columns that are used and drop incomplete rows.
# reset_index(drop=True) matters: `contents` (and every matrix derived from it)
# is addressed by row *position*, so the frame's labels must equal positions,
# otherwise label lookups such as news['Title'][k] point at a different article
# as soon as dropna() has removed a row.
news = news[['Article_Id','Title','Content']].dropna().reset_index(drop=True)
contents = news["Content"].tolist()
title = news['Title']
article_id = news['Article_Id']

In [4]:
def clean_tokenize(document):
    """Strip punctuation from ``document`` and stem every remaining token.

    Every character that is not a word character, whitespace or a hyphen is
    replaced by a space. The result is tokenised with ``nltk.word_tokenize``,
    each token is stemmed with the module-level ``stemmer``, and the stems are
    joined back into a single space-separated string.

    Parameters
    ----------
    document : str
        Raw article text.

    Returns
    -------
    str
        Space-separated stemmed tokens.
    """
    # Raw string literal: in a normal string '\w' and '\s' are invalid escape
    # sequences (DeprecationWarning, SyntaxWarning from Python 3.12 on).
    document = re.sub(r'[^\w_\s-]', ' ', document)
    tokens = nltk.word_tokenize(document)
    return ' '.join(stemmer.stem(token) for token in tokens)

# One cleaned, stemmed string per article, in the same order as `contents`.
cleaned_articles = [clean_tokenize(article_text) for article_text in contents]

In [20]:
# Lazy (position, text) iterator over the cleaned corpus; nothing in this cell consumes it.
article_vocab = enumerate(cleaned_articles)

# Flatten the whole corpus into a single token stream.
total_words = []
for article_text in cleaned_articles:
    total_words.extend(nltk.word_tokenize(article_text))

counts = Counter(total_words)

# Map every distinct token to an id, in first-seen order.
vocab = {word: idx for idx, word in enumerate(counts)}

# Drop English stop words, then renumber so the ids are contiguous again.
stops_removed = [word for word in vocab if word not in stops]
final_vocab = {word: idx for idx, word in enumerate(stops_removed)}

# TF-IDF weights restricted to the fixed vocabulary built above.
tf_idf = TfidfVectorizer(vocabulary=final_vocab, min_df=1)
article_vocabulary_matrix = tf_idf.fit_transform(cleaned_articles)

# Article-by-topic matrix: one row per article, n_topics columns.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=1, random_state=0)
Lda_articlemat = lda.fit_transform(article_vocabulary_matrix)

# Whitespace-split tokens per article; their count is used later to estimate reading time.
wordtokens_article = [article_text.split() for article_text in cleaned_articles]

In [6]:
# Token counts of three sample articles (positions 0, 2 and 3).
print(*(len(wordtokens_article[position]) for position in (0, 2, 3)))

222 227 325


In [7]:
# First row of the LDA output: the n_topics weights computed for the first article.
Lda_articlemat[0]

array([0.01364421, 0.01364359, 0.90449491, 0.01363855, 0.01363802,
       0.0136523 , 0.01364298, 0.01364544])

In [17]:
# a function to create user profiles

def user_profiler(wordtokens, article_read, article_time, wordPer_second=5):
    """Build a user profile as a time-weighted sum of article topic vectors.

    Each article's expected reading time is ``len(tokens) / wordPer_second``.
    The article's topic vector is scaled by the ratio of the time the user
    actually spent on it to that expected time, and the scaled vectors are
    summed.

    Parameters
    ----------
    wordtokens : sequence of sequences
        Token list of every article the user read.
    article_read : sequence of array-like
        Topic vector of every article, in the same order as ``wordtokens``.
    article_time : sequence of numbers
        Time spent on every article, in the same order as ``wordtokens``
        (presumably seconds, given ``wordPer_second`` -- confirm with caller).
    wordPer_second : number, optional
        Assumed reading speed; defaults to 5, the value previously hard-coded.

    Returns
    -------
    numpy.ndarray or int
        The summed profile vector, or ``0`` when no article contributed
        (same as ``sum([])`` in the previous implementation).

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    if not (len(wordtokens) == len(article_read) == len(article_time)):
        raise ValueError(
            "wordtokens, article_read and article_time must have the same length"
        )

    contributions = []
    for tokens, topic_vector, time_spent in zip(wordtokens, article_read, article_time):
        if len(tokens) == 0:
            # An empty article has no expected reading time; dividing by it
            # used to raise ZeroDivisionError. It carries no signal, so skip it.
            continue

        expected_time = len(tokens) / wordPer_second
        interest = time_spent / expected_time
        contributions.append(np.asarray(topic_vector, dtype=float) * interest)

    return sum(contributions)


def _profile_from_history(article_positions, time_spent):
    """Build one user profile from the articles at ``article_positions`` and the time spent on each."""
    return user_profiler(
        [wordtokens_article[position] for position in article_positions],
        [Lda_articlemat[position] for position in article_positions],
        time_spent,
    )


# Three simulated readers: (article row positions, time spent on each article).
userProfile_One = _profile_from_history([600, 99, 120], [120, 60, 30])
userProfile_Two = _profile_from_history([900, 500, 3000], [111, 120, 180])
userProfile_Three = _profile_from_history([600, 4830, 390], [200, 120, 100])

userprofile_List = [userProfile_One, userProfile_Two, userProfile_Three]
print(userProfile_One)

[16.15273519  1.48287497  1.48230629 31.24012811  1.81714006  1.4823308
  1.48262537  1.48392406]


In [18]:
# Prints the first user's profile vector again (same value the previous cell printed).
print(userProfile_One)

[16.15273519  1.48287497  1.48230629 31.24012811  1.81714006  1.4823308
  1.48262537  1.48392406]


In [19]:
# Scale every user profile (one row each) to unit length.
# Normalizer's constructor takes the norm name, not the data, so the matrix has
# to go through fit_transform; passing it to Normalizer(...) only built an
# unfitted transformer object and normalised nothing.
normalized_profiles = Normalizer().fit_transform(csr_matrix(userprofile_List))

In [19]:
def similiar(profile_list):
    """Recommend articles for every user profile in ``profile_list``.

    Parameters
    ----------
    profile_list : sequence of 1-D arrays, 2-D array, or scipy sparse matrix
        One vector per user, with one entry per column of ``Lda_articlemat``.

    Returns
    -------
    list of numpy.ndarray
        For each profile, the row positions in ``Lda_articlemat`` of the
        ``no_of_recommends`` articles with the highest cosine similarity,
        most similar first.
    """
    # Use the argument that was passed in (the previous version ignored it and
    # always read the global userprofile_List). Sparse input is densified so
    # that rows can be iterated uniformly.
    if hasattr(profile_list, "toarray"):
        profile_list = profile_list.toarray()
    profiles = np.atleast_2d(np.asarray(profile_list, dtype=float))
    if profiles.size == 0:
        return []

    recommendations = []
    for profile in profiles:
        scores = cosine_similarity(profile.reshape(1, -1), Lda_articlemat)
        ranked = np.argsort(scores).flatten()[::-1]   # best match first
        recommendations.append(ranked[:no_of_recommends])

    return recommendations


# Cosine similarity does not depend on vector length, so the raw profiles give
# the same ranking as unit-length ones.
similarityscore = similiar(userprofile_List)

In [20]:
# One array per user profile holding article row positions (not similarity
# values), ordered from most to least similar.
similarityscore

[array([1556, 1823, 2160, 1798, 1510]),
 array([ 688, 1679,  893, 3125,   68]),
 array([4744, 1977,  284, 4353,  195])]

In [21]:
#news['Title'][similarityscore[2]]

In [22]:
for recommended_positions in similarityscore:
    print('\n')
    print('Recommended Articles :')
    print('\n')
    # The arrays hold row positions into Lda_articlemat, so the lookup must be
    # positional (.iloc). Plain [] is label-based and selects different rows
    # whenever the frame's index labels differ from positions (e.g. after dropna()).
    print(news['Title'].iloc[recommended_positions])



Recommended Articles :


1556    Another blow in Rio Olympics preparations  as ...
1823    Saina Nehwal loses China Open final against Li...
2160     I  Box Office Collection  Vikram s Flick Gros...
1798    All England Open  Saina Nehwal clinches quarte...
1510    India at Rio Day 2 wrap  Gymnast Dipa Karmakar...
Name: Title, dtype: object


Recommended Articles :


688      Adi Kapyare Kootamani   Dhyan Sreenivasan-Nam...
1679    Rio Olympics  Can India s badminton star Saina...
893     Samsung Galaxy On5  2016  appears on Geekbench...
3125    Srinagar on alert  3 policemen killed in two m...
68      Kapu agitation violence  Pawan Kalyan condemns...
Name: Title, dtype: object


Recommended Articles :


4744    Indian Defence Minister Clears Deck for Acquis...
1977    ATP World Tour Finals results  Federer tops gr...
284     Vedalam  Vedhalam  box office collection  Ajit...
4353    BJP-RSS Meet  Patel Quota  OROP Row  Banglades...
195     Tollywood 2015  Top 10  hit blockbuster  high

In [13]:
# Title stored under index label 12.
# NOTE(review): this is a label lookup; it matches row position 12 only if the
# frame's index is contiguous from 0 -- confirm.
news['Title'][12]

'Tamil Nadu  2 students dead  over 40 injured after live wire falls on bus'

In [14]:
# Column index of the largest value in each row of Lda_articlemat,
# i.e. the highest-weighted topic for every article.
np.argmax(Lda_articlemat,axis=1)

array([2, 1, 7, ..., 4, 1, 2])