In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import pickle as pk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
stemmer = SnowballStemmer('english')
news = pd.read_csv('news_articles.csv')
news.head()
tokenizer = ToktokTokenizer()
import gensim
from gensim import corpora, models
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans



In [2]:
articles_read = [2,7]
no_of_recommends = 5
n_topics = 7

In [126]:
news.head()

Unnamed: 0,Article_Id,Title,Content
0,0,14 dead after bus falls into canal in Telangan...,At least 14 people died and 17 others were inj...
1,1,Pratibha Tiwari molested on busy road Saath ...,TV actress Pratibha Tiwari who is best known ...
2,2,US South Korea begin joint military drill ami...,The United States and South Korea began a join...
3,3,Illegal construction in Bengaluru Will my hou...,The relentless drive by Bengaluru s Bangalore...
4,4,Punjab Gau Rakshak Dal chief held for assaulti...,Punjab Gau Raksha Dal chief Satish Kumar and h...


In [3]:
news = news[['Article_Id','Title','Content']].dropna()
contents = news['Content'].tolist()

In [4]:
contents[0]

'At least 14 people died and 17 others were injured after a bus travelling from Hyderabad to Kakinada plunged into a canal from a bridge on the accident-prone stretch of the Hyderabad-Khammam highway in Telangana early Monday morning \r\nThe injured were admitted to the Government General Hospital for treatment \r\n\r\n\r\nSeven people died on the spot and the others succumbed to injuries while undergoing treatment at the hospital  The passengers belonged to the East and West Godavari districts of Andhra Pradesh \r\nThe bus  owned by private operator Yatra Genie  commenced its journey from Hyderabad at 11 30 p m  on Sunday  Khammam Superintendent of Police Shah Nawaz Khan was quoted by the Hindustan Times as saying \r\nThe accident happened around 2 30 a m  when the driver slammed the brakes to avoid a collision with another vehicle coming from the opposite direction on a bridge over Nagarjunsagar project left canal at Nayankangudem village in Khammam district  the daily reported  The 

In [5]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]',' ',document)
    tokens  = nltk.word_tokenize(document)
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])   #stemming the tokenized corpus
    return cleaned_article

In [6]:
cleaned_articles = list(map(clean_tokenize,contents))

In [7]:
cleaned_articles[0]

'at least 14 peopl die and 17 other were injur after a bus travel from hyderabad to kakinada plung into a canal from a bridg on the accident-pron stretch of the hyderabad-khammam highway in telangana earli monday morn the injur were admit to the govern general hospit for treatment seven peopl die on the spot and the other succumb to injuri while undergo treatment at the hospit the passeng belong to the east and west godavari district of andhra pradesh the bus own by privat oper yatra geni commenc it journey from hyderabad at 11 30 p m on sunday khammam superintend of polic shah nawaz khan was quot by the hindustan time as say the accid happen around 2 30 a m when the driver slam the brake to avoid a collis with anoth vehicl come from the opposit direct on a bridg over nagarjunsagar project left canal at nayankangudem villag in khammam district the daili report the bus hit the parapet wall of the bridg and nose-div into the canal the driver of the bus was appar drive at high speed due t

In [12]:
user_articles = ' '.join(cleaned_articles[i] for i in articles_read)

In [13]:
tfidf_matrix = TfidfVectorizer(stop_words='english',min_df=2)
article_tfidf_matrix = tfidf_matrix.fit_transform(cleaned_articles)
article_tfidf_matrix

<4831x16009 sparse matrix of type '<class 'numpy.float64'>'
	with 468648 stored elements in Compressed Sparse Row format>

In [14]:
user_articles_tfidf_matrix = tfidf_matrix.transform([user_articles])
user_articles_tfidf_matrix

<1x16009 sparse matrix of type '<class 'numpy.float64'>'
	with 188 stored elements in Compressed Sparse Row format>

In [18]:
article_similarity_matrix = cosine_similarity(article_tfidf_matrix,user_articles_tfidf_matrix)

In [19]:
article_similarity_matrix.shape

(4831, 1)

In [20]:
recommended_article_ids = np.argsort(article_similarity_matrix,axis=0)[::-1].flatten()

In [21]:
recommended_article_ids

array([   2,    7, 3326, ...,  210,  622,  262], dtype=int64)

In [224]:
final_recommended_articles_id = [article_id for article_id in recommended_article_ids if article_id not in articles_read][:no_of_recommends]

In [225]:
final_recommended_articles_id

[3326, 2862, 2808, 2724, 2950]

In [226]:
for i in final_recommended_articles_id:
    print (news["Title"][news["Article_Id"]==i])

3326    US  China to  fully implement  sanctions again...
Name: Title, dtype: object
2862    J K  PM Modi appeals for peace in Valley  assu...
Name: Title, dtype: object
2808    J K  CM Mufti blames  vested interests  for Ka...
Name: Title, dtype: object
2724    PM Modi says at all-party meeting that PoK is ...
Name: Title, dtype: object
2950    Kashmir  Death toll rises to 8 in protests ove...
Name: Title, dtype: object


In [36]:
print('Articles_Read by user :')
print(news.loc[news['Article_Id'].isin(articles_read)]['Title'])

print('Recommended-Articles for user :')

print(news.loc[news['Article_Id'].isin(final_recommended_articles_id)]['Title'])

Articles_Read by user :
2    US  South Korea begin joint military drill ami...
7    Dialogue crucial in finding permanent solution...
Name: Title, dtype: object
Recommended-Articles for user :
2724    PM Modi says at all-party meeting that PoK is ...
2808    J K  CM Mufti blames  vested interests  for Ka...
2862    J K  PM Modi appeals for peace in Valley  assu...
2950    Kashmir  Death toll rises to 8 in protests ove...
3326    US  China to  fully implement  sanctions again...
Name: Title, dtype: object


# TOPIC-MODELLING 

In [70]:
lda = LatentDirichletAllocation(n_components=n_topics,random_state=0)

In [71]:
article_vocab = lda.fit_transform(article_tfidf_matrix)

In [75]:
kmeans = KMeans(n_clusters=7,random_state=0)

In [76]:
article_cluster = kmeans.fit(article_vocab)

In [77]:
labels = kmeans.labels_

In [239]:
final_recommended_articles_id = [article_id for article_id in article_vocab[labels]][:no_of_recommends]

In [240]:
final_recommended_articles_id

[array([0.01444536, 0.01480156, 0.88979321, 0.03722346, 0.01444537,
        0.01468933, 0.01460172]),
 array([0.01460517, 0.79366538, 0.1331954 , 0.01466423, 0.01460516,
        0.01463757, 0.01462709]),
 array([0.01444536, 0.01480156, 0.88979321, 0.03722346, 0.01444537,
        0.01468933, 0.01460172]),
 array([0.01444536, 0.01480156, 0.88979321, 0.03722346, 0.01444537,
        0.01468933, 0.01460172]),
 array([0.01444536, 0.01480156, 0.88979321, 0.03722346, 0.01444537,
        0.01468933, 0.01460172])]

In [355]:
print('Recommended-Articles :\n')
recommends = []
for i in range (7) :
    recommends.append(news["Title"][labels==i][:3])

Recommended-Articles :



In [356]:
recommends

[51     Dhanush set to make Hollywood debut  to share ...
 129       Kathakali movie review  Live audience response
 142     Amar Akbar Anthony  review  Fun filled family...
 Name: Title, dtype: object,
 31    Ajith set to take up Vishnuvardhan s historica...
 32     Theri  teaser  Will Vijay starrer clip reach ...
 36         Anjala  movie review  Live audience response
 Name: Title, dtype: object,
 30    Box office collection   Jil Jung Juk    Deadpo...
 40     Theri  teaser  Video of Vijay starrer shatter...
 43     Theri  teaser  Video of Vijay starrer crosses...
 Name: Title, dtype: object,
 1     Pratibha Tiwari molested on busy road   Saath ...
 35     Jil Jung Juk  movie review  Live audience res...
 75    Jallikattu row  I m not a brand ambassador of ...
 Name: Title, dtype: object,
 6      Infosys shares likely to fall on Tuesday after...
 130           Gethu movie review  Live audience response
 238     Su Su Sudhi Vathmeekam  review  Live audience...
 Name: Title, dtype: ob