In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import pickle as pk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
stemmer = SnowballStemmer('english')
news = pd.read_csv('news_articles.csv')
news.head()
tokenizer = ToktokTokenizer()

In [43]:
articles_read = [2,7]
no_of_recommends = 5

In [44]:
news = news[['Article_Id','Title','Content']].dropna()
contents = news["Content"].tolist()

In [45]:
contents[0]

'At least 14 people died and 17 others were injured after a bus travelling from Hyderabad to Kakinada plunged into a canal from a bridge on the accident-prone stretch of the Hyderabad-Khammam highway in Telangana early Monday morning \r\nThe injured were admitted to the Government General Hospital for treatment \r\n\r\n\r\nSeven people died on the spot and the others succumbed to injuries while undergoing treatment at the hospital  The passengers belonged to the East and West Godavari districts of Andhra Pradesh \r\nThe bus  owned by private operator Yatra Genie  commenced its journey from Hyderabad at 11 30 p m  on Sunday  Khammam Superintendent of Police Shah Nawaz Khan was quoted by the Hindustan Times as saying \r\nThe accident happened around 2 30 a m  when the driver slammed the brakes to avoid a collision with another vehicle coming from the opposite direction on a bridge over Nagarjunsagar project left canal at Nayankangudem village in Khammam district  the daily reported  The 

In [46]:
def clean_tokenize(document):
    document = re.sub('[^\w_\s-]',' ',document)
    tokens  = nltk.word_tokenize(document)
    cleaned_article = ' '.join([stemmer.stem(item) for item in tokens])   #stemming the tokenized corpus
    return cleaned_article

In [47]:
cleaned_articles = list(map(clean_tokenize,contents)) 

In [48]:
cleaned_articles[0]

'at least 14 peopl die and 17 other were injur after a bus travel from hyderabad to kakinada plung into a canal from a bridg on the accident-pron stretch of the hyderabad-khammam highway in telangana earli monday morn the injur were admit to the govern general hospit for treatment seven peopl die on the spot and the other succumb to injuri while undergo treatment at the hospit the passeng belong to the east and west godavari district of andhra pradesh the bus own by privat oper yatra geni commenc it journey from hyderabad at 11 30 p m on sunday khammam superintend of polic shah nawaz khan was quot by the hindustan time as say the accid happen around 2 30 a m when the driver slam the brake to avoid a collis with anoth vehicl come from the opposit direct on a bridg over nagarjunsagar project left canal at nayankangudem villag in khammam district the daili report the bus hit the parapet wall of the bridg and nose-div into the canal the driver of the bus was appar drive at high speed due t

In [49]:
user_articles = ' '.join(cleaned_articles[i] for i in articles_read) 

In [50]:
user_articles

'the unit state and south korea began a joint militari drill on monday which prompt threat from north korea the latter has late receiv strong critic worldwid for defi sanction from the unit nation secur council unsc by launch sever ballist missil such action have led to tighter sanction for north korea by the un north korea consid the joint militari drill as prepar for invas and has threaten a pre-empt nuclear strike if the u s and south korea continu the oper it had also conduct a nuclear test in januari which further isol it the ulchi freedom guardian exercis will continu till sept 2 and around 25 000 u s troop are expect to join it the us-l un command militari armistic commiss said that it had notifi the north korean armi that the joint militari drill between the two nation was not provoc from this moment the first-strik combin unit of the korean peopl s armi keep themselv fulli readi to mount a preemptiv retaliatori strike at all enemi attack group involv in ulji freedom guardian a

In [51]:
tfidf_matrix = TfidfVectorizer(stop_words='english',min_df=2)
article_tfidf_matrix = tfidf_matrix.fit_transform(cleaned_articles)
article_tfidf_matrix

<4831x16009 sparse matrix of type '<class 'numpy.float64'>'
	with 468648 stored elements in Compressed Sparse Row format>

In [52]:
user_article_tfidf_vector = tfidf_matrix.transform([user_articles])
user_article_tfidf_vector

<1x16009 sparse matrix of type '<class 'numpy.float64'>'
	with 188 stored elements in Compressed Sparse Row format>

In [53]:
user_article_tfidf_vector

<1x16009 sparse matrix of type '<class 'numpy.float64'>'
	with 188 stored elements in Compressed Sparse Row format>

In [54]:
articles_similarity_score = cosine_similarity(article_tfidf_matrix,user_article_tfidf_vector)

In [55]:
recommeded_articles_ids = np.argsort(articles_similarity_score,axis=0)[::-1].flatten()

In [56]:
articles_similarity_score

array([[0.03255244],
       [0.00861364],
       [0.78934896],
       ...,
       [0.01131361],
       [0.01259012],
       [0.02664573]])

In [57]:
recommeded_articles_ids

array([   2,    7, 3326, ...,  210,  622,  262], dtype=int64)

In [58]:
final_recommended_articles_id = [article_id for article_id in recommeded_articles_ids
                                if article_id not in articles_read][:no_of_recommends]

In [59]:
final_recommended_articles_id

[3326, 2862, 2808, 2724, 2950]

In [67]:
print('articles_read : ')
print (news.loc[news['Article_Id'].isin(articles_read)]['Title'])
print('recommended articles :')
print (news.loc[news['Article_Id'].isin(final_recommended_articles_id)]['Title'])

articles_read : 
2    US  South Korea begin joint military drill ami...
7    Dialogue crucial in finding permanent solution...
Name: Title, dtype: object
recommended articles :
2724    PM Modi says at all-party meeting that PoK is ...
2808    J K  CM Mufti blames  vested interests  for Ka...
2862    J K  PM Modi appeals for peace in Valley  assu...
2950    Kashmir  Death toll rises to 8 in protests ove...
3326    US  China to  fully implement  sanctions again...
Name: Title, dtype: object
