#                  CONTENT BASED FILTERING USING NEWS DATASET

##                         IMPORTING THE NECESSARY LIBRARIES

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
# Load the news articles into a DataFrame. The relative path is resolved against
# the kernel's current working directory.
NEWS_CSV = "newsdata.csv"
ds = pd.read_csv(NEWS_CSV)

In [3]:
# Bare expression as the last line of the cell: Jupyter renders the DataFrame
# as the cell output (shown below).
ds

Unnamed: 0,ID,category,description
0,0,DIVORCE,Prince William Marriage: New Biography Claims ...
1,1,WELLNESS,WATCH: Exclusive Becoming Fearless Video Series
2,2,WELLNESS,15 Things You Should Give Up To Be Happy
3,3,WELLNESS,"Lana Kuykendall\, Mom With Flesh-Eating Diseas..."
4,4,TRAVEL,Airbus A320 To Offer Extra-Wide Seats For Amer...
...,...,...,...
10844,10844,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
10845,10845,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...
10846,10846,SPORTS,"Giants Over Patriots\, Jets Over Colts Among ..."
10847,10847,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...


# TF-IDF VECTORIZER (Term Frequency-Inverse Document Frequency): converts text strings into numeric feature vectors


### TF = (number of times the term appears in the document) / (total number of terms in the document)
### IDF = $\log_e$(total number of documents / number of documents containing the term)
### Score = TF × IDF


In [4]:
# Build TF-IDF features from the article descriptions.
#   analyzer='word', ngram_range=(1, 3): features are word uni-, bi- and tri-grams.
#   stop_words='english': scikit-learn's built-in English stop-word list is removed.
#   min_df=1: keep every term that occurs in at least one document. The previous
#             value, the integer 0, selected exactly the same vocabulary but is
#             outside the documented range (an integer min_df must be >= 1) and is
#             rejected by the parameter validation in recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
# Sparse matrix with one row per article and one column per n-gram in the vocabulary.
tfidf_matrix = tf.fit_transform(ds['description'])

# COSINE SIMILARITY
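### Measures how similar one data point is to another (in simple terms, the similarity between two vectors).
### Cosine similarity is used here because the score increases as the angle between the two vectors decreases.
### Because `TfidfVectorizer` L2-normalises every row by default, the dot product computed by `linear_kernel` is exactly the cosine similarity.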

In [5]:
# TfidfVectorizer L2-normalises every row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity between articles.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# The original slice [:-100:-1] kept the 99 best-scoring rows and then dropped the
# first one, i.e. at most 98 neighbours per article. Keep that list size.
MAX_NEIGHBOURS = 98

# results maps an article ID to a list of (similarity, neighbour ID) tuples,
# ordered from the most to the least similar neighbour.
item_ids = ds['ID'].tolist()
results = {}
# Rows of the similarity matrix are positional, so iterate by position instead of
# relying on the DataFrame index labels happening to equal 0..n-1.
for pos, item_id in enumerate(item_ids):
    scores = cosine_similarities[pos]
    # Take one candidate more than needed so that removing the article itself
    # still leaves MAX_NEIGHBOURS entries.
    candidates = scores.argsort()[:-(MAX_NEIGHBOURS + 2):-1]
    # Exclude the article by position rather than assuming it is ranked first:
    # with tied scores (duplicate descriptions, or a row whose terms were all
    # removed and therefore scores 0 against everything) it need not be at the
    # front, and blindly dropping element 0 would discard a real neighbour
    # while keeping the article in its own recommendations.
    neighbours = [i for i in candidates if i != pos][:MAX_NEIGHBOURS]
    results[item_id] = [(scores[i], item_ids[i]) for i in neighbours]

In [6]:
def item(ID):
    """Return the title text of the article whose ``ID`` column equals ``ID``.

    The description of the first matching row in ``ds`` is looked up and only
    the part before the first ' - ' separator is returned; a description
    without that separator is returned unchanged.

    Raises ``IndexError`` when no row carries the requested ID.
    """
    matching = ds.loc[ds['ID'] == ID, 'description'].tolist()
    headline, _, _ = matching[0].partition(' - ')
    return headline
# Just reads the results out of the dictionary.
def recommend(category, num):
    """Print the first ``num`` stored recommendations for one article.

    Parameters
    ----------
    category : the key looked up in ``results``. Despite its name this is an
        article ID (``results`` is keyed by the ``ID`` column), not a news
        category.
    num : maximum number of recommendations to print.

    Each entry of ``results[category]`` is a ``(score, article ID)`` pair; one
    line is printed per entry with the title returned by ``item`` and the score.
    Nothing is returned. An unknown key raises ``KeyError``.
    """
    for score, rec_id in results[category][:num]:
        print(f"Recommended: {item(rec_id)} (score:{score})")


### Here `category` is the ID of the input article (despite its name, it is not a news category) and `num` is the number of recommendations to print

In [46]:
# Print the five closest matches for the article whose ID is 1. The parameter is
# called `category`, but it is used as the key into `results`, i.e. an article ID.
query_id, top_k = 1, 5
recommend(category=query_id, num=top_k)
    

Recommended: WATCH: Becoming Fearless About Money (score:0.16475115317345368)
Recommended: WATCH: Becoming Fearless About Relationships (score:0.1613653360707772)
Recommended: INET Video Series: Re-Examining Research on Financial Economics (score:0.1353716007546778)
Recommended: What's Your 'Fearless Charm'? (score:0.12557043023167025)
Recommended: LISTEN: Your Fearless Playlist -- Be Fearless In Love! (score:0.12049591513144521)


## Reference link: https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

### Dataset link: https://www.kaggle.com/rmisra/news-category-dataset#News_Category_Dataset_v2.json
