## Importing Required Libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Reading the input file from a GitHub URL

In [11]:
# Load the sample catalogue directly from the raw GitHub URL.
# The later cells read the `id` and `description` columns of this frame.
ds = pd.read_csv(
    "https://raw.githubusercontent.com/jobmthomas12/Content-Based-Recommendation/master/sample-data.csv"
)
# Uncomment to preview the first rows:
# ds.head()

## Computing TF-IDF weights for each term in each document

In [12]:
# Build a TF-IDF representation of every item description.
#   analyzer='word', ngram_range=(1, 3): features are word uni-, bi- and tri-grams.
#   stop_words='english': drop scikit-learn's built-in English stop words.
#   min_df=1: keep every term that occurs in at least one document. This selects
#   the same vocabulary as the former integer `min_df=0`, but recent scikit-learn
#   releases validate parameters and reject integer values below 1.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')

# Sparse matrix with one row per row of `ds` and one column per n-gram.
tfidf_matrix = tfidf.fit_transform(ds['description'])

## Converting the TF-IDF matrix to a DataFrame for better clarity

In [5]:
# Dense, labelled view of the TF-IDF matrix: one row per document, one column
# per n-gram. Densifying a 1-3-gram vocabulary can be large, so this is meant
# for inspection on small data sets.
#
# `get_feature_names_out()` (scikit-learn >= 1.0) replaces `get_feature_names()`,
# which has been removed from current scikit-learn releases.
# `.toarray()` yields a plain ndarray rather than the legacy `np.matrix`
# returned by `.todense()`.
dataframe = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)
# Uncomment to preview the first rows:
# dataframe.head()

## Finding Similarity using Linear Kernel

In [6]:
# Pairwise dot products between all document vectors. TfidfVectorizer
# L2-normalises each row by default, so the dot product of two rows equals
# their cosine similarity; entry [i, j] scores document i against document j.
cosine_similarities = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)
# Uncomment to inspect the result:
# cosine_similarities.shape

## Converting the similarity matrix to an easily queryable model

In [7]:
def build_similarity_model(item_ids, similarities, top_n=98):
    """Map every item id to its most similar other items.

    Parameters
    ----------
    item_ids : sequence
        Item identifiers, positionally aligned with the rows (and columns)
        of ``similarities``.
    similarities : 2-D array
        Square matrix where ``similarities[i][j]`` is the similarity between
        item ``i`` and item ``j``.
    top_n : int, default 98
        Maximum number of neighbours kept per item (98 matches the previous
        hard-coded ``[:-100:-1]`` slice followed by dropping one entry).

    Returns
    -------
    dict
        ``{item_id: [(other_id, score), ...]}`` with neighbours ordered from
        most to least similar. If an id occurs more than once, the last
        occurrence wins.
    """
    lookup = {}
    for position, item_id in enumerate(item_ids):
        scores = similarities[position]
        # Take one extra candidate so that removing the item itself still
        # leaves up to `top_n` neighbours.
        ranked = scores.argsort()[::-1][:top_n + 1]
        # Exclude the item by position rather than assuming it ranks first:
        # with tied scores (e.g. duplicate descriptions) or an all-zero row,
        # another item can precede it in the ranking.
        neighbours = [i for i in ranked if i != position][:top_n]
        lookup[item_id] = [(item_ids[i], scores[i]) for i in neighbours]
    return lookup


# Use positional ids (`to_numpy`) so the lookup stays correct even when `ds`
# does not have a default 0..n-1 index.
model = build_similarity_model(ds['id'].to_numpy(), cosine_similarities)

   

## Testing the Model

In [8]:
# Look up the precomputed neighbours of a single item and print them as
# (item id, similarity score) pairs, most similar first.
recommendations = model['item3']
print(recommendations)

[('item2', 0.682744385495972), ('item4', 0.3737255610399872), ('item1', 0.2062550633230314), ('item6', 0.19938805974926746), ('item5', 0.1959392188554314)]
