In [73]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [74]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv", index_col=0)

In [75]:
df

Unnamed: 0_level_0,description
id,Unnamed: 1_level_1
1,Active classic boxers - There's a reason why o...
2,Active sport boxer briefs - Skinning up Glory ...
3,Active sport briefs - These superbreathable no...
4,"Alpine guide pants - Skin in, climb ice, switc..."
5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...
496,Cap 2 bottoms - Cut loose from the maddening c...
497,Cap 2 crew - This crew takes the edge off fick...
498,All-time shell - No need to use that morning T...
499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

In [76]:
df.shape

(500, 1)

We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['description'] = df['description'].fillna('')



* fit and transform 'description' column with TFIDF

In [78]:
v_description = vectorizer.fit_transform(df['description'])

In [79]:
v_description.shape

(500, 4600)

In [80]:
vectorizer.get_feature_names()



['000',
 '03',
 '10',
 '100',
 '1000',
 '1021',
 '1027',
 '103',
 '1038',
 '1055',
 '106',
 '1070',
 '108',
 '109',
 '1096',
 '11',
 '110',
 '112',
 '1125',
 '1128',
 '1139',
 '115',
 '116',
 '1171',
 '118',
 '1188',
 '11c',
 '12',
 '1200',
 '121',
 '1234',
 '124',
 '125',
 '126',
 '127',
 '129',
 '1298',
 '12d',
 '13',
 '130',
 '132',
 '1324',
 '1327',
 '133',
 '1341',
 '135',
 '138',
 '14',
 '141',
 '144',
 '15',
 '150',
 '152',
 '153',
 '155',
 '156',
 '158',
 '16',
 '164',
 '165',
 '167',
 '168',
 '17',
 '170',
 '172',
 '173',
 '174',
 '175',
 '178',
 '18',
 '181',
 '184',
 '187',
 '189',
 '19',
 '190',
 '192',
 '193',
 '195',
 '1950',
 '198',
 '199',
 '20',
 '200',
 '2009',
 '201',
 '204',
 '206',
 '207',
 '21',
 '210',
 '213',
 '216',
 '219',
 '22',
 '221',
 '222',
 '223',
 '225',
 '229',
 '23',
 '230',
 '232',
 '233',
 '239',
 '24',
 '242',
 '243',
 '245',
 '248',
 '25',
 '250',
 '253',
 '256',
 '257',
 '259',
 '26',
 '260',
 '262',
 '265',
 '266',
 '269',
 '27',
 '271',
 '2711'

* calculate the cosine similarity of each item with every other item in the dataset, 

In [95]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_description = cosine_similarity(v_description, v_description)

cosine_description

array([[1.        , 0.31005145, 0.18891957, ..., 0.14812382, 0.18521397,
        0.20070706],
       [0.31005145, 1.        , 0.57514356, ..., 0.11131481, 0.2053139 ,
        0.18008906],
       [0.18891957, 0.57514356, 1.        , ..., 0.10043647, 0.12778935,
        0.14410777],
       ...,
       [0.14812382, 0.11131481, 0.10043647, ..., 1.        , 0.11674521,
        0.14302157],
       [0.18521397, 0.2053139 , 0.12778935, ..., 0.11674521, 1.        ,
        0.57835324],
       [0.20070706, 0.18008906, 0.14410777, ..., 0.14302157, 0.57835324,
        1.        ]])

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [101]:
import numpy as np

results = {}
for id in df.index:
    similarities = list(enumerate(cosine_description[id-1]))
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    results[id] = [similarity[0] for similarity in similarities] 

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [109]:
def recommender(item_id, count):
    sim_scores = results[item_id]
    sim_scores = sim_scores[1:count]
    return sim_scores

* show top 5 the most similar items for item with idem_id = 11

In [111]:
recommender(30, 10)

[145, 91, 99, 394, 117, 118, 349, 482, 461]