In [1]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv")

In [3]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

In [5]:
df.value_counts()

id   description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [15]:
vectorizer = TfidfVectorizer(strip_accents='unicode',stop_words='english', min_df=2)

* fit and transform 'description' column with TFIDF

In [16]:
tfidf_matrix = vectorizer.fit_transform(df['description'])

In [17]:
tfidf_matrix.shape

(500, 2634)

* calculate the cosine similarity of each item with every other item in the dataset, 

In [18]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [21]:
cosine_sim

array([[1.        , 0.32725079, 0.20330552, ..., 0.16423651, 0.20677301,
        0.21969475],
       [0.32725079, 1.        , 0.61821744, ..., 0.1232794 , 0.22894503,
        0.19689612],
       [0.20330552, 0.61821744, 1.        , ..., 0.11341077, 0.14528903,
        0.16064326],
       ...,
       [0.16423651, 0.1232794 , 0.11341077, ..., 1.        , 0.13675708,
        0.16426658],
       [0.20677301, 0.22894503, 0.14528903, ..., 0.13675708, 1.        ,
        0.66883065],
       [0.21969475, 0.19689612, 0.16064326, ..., 0.16426658, 0.66883065,
        1.        ]])

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [24]:
#Construct a reverse map of indices and descriptions
indices = pd.Series(df.index, index=df['description']).drop_duplicates()
indices[0]

0

In [22]:
ordered_indices = cosine_sim.argsort() #smallest values to largest
ordered_indices

array([[424, 318,  41, ..., 493,  18,   0],
       [318, 395,  47, ...,   0,   2,   1],
       [318, 395,  47, ..., 494,   1,   2],
       ...,
       [147, 489, 331, ..., 385, 301, 497],
       [318, 395,  68, ..., 461, 499, 498],
       [318, 395,  68, ..., 462, 498, 499]], dtype=int64)

In [None]:
results = {indices:ordered_indices for (key,value) in dictonary.items()}

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [39]:
# Function that takes in id as input and outputs most similar ids
def get_recommendations(id, count):

    # Get the index of the id that matches the id
    idx = indices[id]

    # Get the pairwsie similarity scores of all ids with that id
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the ids based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar ids
    sim_scores = sim_scores[1:count+1]

    # Get the id indices
    id_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar ids
    return df['description'].iloc[id_indices]

* show top 5 the most similar items for item with idem_id = 11

In [40]:
get_recommendations(11, 5)

401    River shorts - River life is mighty fine, but ...
311    Baggies shorts - A loyal partner in grime, Bag...
407    Baggies shorts - Summertime unwinds in a boist...
427    Girl's baggies shorts - An everyday staple for...
465    Baby baggies shorts - About 70% of the planet ...
Name: description, dtype: object

In [41]:
df['description'].iloc[11]

'Baggies shorts - Even Baggies, our most popular shorts for anything, or nothing, occasionally need an update. This season we\'ve increased the inseam length. Their casual fit, quick-drying water-repellent nylon and elasticized waistband with an internal drawstring remain the same as ever. Other features include a polyester mesh lining, a rear snap pocket and front pockets (with self-draining mesh corners) positioned to reduce drag in the water. Inseam (size M) is 7". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Quick-drying nylon with a DWR (durable water repellent) finish</li> <li>Elasticized waistband with internal drawstring; black mesh liner</li> <li>Vertical on-seam side pockets for reduced drag in the water; pocket bags have quick-drain-and-dry mesh corners; snap-closed back pocket</li> <li>"Inseam is 7"""</li></ul><br><br><b>Fabric: </b>4.2-oz 100% nylon with a DWR finish. Lining: 5.2-oz 100% polyester mesh. Recyclable through the Comm