In [1]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv")

In [3]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [10]:
tfidf = TfidfVectorizer(stop_words='english')

* fit and transform 'description' column with TFIDF

In [11]:
transformed = tfidf.fit_transform(df['description'])

In [13]:
transformed.shape

(500, 4600)

* calculate the cosine similarity of each item with every other item in the dataset, 

In [14]:
sim =  cosine_similarity(transformed, transformed)

In [15]:
sim

array([[1.        , 0.31005145, 0.18891957, ..., 0.14812382, 0.18521397,
        0.20070706],
       [0.31005145, 1.        , 0.57514356, ..., 0.11131481, 0.2053139 ,
        0.18008906],
       [0.18891957, 0.57514356, 1.        , ..., 0.10043647, 0.12778935,
        0.14410777],
       ...,
       [0.14812382, 0.11131481, 0.10043647, ..., 1.        , 0.11674521,
        0.14302157],
       [0.18521397, 0.2053139 , 0.12778935, ..., 0.11674521, 1.        ,
        0.57835324],
       [0.20070706, 0.18008906, 0.14410777, ..., 0.14302157, 0.57835324,
        1.        ]])

In [32]:
sim.shape

(500, 500)

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [24]:
results = {}

for id in (df['id'] - 1):
    sim_scores = list(enumerate(sim[id]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:]
    results[id] = [i[0] for i in sim_scores]

In [27]:
len(results)

500

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [42]:
def recommender(item_id, count):
    return results.get(item_id)[:count]

* show top 5 the most similar items for item with idem_id = 11

In [43]:
recommender(11, 5)

[401, 407, 427, 311, 465]

In [37]:
df[df['id'] == 5].values

array([[5,
        'Alpine wind jkt - On high ridges, steep ice and anything alpine, this jacket serves as a true "best of all worlds" staple. It excels as a stand-alone shell for blustery rock climbs, cool-weather trail runs and high-output ski tours. And then, when conditions have you ice and alpine climbing, it functions as a lightly insulated windshirt on the approach, as well as a frictionless midlayer when it\'s time to bundle up and tie-in. The polyester ripstop shell with a Deluge DWR (durable water repellent) treatment sheds snow and blocks wind, while the smooth, lightly brushed hanging mesh liner wicks moisture, dries fast, and doesn\'t bind to your baselayers. Superlight stretch-woven underarm panels enhance breathability and allow for unimpaired arm motion, and the two hand pockets close with zippers. A drawcord hem, elastic cuffs, a heat-transfer reflective logo and a regular-coil, center-front zipper with DWR finish round out the features. Updated this season for an impr

In [38]:
df[df['id'] == 401].values

array([[401,
        "Reversible phone home - Travel broadly and surf the world, but don't forget to call your mom. Now reversible for road-trip rejuvenation, the inimitable hooded Phone Home Jacket helps us keep in touch. Made of 100% organic cotton fleece lined with a soft organic cotton jersey. Comes with an extended rib-knit at the waist and cuffs and roomy pockets on the fleecy side. Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Organic cotton fleece hoody lined with organic cotton jersey</li> <li>Rib-knit waist and cuffs</li> <li>Drawstring closure at hood</li> <li>Roomy front pockets</li> <li>Reversible</li></ul><br><br><b>Fabric: </b>10.3-oz 100% organic cotton fleece. Lining: 4.5-oz 100% organic cotton jersey. Recyclable through the Common Threads Recycling Program<br><br><b>Weight: </b>(917 g 31.8 oz)<br><br>Made in India."]],
      dtype=object)

In [39]:
df[df['id'] == 407].values

array([[407,
        'Merino 2 crew - In this soft long-sleeved tee, you can work up a sweat biking to work, catch a bouldering session in the afternoon, and still have people want to be near you at the pub. Inherently soft and comfortable, merino naturally resists odor, moves moisture away from the skin, regulates body temperature, and stretches. Made from our most versatile merino layer for an active lifestyle in warm to cool weather, this lightweight pullover features 73% merino wool spun around an all-recycled polyester (27%) core for a luxurious hand, an improved dry time and enduring strength. The raglan-sleeve design and offset side seams are smooth beneath pack straps. We slow-wash our merino for next-to-skin softness. Machine-wash cold; lay flat to dry. Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>"Slow-washed merino wool resists odor naturally, insulates even when wet, is durable and stretches, provides superior performance and softn

In [46]:
results[11]

[401,
 407,
 427,
 311,
 465,
 324,
 99,
 91,
 394,
 328,
 145,
 301,
 45,
 187,
 304,
 305,
 51,
 42,
 444,
 58,
 283,
 408,
 449,
 14,
 484,
 28,
 329,
 392,
 402,
 208,
 327,
 499,
 426,
 399,
 479,
 96,
 44,
 213,
 15,
 223,
 146,
 186,
 206,
 307,
 453,
 4,
 216,
 43,
 357,
 319,
 464,
 483,
 385,
 435,
 6,
 353,
 266,
 222,
 274,
 255,
 258,
 52,
 54,
 471,
 443,
 352,
 367,
 5,
 167,
 388,
 437,
 119,
 111,
 183,
 286,
 105,
 341,
 203,
 215,
 114,
 415,
 423,
 302,
 2,
 0,
 103,
 231,
 34,
 398,
 140,
 69,
 436,
 472,
 12,
 325,
 279,
 450,
 493,
 452,
 185,
 228,
 142,
 92,
 342,
 178,
 134,
 33,
 83,
 60,
 497,
 418,
 207,
 32,
 381,
 498,
 156,
 30,
 66,
 130,
 211,
 414,
 281,
 3,
 463,
 10,
 411,
 212,
 378,
 106,
 350,
 293,
 16,
 482,
 1,
 158,
 413,
 179,
 386,
 118,
 117,
 349,
 267,
 144,
 361,
 155,
 49,
 18,
 29,
 330,
 95,
 320,
 468,
 457,
 272,
 292,
 477,
 205,
 226,
 323,
 481,
 438,
 280,
 480,
 36,
 50,
 369,
 368,
 75,
 74,
 383,
 393,
 209,
 416,
 245,
 363