# Imports:

In [2]:
import numpy as np
import pandas as pd 

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances

**Read in the data:**

In [3]:
# Load the ratings table. The preview below shows a leftover 'Unnamed: 0' column next to user_id, rating and title.
recommend = pd.read_csv('data/recommend.csv')

In [4]:
# Preview the first rows of the ratings table.
recommend.head()

Unnamed: 0,user_id,rating,title
0,29403,5,Where the Wild Things Are
1,37112,5,Harry Potter and the Deathly Hallows (Harry Po...
2,16114,4,Sphere
3,50684,4,"The Desert Spear (Demon Cycle, #2)"
4,43944,3,"Y: The Last Man, Vol. 1: Unmanned"


I am creating an item-based (the books) collaborative recommender. I will set up my pivot table in the following way:

- The title will be the index
- The user_id will be the column
- The rating will be the value

In [5]:
# Title x user grid of ratings; (title, user) pairs without a rating are filled with 0.
# pandas' default aggregation (mean) applies if a pair occurs more than once.
recommend_piv = recommend.pivot_table(
    index='title',
    columns='user_id',
    values='rating',
    fill_value=0,
)

recommend_piv.head()

user_id,1,2,3,4,5,7,8,9,10,11,...,53413,53414,53415,53416,53417,53418,53419,53421,53422,53423
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#GIRLBOSS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"'Tis (Frank McCourt, #2)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Dimensions of the pivot: titles along the rows, user_ids along the columns.
recommend_piv.shape

(8018, 42659)

Create a sparse matrix:

In [7]:
# Convert the pivot's values to a compressed sparse row (CSR) matrix.
pivot_values = recommend_piv.to_numpy()
recommend_sparse = sparse.csr_matrix(pivot_values)

Calculate cosine distance (1 − cosine similarity):

In [8]:
# Pairwise cosine distance between the title rows.
# distance = 1 - similarity, so a distance of 1 is a similarity of 0.
distance = pairwise_distances(
    recommend_sparse,
    Y=None,
    metric='cosine',
)

In [9]:
# Expect a square array with one row and one column per title.
distance.shape

(8018, 8018)

In [10]:
# Peek at the top-left 10x10 corner, rounded to two decimals.
distance[:10, :10].round(2)

array([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 0., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 0., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 0., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]])

Create the recommender DataFrame

In [11]:
# Keep the pivot's row labels (book titles) to label both axes of the distance matrix.
titles = recommend_piv.index
titles

Index([' Angels (Walsh Family, #3)', '#GIRLBOSS', ''Salem's Lot',
       ''Tis (Frank McCourt, #2)',
       '10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works',
       '100 Bullets, Vol. 1: First Shot, Last Call', '100 Love Sonnets',
       '100 Selected Poems', '10th Anniversary (Women's Murder Club, #10)',
       '11 Birthdays (Willow Falls, #1)',
       ...
       'Zero to One: Notes on Startups, or How to Build the Future',
       'Zita the Spacegirl (Zita the Spacegirl, #1)', 'Zodiac', 'Zone One',
       'Zorba the Greek', 'Zorro',
       'for colored girls who have considered suicide/when the rainbow is enuf',
       'god is Not Great: How Religion Poisons Everything',
       'ttyl (Internet Girls, #1)', 'xxxHolic, Vol. 1 (xxxHOLiC, #1)'],
      dtype='object', name='title', length=8018)

In [12]:
# Label the distance matrix with book titles on both axes so it can be queried by name.
recommend_df = pd.DataFrame(
    data=distance,
    index=titles,
    columns=titles,
)
recommend_df.head()

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,"10th Anniversary (Women's Murder Club, #10)","11 Birthdays (Willow Falls, #1)",...,"Zero to One: Notes on Startups, or How to Build the Future","Zita the Spacegirl (Zita the Spacegirl, #1)",Zodiac,Zone One,Zorba the Greek,Zorro,for colored girls who have considered suicide/when the rainbow is enuf,god is Not Great: How Religion Poisons Everything,"ttyl (Internet Girls, #1)","xxxHolic, Vol. 1 (xxxHOLiC, #1)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
#GIRLBOSS,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Salem's Lot,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"'Tis (Frank McCourt, #2)",1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Try it out...

In [13]:
# List every title containing "a court of", ignoring case.
titles[
    titles.str.lower().str.contains(pat='a court of')
]

Index(['A Court of Mist and Fury (A Court of Thorns and Roses, #2)',
       'A Court of Thorns and Roses (A Court of Thorns and Roses, #1)',
       'A Court of Wings and Ruin (A Court of Thorns and Roses, #3)'],
      dtype='object', name='title')

In [14]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df['A Court of Thorns and Roses (A Court of Thorns and Roses, #1)']
    .sort_values(ascending=True)
    .head(n=6)
)

title
A Court of Thorns and Roses (A Court of Thorns and Roses, #1)           0.000000
Dawn of the Dreadfuls (Pride and Prejudice and Zombies, #0.5)           0.822441
Percy Jackson's Greek Gods (A Percy Jackson and the Olympians Guide)    0.847277
Nimona                                                                  0.890388
A Gathering of Shadows (Shades of Magic, #2)                            0.895949
The Crown of Embers (Fire and Thorns, #2)                               0.899734
Name: A Court of Thorns and Roses (A Court of Thorns and Roses, #1), dtype: float64

In [15]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df['A Court of Mist and Fury (A Court of Thorns and Roses, #2)']
    .sort_values(ascending=True)
    .head(n=6)
)

title
A Court of Mist and Fury (A Court of Thorns and Roses, #2)    0.000000
Maybe Not (Maybe, #1.5)                                       0.817848
Alice's Adventures in Wonderland & Other Stories              0.846415
Origin (Lux, #4)                                              0.892237
Lumberjanes, Vol. 1: Beware the Kitten Holy                   0.902381
Spell Bound (Hex Hall, #3)                                    0.925400
Name: A Court of Mist and Fury (A Court of Thorns and Roses, #2), dtype: float64

In [3]:
# Disabled title search, kept for reference; uncomment to list titles containing 'harry':
# titles[titles.str.lower().str.contains('harry')]

In [17]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"]
    .sort_values(ascending=True)
    .head(n=6)
)

title
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)    0.000000
Zita the Spacegirl (Zita the Spacegirl, #1)                 0.913395
Nigella Express: Good Food, Fast                            0.938139
The Alexandria Link (Cotton Malone, #2)                     0.941637
Child of the Prophecy (Sevenwaters, #3)                     0.946955
The Expats                                                  0.950512
Name: Harry Potter and the Sorcerer's Stone (Harry Potter, #1), dtype: float64

In [18]:
# List every title containing "outlander", ignoring case.
titles[
    titles.str.lower().str.contains(pat='outlander')
]

Index(['A Breath of Snow and Ashes (Outlander, #6)',
       'An Echo in the Bone (Outlander, #7)',
       'Dragonfly in Amber (Outlander, #2)', 'Drums of Autumn (Outlander, #4)',
       'Outlander (Outlander, #1)', 'The Fiery Cross (Outlander, #5)',
       'The Space Between (Outlander, #7.5)', 'Voyager (Outlander, #3)',
       'Written in My Own Heart's Blood (Outlander, #8)'],
      dtype='object', name='title')

In [19]:
# Ten smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df["Outlander (Outlander, #1)"]
    .sort_values(ascending=True)
    .head(n=10)
)

title
Outlander (Outlander, #1)                                                                                    0.000000
The Commitments                                                                                              0.891255
Proper Gauge (Wool, #2)                                                                                      0.903882
Feeling Good: The New Mood Therapy                                                                           0.906752
The Mermaids Singing (Tony Hill & Carol Jordan, #1)                                                          0.915084
InuYasha: Turning Back Time (InuYasha, #1)                                                                   0.924944
The Eight (The Eight #1)                                                                                     0.937216
Morning, Noon & Night                                                                                        0.937812
These Is My Words: The Diary of Sarah Agnes Prine,

Even the closest matches are far away (cosine distances above 0.8), and most look unrelated to the query book. Will the recommendations improve if we use more data?

In [20]:
# Switch to the bigger ratings file. This rebinds `recommend`, so every cell from here on works on the new table.
recommend = pd.read_csv('data/recommend_big.csv')

In [21]:
# Preview the first rows of the reloaded ratings table.
recommend.head()

Unnamed: 0,user_id,rating,title
0,29403,5,Where the Wild Things Are
1,37112,5,Harry Potter and the Deathly Hallows (Harry Po...
2,16114,4,Sphere
3,50684,4,"The Desert Spear (Demon Cycle, #2)"
4,43944,3,"Y: The Last Man, Vol. 1: Unmanned"


I am creating an item-based (the books) collaborative recommender. I will set up my pivot table in the following way:

- The title will be the index
- The user_id will be the column
- The rating will be the value

In [22]:
# Title x user grid of ratings; (title, user) pairs without a rating are filled with 0.
# pandas' default aggregation (mean) applies if a pair occurs more than once.
recommend_piv = recommend.pivot_table(
    index='title',
    columns='user_id',
    values='rating',
    fill_value=0,
)

recommend_piv.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,53415,53416,53417,53418,53419,53420,53421,53422,53423,53424
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#GIRLBOSS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"'Tis (Frank McCourt, #2)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Dimensions of the rebuilt pivot: titles along the rows, user_ids along the columns.
recommend_piv.shape

(8390, 53357)

Create a sparse matrix:

In [24]:
# Convert the pivot's values to a compressed sparse row (CSR) matrix.
pivot_values = recommend_piv.to_numpy()
recommend_sparse = sparse.csr_matrix(pivot_values)

Calculate cosine distance (1 − cosine similarity):

In [25]:
# Pairwise cosine distance between the title rows.
# distance = 1 - similarity, so a distance of 1 is a similarity of 0.
distance = pairwise_distances(
    recommend_sparse,
    Y=None,
    metric='cosine',
)

In [26]:
# Expect a square array with one row and one column per title.
distance.shape

(8390, 8390)

In [27]:
# Peek at the top-left 10x10 corner, rounded to two decimals.
distance[:10, :10].round(2)

array([[0.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  ],
       [1.  , 0.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  ],
       [1.  , 1.  , 0.  , 0.99, 1.  , 1.  , 1.  , 0.99, 0.99, 1.  ],
       [1.  , 1.  , 0.99, 0.  , 0.98, 1.  , 1.  , 1.  , 1.  , 1.  ],
       [1.  , 1.  , 1.  , 0.98, 0.  , 1.  , 1.  , 1.  , 1.  , 1.  ],
       [1.  , 1.  , 1.  , 1.  , 1.  , 0.  , 1.  , 1.  , 1.  , 1.  ],
       [1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 0.  , 0.97, 1.  , 1.  ],
       [1.  , 1.  , 0.99, 1.  , 1.  , 1.  , 0.97, 0.  , 1.  , 1.  ],
       [1.  , 1.  , 0.99, 1.  , 1.  , 1.  , 1.  , 1.  , 0.  , 1.  ],
       [1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 0.  ]])

Create the recommender DataFrame

In [28]:
# Keep the pivot's row labels (book titles) to label both axes of the distance matrix.
titles = recommend_piv.index
titles

Index([' Angels (Walsh Family, #3)', '#GIRLBOSS', ''Salem's Lot',
       ''Tis (Frank McCourt, #2)',
       '10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works',
       '100 Bullets, Vol. 1: First Shot, Last Call', '100 Love Sonnets',
       '100 Selected Poems', '10th Anniversary (Women's Murder Club, #10)',
       '11 Birthdays (Willow Falls, #1)',
       ...
       'Zodiac', 'Zoe's Tale (Old Man's War, #4)', 'Zone One',
       'Zorba the Greek', 'Zorro',
       'for colored girls who have considered suicide/when the rainbow is enuf',
       'god is Not Great: How Religion Poisons Everything', 'sTORI Telling',
       'ttyl (Internet Girls, #1)', 'xxxHolic, Vol. 1 (xxxHOLiC, #1)'],
      dtype='object', name='title', length=8390)

In [29]:
# Label the distance matrix with book titles on both axes so it can be queried by name.
recommend_df = pd.DataFrame(
    data=distance,
    index=titles,
    columns=titles,
)
recommend_df.head()

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,"10th Anniversary (Women's Murder Club, #10)","11 Birthdays (Willow Falls, #1)",...,Zodiac,"Zoe's Tale (Old Man's War, #4)",Zone One,Zorba the Greek,Zorro,for colored girls who have considered suicide/when the rainbow is enuf,god is Not Great: How Religion Poisons Everything,sTORI Telling,"ttyl (Internet Girls, #1)","xxxHolic, Vol. 1 (xxxHOLiC, #1)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
#GIRLBOSS,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Salem's Lot,1.0,1.0,0.0,0.994075,1.0,1.0,1.0,0.989595,0.98636,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"'Tis (Frank McCourt, #2)",1.0,1.0,0.994075,0.0,0.979876,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",1.0,1.0,1.0,0.979876,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Try it out...

In [31]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df['A Court of Thorns and Roses (A Court of Thorns and Roses, #1)']
    .sort_values(ascending=True)
    .head(n=6)
)

title
A Court of Thorns and Roses (A Court of Thorns and Roses, #1)    0.000000
A Court of Mist and Fury (A Court of Thorns and Roses, #2)       0.885106
The Heart of Betrayal (The Remnant Chronicles, #2)               0.886706
Heir of Fire (Throne of Glass, #3)                               0.893655
Red Queen (Red Queen, #1)                                        0.895211
The Wrath and the Dawn (The Wrath and the Dawn, #1)              0.911957
Name: A Court of Thorns and Roses (A Court of Thorns and Roses, #1), dtype: float64

In [32]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df['A Court of Mist and Fury (A Court of Thorns and Roses, #2)']
    .sort_values(ascending=True)
    .head(n=6)
)

title
A Court of Mist and Fury (A Court of Thorns and Roses, #2)       0.000000
A Court of Thorns and Roses (A Court of Thorns and Roses, #1)    0.885106
Red Queen (Red Queen, #1)                                        0.893751
The Assassin's Blade (Throne of Glass, #0.1-0.5)                 0.896288
Heir of Fire (Throne of Glass, #3)                               0.897353
Empire of Storms (Throne of Glass, #5)                           0.907076
Name: A Court of Mist and Fury (A Court of Thorns and Roses, #2), dtype: float64

In [34]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"]
    .sort_values(ascending=True)
    .head(n=6)
)

title
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)        0.000000
Harry Potter and the Chamber of Secrets (Harry Potter, #2)      0.881504
Harry Potter and the Goblet of Fire (Harry Potter, #4)          0.888212
Harry Potter and the Order of the Phoenix (Harry Potter, #5)    0.890310
Harry Potter and the Half-Blood Prince (Harry Potter, #6)       0.891420
Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)     0.891526
Name: Harry Potter and the Sorcerer's Stone (Harry Potter, #1), dtype: float64

In [37]:
# Six smallest cosine distances; the first hit is the query book itself (distance 0).
(
    recommend_df["Outlander (Outlander, #1)"]
    .sort_values(ascending=True)
    .head(n=6)
)

title
Outlander (Outlander, #1)                     0.000000
Dragonfly in Amber (Outlander, #2)            0.892600
The Fiery Cross (Outlander, #5)               0.900507
A Breath of Snow and Ashes (Outlander, #6)    0.905510
Voyager (Outlander, #3)                       0.915457
An Echo in the Bone (Outlander, #7)           0.919807
Name: Outlander (Outlander, #1), dtype: float64

In [37]:
# NOTE(review): same Outlander lookup as the preceding cell (both carry execution count 37); one of the two can be dropped.
recommend_df["Outlander (Outlander, #1)"].sort_values().head(6)

title
Outlander (Outlander, #1)                     0.000000
Dragonfly in Amber (Outlander, #2)            0.892600
The Fiery Cross (Outlander, #5)               0.900507
A Breath of Snow and Ashes (Outlander, #6)    0.905510
Voyager (Outlander, #3)                       0.915457
An Echo in the Bone (Outlander, #7)           0.919807
Name: Outlander (Outlander, #1), dtype: float64