In [1]:
import json

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [2]:
# Location and column layout of the raw ratings export.
RATINGS_CSV = "ratings_Grocery_and_Gourmet_Food.csv"
RATING_COLUMNS = ['User', 'Item', 'Rating', 'Timestamps']

# header=None: the first line of the file is read as data, so the column
# names are supplied explicitly.
data = pd.read_csv(RATINGS_CSV, sep=",", header=None, names=RATING_COLUMNS)

print("There are %d rows in this dataframe" % len(data))

data.head()

There are 1297156 rows in this dataframe


Unnamed: 0,User,Item,Rating,Timestamps
0,A1ZQZ8RJS1XVTX,0657745316,5.0,1381449600
1,A31W38VGZAUUM4,0700026444,5.0,1354752000
2,A3I0AV0UJX5OH0,1403796890,1.0,1385942400
3,A3QAAOLIXKV383,1403796890,3.0,1307836800
4,AB1A5EGHHVA9M,141278509X,5.0,1332547200


In [3]:
# Number of ratings per item, most-rated first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top_items = data.Item.value_counts()
top_items.head(20)

B0051SU0OW    6340
B00DS842HS    3755
B000E1D7RS    3102
B0029XDZIK    3011
B005K4Q1YA    3000
B0015KAYN2    2740
B008I1XLDQ    2731
B007TGDXMU    2452
B007Y59HVM    2394
B0010YA02S    2371
B0031JK95S    2220
B001D0GV4K    2096
B001EO5Q64    1899
B005K4Q1T0    1838
B000GFYRHG    1765
B00BI8OG2Q    1688
B001D05RWC    1658
B00032G1S0    1628
B007TGDXNO    1585
B003D4F18G    1583
Name: Item, dtype: int64

In [4]:
# Number of ratings per user, most active first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top_reviewers = data.User.value_counts()
top_reviewers.head(20)

AY12DBB0U420B     518
A1LACH6MLQWZ      490
A3D6OI36USYOU1    440
A2NYK9KWFMJV4Y    432
A3OXHLG6DIBRW8    378
A281NPSIMI1C2R    335
A1GQAKL9CGQLP1    332
A2XKJ1KX6XUHYP    315
A1Z54EM24Y40LL    285
A2MUGFV2TDQ47K    265
A2M9D9BDHONV3Y    233
A1UQBFCERIP7VJ    231
A2MNB77YGJ3CN0    229
A14BTJRH9VNLJJ    225
A1X1CEGHTHMBL1    219
A36MP37DITBU6F    215
ALSAOZ1V546VT     212
A2C9XE9I8RSKNX    208
AQQLWCMRNDFGI     208
A1WX42M589VAMQ    206
Name: User, dtype: int64

In [5]:
# Helper that collects the reviews our common reviewers left for one item
def get_book_reviews(title, common_reviewers):
    """Return one review row per common reviewer for the item ``title``.

    Reads the module-level ``data`` frame.

    Parameters
    ----------
    title : item identifier compared against ``data.Item``.
    common_reviewers : collection of user identifiers to keep
        (matched against ``data.User``).

    Returns
    -------
    DataFrame
        Matching rows of ``data`` sorted by ``User``; when a user has several
        rows for the item, only the first one after sorting is kept.
    """
    written_by_common_reviewer = data.User.isin(common_reviewers)
    about_this_item = data.Item == title
    selected = data[written_by_common_reviewer & about_this_item].sort_values(by='User')
    # Keep a single row per user so results for two items line up one-to-one.
    return selected[~selected.User.duplicated()]

In [6]:
item_1, item_2 = "B001SB2SGO", "B001TJPM0O"

# Users who rated each of the two items
item_1_reviewers = data.loc[data.Item == item_1, "User"]
item_2_reviewers = data.loc[data.Item == item_2, "User"]

# Users who rated both
common_reviewers = set(item_1_reviewers).intersection(item_2_reviewers)

print("%d people have reviewed these items" % len(common_reviewers))

# Every rating (for any item) written by the common reviewers
list_common_reviewers = list(common_reviewers)

common_reviewers_only = data[data.User.isin(list_common_reviewers)]

common_reviewers_only.head(10)

1 people have reviewed these items


Unnamed: 0,User,Item,Rating,Timestamps
1961,A1LACH6MLQWZ,B0000CCZYY,5.0,1399075200
8036,A1LACH6MLQWZ,B0000DIYKH,5.0,1385683200
8066,A1LACH6MLQWZ,B0000DIYVC,5.0,1398124800
8126,A1LACH6MLQWZ,B0000DIYWP,5.0,1398470400
11926,A1LACH6MLQWZ,B0000TI4L8,5.0,1403654400
20402,A1LACH6MLQWZ,B00016LA7U,5.0,1402358400
21429,A1LACH6MLQWZ,B00016V25A,5.0,1399334400
21434,A1LACH6MLQWZ,B00016V26O,5.0,1399420800
21456,A1LACH6MLQWZ,B00016V282,5.0,1399420800
25410,A1LACH6MLQWZ,B0001DMSW4,5.0,1404432000


In [7]:
# Pearson correlation between two items, measured on the users who rated both.
# pearsonr is imported from scipy.stats in the notebook's import cell
# (the old scipy.stats.stats path is a deprecated private module).
def calculate_correlation(book1, book2):
    """Return the Pearson correlation between the ratings of two items.

    Only users who rated both items contribute. Reads the module-level
    ``data`` frame and uses ``get_book_reviews`` to obtain one rating per
    common user for each item, ordered by user.

    Parameters
    ----------
    book1, book2 : item identifiers compared against ``data.Item``.

    Returns
    -------
    float
        The correlation coefficient, or NaN when fewer than two users rated
        both items (the coefficient is undefined for 0 or 1 samples).
        NOTE(review): pearsonr is also expected to yield NaN, with a warning,
        when one of the rating vectors is constant -- confirm against the
        installed scipy version.
    """
    # We start by finding the common reviewers
    book_1_reviewers = data[data.Item == book1].User
    book_2_reviewers = data[data.Item == book2].User
    common_reviewers = set(book_1_reviewers).intersection(book_2_reviewers)

    # Current scipy raises ValueError for inputs shorter than 2, so return the
    # NaN the rest of the notebook expects instead of aborting the pair loop.
    if len(common_reviewers) < 2:
        return float("nan")

    # Then we look for the reviews given by common reviewers
    book_1_reviews = get_book_reviews(book1, common_reviewers)
    book_2_reviews = get_book_reviews(book2, common_reviewers)

    # Calculate the Pearson Correlation Score (first element of the result)
    return pearsonr(book_1_reviews.Rating, book_2_reviews.Rating)[0]

# Display the correlation score for the example pair (item_1, item_2)
calculate_correlation(item_1,item_2)

nan

In [8]:
# One row per item with its number of ratings, most-rated items first.
ratings_per_item = data.groupby("Item").size()
most_reviewed_books = (
    ratings_per_item
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

most_reviewed_books.head(40)

Unnamed: 0,Item,count
96463,B0051SU0OW,6340
151070,B00DS842HS,3755
10790,B000E1D7RS,3102
57073,B0029XDZIK,3011
103055,B005K4Q1YA,3000
35030,B0015KAYN2,2740
124594,B008I1XLDQ,2731
118613,B007TGDXMU,2452
119549,B007Y59HVM,2394
32570,B0010YA02S,2371


In [14]:
# Getting the list of the most reviewed items
N_TOP_ITEMS = 100  # how many of the most-rated items to compare pairwise

top_books = list(most_reviewed_books.Item.head(N_TOP_ITEMS))


# Calculate the correlation for every ordered pair of top items.
# The score is symmetric, and each calculate_correlation call filters the
# whole ratings table, so every unordered pair is computed once and reused
# for the mirrored row. Row order is the same as a plain double loop.
pair_scores = {}
correlation_coefficient = []

for book1 in top_books:
    print("Calculating the correlations for:", book1)
    for book2 in top_books:
        if book1 == book2:
            continue
        pair = frozenset((book1, book2))
        if pair not in pair_scores:
            pair_scores[pair] = calculate_correlation(book1, book2)
        correlation_coefficient.append([book1, book2, pair_scores[pair]])

Calculating the correlations for: B0051SU0OW




Calculating the correlations for: B00DS842HS
Calculating the correlations for: B000E1D7RS
Calculating the correlations for: B0029XDZIK
Calculating the correlations for: B005K4Q1YA
Calculating the correlations for: B0015KAYN2
Calculating the correlations for: B008I1XLDQ
Calculating the correlations for: B007TGDXMU
Calculating the correlations for: B007Y59HVM
Calculating the correlations for: B0010YA02S
Calculating the correlations for: B0031JK95S
Calculating the correlations for: B001D0GV4K
Calculating the correlations for: B001EO5Q64
Calculating the correlations for: B005K4Q1T0
Calculating the correlations for: B000GFYRHG
Calculating the correlations for: B00BI8OG2Q
Calculating the correlations for: B001D05RWC
Calculating the correlations for: B00032G1S0
Calculating the correlations for: B007TGDXNO
Calculating the correlations for: B003D4F18G
Calculating the correlations for: B0001EQN88
Calculating the correlations for: B000LKXRNQ
Calculating the correlations for: B005K0L2U2
Calculatin

In [15]:
# Let's look at what the table of correlations looks like.
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# replacement (ascending by default, NaN scores placed last).
cols = ["Item_1", "Item_2", "Correlation"]
correlation_coefficient = pd.DataFrame(correlation_coefficient, columns=cols).sort_values('Correlation')
correlation_coefficient.head(10)

  app.launch_new_instance()


Unnamed: 0,Item_1,Item_2,Correlation
6737,B00GW3KSPQ,B0015KAYN2,-1.0
5553,B000EVQWKC,B0010YA02S,-1.0
6717,B009AH7OU8,B00ABKWSTA,-1.0
3750,B000NMJWZO,B000UUWECC,-1.0
8544,B0045Z6K50,B003Z6W32E,-1.0
369,B0029XDZIK,B000UP8QQ0,-1.0
3871,B0034EDLS2,B0031JK95S,-1.0
8482,B00ABKWSTA,B009AH7OU8,-1.0
8456,B00ABKWSTA,B000VK4DXO,-1.0
3914,B0034EDLS2,B000EVOSE4,-1.0


In [16]:
def calc_correlation(corr, book1, book2):
    """Look up the stored correlation for the ordered pair (book1, book2).

    Parameters
    ----------
    corr : DataFrame with columns ``Item_1``, ``Item_2`` and ``Correlation``.
    book1 : value matched against ``Item_1``.
    book2 : value matched against ``Item_2``.

    Returns
    -------
    float
        The ``Correlation`` value of the first matching row (may be NaN).

    Raises
    ------
    IndexError
        If no row matches the pair.
    """
    mask = (corr.Item_1 == book1) & (corr.Item_2 == book2)
    # Read the score column directly. Summing across the row would also try
    # to add the string ID columns (an error in current pandas) and, with
    # skipna, can turn a NaN score into 0.0.
    matches = corr.loc[mask, "Correlation"]
    if matches.empty:
        raise IndexError("no correlation stored for (%r, %r)" % (book1, book2))
    return matches.tolist()[0]

In [17]:
# Look up the stored correlation for one ordered (Item_1, Item_2) pair
calc_correlation(correlation_coefficient,"B0010YA02S", "B008I1XLDQ")

-0.57735026918962584

In [18]:
# Rank every other top item by its correlation with my_item
my_item = "B0051SU0OW"
results = []
for b in top_books:
    if my_item != b:
        results.append((my_item, b, calc_correlation(correlation_coefficient, my_item, b)))
# NaN compares False against everything, which leaves sorted() with an
# arbitrary order; map NaN scores to -inf so they end up last.
sorted(results, key=lambda r: -np.inf if np.isnan(r[2]) else r[2], reverse=True)

[('B0051SU0OW', 'B007N04AF6', 1.0),
 ('B0051SU0OW', 'B005KK56JU', 1.0),
 ('B0051SU0OW', 'B000F4DKAI', 1.0),
 ('B0051SU0OW', 'B000VK08OC', 1.0),
 ('B0051SU0OW', 'B003OZV4Y4', 0.87831006565367997),
 ('B0051SU0OW', 'B001E5E2RC', 0.875),
 ('B0051SU0OW', 'B000LKXRNQ', 0.87208159927238094),
 ('B0051SU0OW', 'B0015KAYN2', 0.53300179088902611),
 ('B0051SU0OW', 'B008XFE2YO', 0.46911839730040156),
 ('B0051SU0OW', 'B007TGDXMK', 0.44268015435926766),
 ('B0051SU0OW', 'B0010UOGWM', 0.41221726907430378),
 ('B0051SU0OW', 'B004TJF39Q', 0.4036839258133284),
 ('B0051SU0OW', 'B004538TME', 0.38760126648920734),
 ('B0051SU0OW', 'B005HGAVD8', 0.37386963155028874),
 ('B0051SU0OW', 'B005K0L2U2', 0.34519948546390994),
 ('B0051SU0OW', 'B003Z6ZGZU', 0.33816225564196212),
 ('B0051SU0OW', 'B0014WYXQK', 0.2711630722733202),
 ('B0051SU0OW', 'B005K4Q1T0', 0.26649502619718712),
 ('B0051SU0OW', 'B002HQCWYM', 0.25267634667287536),
 ('B0051SU0OW', 'B005K4Q1YA', 0.25116811447584242),
 ('B0051SU0OW', 'B000FEH8ME', 0.24999999

In [19]:
def predict(self, my_book):
    """Rank the other top items by their correlation with ``my_book``.

    Written as a method: it reads ``self.top_books`` (iterable of item ids)
    and ``self.correlation_coefficient`` (passed to ``calc_correlation``).
    NOTE(review): it is defined at top level in this notebook and no owning
    class is visible here -- confirm where it is meant to live.

    Parameters
    ----------
    my_book : item id to compare against every other entry of
        ``self.top_books``.

    Returns
    -------
    list of (my_book, other_book, correlation) tuples, highest correlation
    first; pairs whose correlation is NaN are placed last.
    """
    results = [
        (my_book, other_book,
         calc_correlation(self.correlation_coefficient, my_book, other_book))
        for other_book in self.top_books
        if my_book != other_book
    ]
    # NaN compares False against everything, which leaves sorted() with an
    # arbitrary order; map NaN scores to -inf so they end up last.
    return sorted(results, key=lambda r: -np.inf if np.isnan(r[2]) else r[2], reverse=True)

In [20]:
def prediction(my_item):
    """Rank the other top items by their correlation with ``my_item``.

    Reads the module-level ``top_books`` and ``correlation_coefficient`` and
    looks each pair up with ``calc_correlation``.

    Parameters
    ----------
    my_item : item id to compare against every other entry of ``top_books``.

    Returns
    -------
    list of (my_item, other_item, correlation) tuples, highest correlation
    first; pairs whose correlation is NaN are placed last.
    """
    results = []
    for b in top_books:
        if my_item != b:
            results.append((my_item, b, calc_correlation(correlation_coefficient, my_item, b)))
    # NaN compares False against everything, which leaves sorted() with an
    # arbitrary order; map NaN scores to -inf so they end up last.
    return sorted(results, key=lambda r: -np.inf if np.isnan(r[2]) else r[2], reverse=True)

In [27]:
# Rank the other top items by their correlation with this item
prediction("B008I1XLDQ")

[('B008I1XLDQ', 'B000F4DKAI', 1.0),
 ('B008I1XLDQ', 'B0007NOWMM', 1.0),
 ('B008I1XLDQ', 'B000R71WMQ', 1.0),
 ('B008I1XLDQ', 'B0027Z5J6G', 1.0),
 ('B008I1XLDQ', 'B003YBH398', 0.99999999999999989),
 ('B008I1XLDQ', 'B005KK56JU', 0.96558102873057594),
 ('B008I1XLDQ', 'B003OZV4Y4', 0.77491694904398156),
 ('B008I1XLDQ', 'B004538TME', 0.77139253277983122),
 ('B008I1XLDQ', 'B003D4F18G', 0.61237243569579458),
 ('B008I1XLDQ', 'B0029XDZKI', 0.59233942641578774),
 ('B008I1XLDQ', 'B000CN7BMA', 0.5222329678670935),
 ('B008I1XLDQ', 'B000FEH8ME', 0.49414359250244588),
 ('B008I1XLDQ', 'B007TGDXMK', 0.43441636322431459),
 ('B008I1XLDQ', 'B0045Z6K50', 0.42260569821229232),
 ('B008I1XLDQ', 'B003TBRF1O', 0.35404225536669398),
 ('B008I1XLDQ', 'B004TJF39Q', 0.30769983137545265),
 ('B008I1XLDQ', 'B004M62D5S', 0.289308582340706),
 ('B008I1XLDQ', 'B001CHFUDC', 0.28473029832044183),
 ('B008I1XLDQ', 'B005HG9ERW', 0.27463449890578462),
 ('B008I1XLDQ', 'B001D0GV4K', 0.25085490599350074),
 ('B008I1XLDQ', 'B002IEVJRY