### Popularity-Based Recommender
Generating recommendations using weighted popularity scores
The scores are based on the combined ratings of the item and the number of ratings for that item to get the mean rating for that item across the dataset to figure out what is most popular. 


In [55]:
import pandas as pd

book_test_df = pd.read_csv('data/Books.test.csv.gz', compression='gzip', sep=',', header=0)
book_val_df = pd.read_csv('data/Books.valid.csv.gz', compression='gzip', sep=',', header=0)
book_train_df = pd.read_csv('data/Books.train.csv.gz', compression='gzip', sep=',', header=0)

book_train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1446304000,5.0,1441260345000,
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1564770672,5.0,1441260365000,1446304000
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5.0,1523093714024,1446304000 1564770672
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1.0,1611623223325,1446304000 1564770672 1442450703
4,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1645671127,3.0,1612044209266,1446304000 1564770672 1442450703 1780671067


In [None]:
#summary of data
print("Training set: ", book_train_df.shape)
print("Val Set: ", book_val_df.shape)
print("Test Set: ", book_test_df.shape)
print(book_test_df.head())
print()
print(book_val_df.head())
print("Unique Users: ", book_train_df['user_id'].nunique())
print("size of train set", len(book_train_df))
print("size of val set", len(book_val_df))
print("size of test set", len(book_test_df))


Training set:  (7935557, 5)
Val Set:  (776370, 5)
Test Set:  (776370, 5)
                        user_id parent_asin  rating      timestamp  \
0  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  0593235657     5.0  1640629604904   
1  AGKASBHYZPGTEPO6LWZPVJWB2BVA  0803736800     4.0  1454676557000   
2  AGXFEGMNVCSTSYYA5UWXDV7AFSXA  1542046599     5.0  1605649719611   
3  AFWHJ6O3PV4JC7PVOJH6CPULO2KQ  0679450815     5.0  1638987703546   
4  AHXBL3QDWZGJYH7A5CMPFNUPMF7Q  1250866448     5.0  1669414969335   

                                             history  
0  1446304000 1564770672 1442450703 1780671067 16...  
1  0811849783 0803729952 0735336296 1508558884 08...  
2        1578052009 1477493395 1594747350 1594749310  
3  B00INIQVJA 1496407903 1974633225 B07KD27RHM 16...  
4  0920668372 1589255208 2764322836 2764330898 00...  

                        user_id parent_asin  rating      timestamp  \
0  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  1782490671     5.0  1640383495102   
1  AGKASBHYZPGTEPO6LWZPVJWB2BVA  08

In [None]:
book_train_df.drop(['timestamp', 'history'], axis=1, inplace=True)

In [58]:
print(type(book_val_df.iloc[0]['history']))
x = book_val_df.iloc[0]['history']
x = x.split(" ")
for product in x:
    print(product)

<class 'str'>
1446304000
1564770672
1442450703
1780671067
1645671127
1784881953
1640210148
0823098079


#### Weighted Popularity Score calculation

In [53]:
grouped_books = book_val_df.groupby("parent_asin")
print("Grouped books: ", grouped_books.head())

#compute the mean rating for each book
rating_mean = grouped_books["rating"].mean()
#count the number of ratings for each book
rating_count = grouped_books["rating"].count()

item_stats = pd.DataFrame({
    "parent_asin": rating_mean.index,
    "rating_mean": rating_mean.values,  # average rting per book
    "rating_count": rating_count.values # count of rting per book
})

print("Item Stats: ", item_stats.head())

C = book_train_df["rating"].mean()
print("C: ", C)

# top 20% threshold, for items to be in the running for most popular, they must be in top 20% by count of ratings
m = item_stats["rating_count"].quantile(0.90)  
print("M: ", m)

item_stats["weighted_rating"] = (
    (item_stats["rating_count"] / (item_stats["rating_count"] + m)) * item_stats["rating_mean"]
    + (m / (item_stats["rating_count"] + m)) * C
)

popular_items = item_stats.sort_values("weighted_rating", ascending=False)
print("Popular Items: ", popular_items)
print("length pop items: ", len(popular_items))
print("popularitems.shape: ", popular_items.shape)
print(popular_items["parent_asin"].nunique())  
print(type(popular_items))
#popularity list of top 5000 most popular items
pop_list = popular_items["parent_asin"].head(5000).tolist()

print(len(pop_list))
print(pop_list)

Grouped books:                               user_id parent_asin  rating      timestamp  \
0       AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  1782490671     5.0  1640383495102   
1       AGKASBHYZPGTEPO6LWZPVJWB2BVA  0802737803     5.0  1454676232000   
2       AGXFEGMNVCSTSYYA5UWXDV7AFSXA  1594749310     5.0  1541884305941   
3       AFWHJ6O3PV4JC7PVOJH6CPULO2KQ  1633573001     5.0  1612225279592   
4       AHXBL3QDWZGJYH7A5CMPFNUPMF7Q  0451450523     2.0  1635710722120   
...                              ...         ...     ...            ...   
776365  AGNKVZGDVXCB2VUXSEZELY22WICA  1449007945     5.0  1257527532000   
776366  AFQ5IENJ2URIIK4A6HW7GDUIBGZQ  0399162097     5.0  1383957911000   
776367  AEWDTEKLGUAZYBTDQDED4WZ5PECQ  0471190454     1.0  1190542653000   
776368  AGVUBY43MX4PETNFTXL2CBGLJJSQ  1849701903     5.0  1369038019000   
776369  AGZ44L7OCCLE76RJOZ3VGKOEKLFQ  0500016909     2.0  1103970985000   

                                                  history  
0       1446304000 1564

#### Naive Popularity Score
Uses a regular count to find the most popular items. Does not take into account the ratings of each item or the number of ratings

In [59]:
# Count purchases per item
purchase_counts = (
    book_train_df.groupby("parent_asin")
              .size()
              .reset_index(name="purchase_count")
)

print(purchase_counts)

# Sort by most purchased
popular_items = purchase_counts.sort_values(
    "purchase_count",
    ascending=False
)
print(popular_items)

# Convert to list (most purchased first)
pop_list = popular_items["parent_asin"].head(5000).tolist()
print(pop_list)

       parent_asin  purchase_count
0       0000013714               5
1       000100039X              11
2       0001061240               5
3       0001361155               5
4       0001473727              17
...            ...             ...
494138  B0C8GGPD1H              56
494139  B0C8GHMWG7             369
494140  B0C8GJYMNH             602
494141  B0CCK4H78Y              12
494142  B0CFWT48FK               2

[494143 rows x 2 columns]
       parent_asin  purchase_count
445331  B00L9B7IKE           10683
430470  B006LSZECO            7430
444391  B00JO8PEN2            6871
440008  B00DPM7TIG            6132
439148  B00CNQ7HAU            4914
...            ...             ...
494086  B0C2NPXMMQ               1
494082  B0C1SYJBLG               1
494081  B0C1PBVHKJ               1
494067  B0C1JFQXKB               1
494061  B0C1JB5GBP               1

[494143 rows x 2 columns]
['B00L9B7IKE', 'B006LSZECO', 'B00JO8PEN2', 'B00DPM7TIG', 'B00CNQ7HAU', 'B016ZNRC0Q', 'B00C2WDD5I', 'B00YTX

In [None]:
#For each user in val and test sets, get their product recommendations while also filtering out their previously purchased books in the 'history' field
def get_popularity_recs(df, k):
    user_recs = {}
    for _, row in df.iterrows():
        user_id = row["user_id"]
        history = set(row['history'].split(" "))
        filtered = [item for item in pop_list if item not in history]
        user_recs[user_id] = filtered[:k]
    return user_recs

val_user_recs = get_popularity_recs(book_val_df, k=10)
test_user_recs = get_popularity_recs(book_test_df, k=10)

In [None]:
# checks if the actual book purchased next in val and test sets is included in recs for each user 
# compares number of hits to the total number of books purchased to get the hit rate with k items being recommended 
def hit_rate_at_k(df,recs,k):
    hits = 0
    for _, row in df.iterrows():
        user_id = row["user_id"]
        item = row["parent_asin"]

        if user_id in recs and item in recs[user_id]:
            #print("got a hit")
            hits += 1

    return hits / len(df)

val_hr10 = hit_rate_at_k(book_val_df, val_user_recs, k=10)
test_hr10 = hit_rate_at_k(book_test_df, test_user_recs, k=10)
print("Validation Hit Rate@10:", val_hr10)
print("Test Hit Rate@10:", test_hr10)

Validation Hit Rate@10: 0.006455684789468939
Test Hit Rate@10: 0.004769633035794789
