In [3]:
import pandas as pd
import pickle

from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_colwidth', 60)

In [4]:
def read_json(path):
    return pd.read_json(path, orient="records", compression="gzip")

df_products = read_json('../data/interim/final/products.json.gz')
df_reviews = read_json('../data/interim/final/reviews.json.gz')
df_processed_reviews = read_json('../data/processed/reviews.json.gz')
df_reviews['processed_review_text'] = df_processed_reviews['cleaned_review']

Common Function

In [5]:
# Get unrated items for a user
def get_user_unrated_items(user_id, df_products, df_reviews):
    df_user_reviews = df_reviews[df_reviews['user_id'] == user_id]
    return df_products[~df_products['product_id'].isin(df_user_reviews['product_id'])]

Content-based Filtering (CBF)

In [6]:
CBF_item_similarity_matrix = pickle.load(open(f'../models/content_based_filtering/item_similarity_matrix.pkl', 'rb'))

In [7]:
def CBF_find_similar_items(product_id, df_products):
    idx = df_products[df_products['product_id'] == product_id].index[0]
    sorted_scores = pd.Series(CBF_item_similarity_matrix[idx]).sort_values(ascending = False)
    return df_products.iloc[list(sorted_scores.iloc[1:].index)]

def content_based_filtering(user_id, product_id, n = 10):
    df_similar_items = CBF_find_similar_items(product_id, df_products)
    df_user_unrated_items = get_user_unrated_items(user_id, df_similar_items, df_reviews)
    
    if n < len(df_user_unrated_items): 
        df_user_unrated_items = df_user_unrated_items[:n]
    
    return df_user_unrated_items

Item-based collaborative Filtering (CF)

In [8]:
item_based_CF_knn_model = pickle.load(open('../models/item_based_collaborative_filtering/knn.pkl', 'rb'))

In [9]:
# Get all items that have at least 1 rating and sorted by similarity
def item_based_CF_find_similar_items(product_id, df_products):
    neighbors = item_based_CF_knn_model.get_neighbors(
        item_based_CF_knn_model.trainset.to_inner_iid(product_id), 
        k=item_based_CF_knn_model.trainset.n_items
    )
    df_1 = pd.DataFrame({'product_id' : [item_based_CF_knn_model.trainset.to_raw_iid(inner_id) for inner_id in neighbors]})
    return df_1.merge(df_products, how='inner', on='product_id')

def predict_rating(user_id, product_id):
    return item_based_CF_knn_model.predict(user_id, product_id)

def item_based_collaborative_filtering(user_id, product_id, n = 10):
    df_similar_items = item_based_CF_find_similar_items(product_id, df_products)
    df_user_unrated_items = get_user_unrated_items(user_id, df_similar_items, df_reviews)
    
    if n < len(df_user_unrated_items):
        df_user_unrated_items = df_user_unrated_items[:n]
    
    df_user_unrated_items['pre'] = df_user_unrated_items['product_id'].apply(lambda id: predict_rating(user_id, id).est)
    df_user_unrated_items = df_user_unrated_items.sort_values(by=['pre'], ascending=False)
    return df_user_unrated_items

Sentiment Analysis

In [10]:
sentiment_analysis_model = pickle.load(open('../models/sentiment_analysis/hyperparameter_tuning/logistic_regression_with_tfidf_vectorizer.pkl', 'rb'))

In [11]:
def predict_mean_sentiment(product_id:str):
    review_list = df_reviews[df_reviews['product_id'] == product_id]['processed_review_text'].tolist()
    if (len(review_list) == 0): return 0
    sentiment = predict_sentiment(review_list)
    if(sentiment is None):return 0
    
    return sentiment.mean()

def predict_sentiment(features:list):
    if (len(features) == 0): return
    return sentiment_analysis_model.predict(features)

Proposed Recommendation Algorithm

In [12]:
def recommend(user_id, product_id, n = 10):
    # Get top n items using item_based_collaborative_filtering
    df_item_based_CF_items = item_based_collaborative_filtering(user_id, product_id, n)
    
    # Calculate the mean sentiment for each product
    df_item_based_CF_items['sen'] = df_item_based_CF_items['product_id'].apply(predict_mean_sentiment)
    
    # Scale the sentiment score to fit the rating [formula: (x - xmin) / (xmax - xmin)] for each product
    scaler = MinMaxScaler(feature_range=(1, 5))
    scaler.fit(df_item_based_CF_items[['sen']])
    df_item_based_CF_items['sen'] = scaler.transform(df_item_based_CF_items[['sen']])
    
    # Calculate the ranking score for each product
    w1 = 1
    w2 = 2
    df_item_based_CF_items['ranking_score'] = w1 * df_item_based_CF_items['pre'] + w2 * df_item_based_CF_items['sen']
    
    # filter out low ranking score products
    threshold = (w1 * 5 + w2 * 5) / 2.0
    df_item_based_CF_items = df_item_based_CF_items[df_item_based_CF_items['ranking_score'] >= threshold]
    
    # Sort by ranking score
    df_item_based_CF_items = df_item_based_CF_items.sort_values(by=['ranking_score'], ascending = False)
    
    df_final_items = df_item_based_CF_items
    
    # Handle the cold-start problem
    # Also, complement the filtered products by content_based_filtering
    com = n - len(df_item_based_CF_items)
    if(com > 0):
        df_CBF_items = content_based_filtering(user_id, product_id, len(df_products))
        df_CBF_items = df_CBF_items[~df_CBF_items['product_id'].isin(df_item_based_CF_items['product_id'])]
        df_final_items = pd.concat([df_final_items, df_CBF_items[:com]])
    
    return df_final_items

Starting Point of the System

In [13]:
user_id = 'A0203183BAH3TR08FZGB'
product_id = 'B0043T7FHK'

In [14]:
item_list = recommend(user_id, product_id, 10)

In [15]:
item_list[['product_id', 'name', 'pre', 'sen', 'ranking_score']].head(10)

Unnamed: 0,product_id,name,pre,sen,ranking_score
0,B015WCV70W,HP Pavilion 21.5-Inch IPS LED HDMI VGA Monitor (22cwa),5.0,5.0,15.0
9,B00EZSUWFG,ViewSonic VX2252MH 22 Inch 2ms 75Hz 1080p Gaming Monitor...,5.0,4.967285,14.934569
4,B00OL0L1VM,Sceptre E Series E275W-1920 V1 27&quot; Screen LED-Lit M...,5.0,4.441974,13.883949
7,B0058UUR6E,ASUS VS248H-P 24&quot; Full HD 1920x1080 2ms HDMI DVI VG...,5.0,4.437914,13.875827
8,B00B5Q6Y8U,ASUS VS207D-P 19.5&quot; HD+ 1600x900 VGA Back-lit LED M...,5.0,4.204315,13.408631
5,B003LPTAYI,Sennheiser HD 202 II Professional Headphones (Black),5.0,3.701101,12.402202
6,B0149QBOF0,Dell Gaming S2716DGR 27.0&quot; Screen LED-Lit Monitor w...,5.0,3.24699,11.49398
1,B0043T7FKC,ASUS VK278Q 27&quot; Full HD 1920x1080 2ms HDMI 2.0M web...,5.0,2.720259,10.440518
229,B0045JFTXU,ASUS VE208T 20&quot; HD+ 1600x900 DVI VGA Back-lit LED M...,,,
393,B00A6AMYG2,"Asus VK228H-CSM 21.5&quot; Widescreen LED Monitor, 16:9,...",,,


In [16]:
item_based_list = item_based_collaborative_filtering(user_id, product_id)

In [17]:
item_based_list[['product_id', 'name', 'pre']].head(10)

Unnamed: 0,product_id,name,pre
0,B015WCV70W,HP Pavilion 21.5-Inch IPS LED HDMI VGA Monitor (22cwa),5.0
2,B009A5204K,LG Tone HBS-730 Wireless Stereo Headset - Black,5.0
3,B00906GBBC,ASUS PB238Q 23&quot; Full HD 1920x1080 IPS DisplayPort H...,5.0
4,B00OL0L1VM,Sceptre E Series E275W-1920 V1 27&quot; Screen LED-Lit M...,5.0
5,B003LPTAYI,Sennheiser HD 202 II Professional Headphones (Black),5.0
6,B0149QBOF0,Dell Gaming S2716DGR 27.0&quot; Screen LED-Lit Monitor w...,5.0
7,B0058UUR6E,ASUS VS248H-P 24&quot; Full HD 1920x1080 2ms HDMI DVI VG...,5.0
9,B00EZSUWFG,ViewSonic VX2252MH 22 Inch 2ms 75Hz 1080p Gaming Monitor...,5.0
1,B0043T7FKC,ASUS VK278Q 27&quot; Full HD 1920x1080 2ms HDMI 2.0M web...,5.0
8,B00B5Q6Y8U,ASUS VS207D-P 19.5&quot; HD+ 1600x900 VGA Back-lit LED M...,5.0


In [18]:
content_based_list = content_based_filtering(user_id, product_id)

In [19]:
content_based_list[['product_id', 'name', 'description']]

Unnamed: 0,product_id,name,description
229,B0045JFTXU,ASUS VE208T 20&quot; HD+ 1600x900 DVI VGA Back-lit LED M...,Dynamically enhances the displays contrast by adjusting ...
393,B00A6AMYG2,"Asus VK228H-CSM 21.5&quot; Widescreen LED Monitor, 16:9,...","<b>True-to-life Pictures Powered by LED<br> 80,000,000:1..."
408,B00ANKMNXO,ASUS VX248H 24&quot; Full HD 1920x1080 1ms HDMI DVI VGA ...,Superior Image Quality Meets Ultra-Slim Elegant DesignDe...
240,B004G7U5LC,ASUS VW199T-P 19&quot; WXGA+ 1440x900 DVI VGA Back-lit L...,Shipping Depth: 19.5 Shipping Height: 5.6 Shipping Width...
95,B000VZG0QM,Asus VB171T - 17&quot; (4:3) LCD Monitors - Black,ASUS VB171T - with SPLENDID Video Intelligence Technolog...
267,B0058UUR6E,ASUS VS248H-P 24&quot; Full HD 1920x1080 2ms HDMI DVI VG...,Superior Image Quality Meets Classic Elegant Design Deli...
227,B0043T7FKC,ASUS VK278Q 27&quot; Full HD 1920x1080 2ms HDMI 2.0M web...,Full HD with HDMI:VK278Q leads you to enter a big and sp...
1451,B019HN6P36,ASUS VP247H-P 23.6 in LED Monitor 1920x1080 1ms VGA DVI ...,"With a 1 ms response time, the VP247H-P 23.6"" Widescreen..."
277,B005G2P16A,ASUS VS208N-P 20&quot; HD+ 1600x900 DVI VGA Back-lit LED...,20IN LCD 1600X900 16:9 LED built in power adapter vesa m...
1341,B0163JMDLA,ASUS PRO C424AQ 23.8&rdquo; Full HD 1920x1080 IPS Displa...,"Designed for business versatility, the all-new ASUSPRO C..."
