# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# change directory if data is in a different folder
import os
os.chdir("C:/Users/tanmb/Downloads/SJSU/CMPE_256/project/project_data")

In [3]:
val_pred = pd.read_csv('val_pred.csv')
news_embeddings = pd.read_csv('news_embeddings.csv')
news_metadata = pd.read_csv('small_full_news.csv')
train_clicked = pd.read_csv('train_clicked.csv')

In [4]:
news_embeddings = news_embeddings.merge(news_metadata[['news_id', 'category']], on = 'news_id', how = 'left')
train_clicked = train_clicked.merge(news_metadata[['news_id', 'category']], on = 'news_id', how = 'left')
val_pred = val_pred.merge(news_metadata[['news_id', 'category']], on = 'news_id', how = 'left')

In [5]:
news_embeddings.head()

Unnamed: 0,news_id,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,title_emb_7,title_emb_8,...,title_emb_191,title_emb_192,title_emb_193,title_emb_194,title_emb_195,title_emb_196,title_emb_197,title_emb_198,title_emb_199,category
0,55528,-0.186653,0.236455,-0.210393,-0.290194,-0.079926,-0.084619,0.008138,-0.093584,-0.036084,...,-0.060976,-0.017067,0.014921,0.001137,0.041048,-0.017283,-0.053397,0.044348,0.010367,lifestyle
1,19639,-0.089158,0.161629,-0.058451,-0.064467,-0.171379,-0.10699,0.03077,-0.214236,0.046488,...,-0.030919,-0.000426,0.107756,0.053488,-0.021131,-0.007939,-0.01153,0.046534,-0.088146,health
2,61837,-0.11246,-0.160912,-0.179721,0.194282,-0.131232,-0.134176,0.149837,-0.054741,-0.020564,...,-0.001209,-0.046804,0.021327,-0.00653,-0.014037,0.032238,-0.024075,0.047432,0.002414,news
3,53526,0.048788,-0.057316,-0.076083,-0.227301,-0.052115,0.118391,0.050027,-0.169648,0.111652,...,-0.010022,-0.011369,-0.065288,0.030645,0.037783,-0.061337,-0.002957,0.004196,0.038869,health
4,38324,-0.159312,0.067011,-0.027634,0.003047,-0.065959,-0.023923,-0.046621,-0.105488,-0.039912,...,0.068282,-0.041803,-0.015534,0.091072,0.014027,-0.008631,-0.018318,0.030822,-0.004235,health


In [6]:
train_clicked.head()

Unnamed: 0,user_id,timestamp,news_id,impression,category
0,13740,205558,55689,1,sports
1,91836,324690,17059,1,finance
2,73700,457308,23814,1,lifestyle
3,34670,192485,49685,1,music
4,8125,317481,8400,1,autos


## Article Similarities

In [7]:
categories = news_embeddings['category'].unique()
art_sim = {}
for cat in categories :
    articles = news_embeddings[news_embeddings['category'] == cat]
    labels = articles['news_id']
    articles = articles.drop(columns = ['news_id', 'category'])
    
    sim = cosine_similarity(articles)
    sim = pd.DataFrame(sim, index = labels, columns = labels)
    
    art_sim[cat] = sim    

In [65]:
art_sim['lifestyle'].loc[55528, 7517]

np.float64(0.1057327793768996)

## Top Categories by User
If a user does not have enough categories we know they like, we will just recommend through the top categories. 

In [61]:
# get all categories
categories = news_metadata['category'].unique()

# get category popularity from clicks
cat_pop = list(train_clicked.groupby('category')['impression'].count().sort_values(ascending = False).index)

# add any categories with no clicks
for c in categories :
    if c not in cat_pop :
        cat_pop.append(c)

In [63]:
# get user clicks per category
user_cat_counts = train_clicked.groupby(['user_id', 'category'])['impression'].count().reset_index()

# rank category popularity for each user
user_cat_counts['rank'] = user_cat_counts.groupby('user_id')['impression'].rank(method='first', ascending=False)
top_user_cats = user_cat_counts.sort_values(['user_id', 'rank'])

# convert categories into list representing preference then store as series
user_top_cat = pd.Series(top_user_cats.groupby('user_id')['category'].apply(list).to_dict())

## Five Article Recommendatons per User 

In [112]:
def rank_articles(poss_arts, clicked, cat) :
    sim_scores = []
    for a in poss_arts :
        similarity = np.mean([art_sim[cat].loc[a, c] for c in clicked])
        sim_scores.append([a, similarity])

    rankings = pd.DataFrame(sim_scores, columns = ['article', 'similarity']).sort_values(by = 'similarity') 
    return list(rankings['article'])

def search_categories(ranked_categories, articles, clicked, articles_needed = 5) :
    thresholds = {'one': [], 'two': [], 'three': []}

    for cat in ranked_categories :
        cat_click = None if cat not in clicked.index else clicked[cat]
        
        # threshold 1, 0.6 <= x < 0.7, 
        arts_1 = articles[(articles['category'] == cat) & (articles['pred_impression'].between(0.6, 0.7, inclusive = 'left'))].sort_values(by = 'pred_impression')
        arts_1 = arts_1['news_id']
        rank_arts_1 = arts_1 if cat_click is None else rank_articles(arts_1, cat_click, cat)
        thresholds['one'].extend(rank_arts_1)
        
        # threshold 2, 0.5 <= x < 0.6
        arts_2 = articles[(articles['category'] == cat) & (articles['pred_impression'].between(0.5, 0.6, inclusive = 'left'))].sort_values(by = 'pred_impression')
        arts_2 = arts_2['news_id']
        rank_arts_2 = arts_2 if cat_click is None else rank_articles(arts_2, cat_click, cat)
        thresholds['two'].extend(rank_arts_2)

        # threshold 3, 0.7 <= x
        arts_3 = articles[(articles['category'] == cat) & (articles['pred_impression'].between(0.7, 1, inclusive = 'left'))].sort_values(by = 'pred_impression')
        arts_3 = arts_3['news_id']
        rank_arts_3 = arts_3 if cat_click is None else rank_articles(arts_3, cat_click, cat)
        thresholds['three'].extend(rank_arts_3)

        if len(thresholds['one']) >= articles_needed :
            break

    return thresholds

def get_recommendations (user, articles) :
    cat_rank = cat_pop if user not in user_top_cat.keys() else user_top_cat[user]
    clicked = train_clicked[train_clicked['user_id'] == user].groupby('category')['news_id'].apply(list)
    recs = []
    
    # get articles in the users top categories and rank them
    user_thresholds = search_categories(cat_rank, articles, clicked)
    # go through the different thresholds and add until we get 5 recommendations
    for k,v in user_thresholds.items() :
        needed = 5 - len(recs)
        recs.extend(v[:needed])

    if len(recs) == 5 :
        return recs

    # if not enough articles in user categories, go through popular categories
    pop_rank = [x for x in cat_pop if x not in cat_rank]
    pop_thresholds = search_categories(pop_rank, articles, clicked, 5 - len(recs))
    
    for k,v in pop_thresholds.items() :
        needed = 5 - len(recs)
        recs.extend(v[:needed])
    return recs

In [87]:
val_pred.groupby('user_id')['impression'].count().describe()

count    50000.000000
mean        54.819960
std         61.763309
min          2.000000
25%         13.000000
50%         33.000000
75%         76.000000
max       1187.000000
Name: impression, dtype: float64

In [88]:
# group by user
user_poss_arts = val_pred.groupby('user_id') # possible articles to recommend

In [113]:
user_recs = {}
act_clicked = 0
total = 0
for user, art in user_poss_arts :
    recs = get_recommendations(user, art)
    user_recs[user] = recs
    
    rec_art = art[art['news_id'].isin(recs)]
    act_clicked += rec_art['impression'].sum()
    total += len(recs)

print(act_clicked / total)

0.10496280918645677


In [111]:
act_clicked / total

np.float64(0.10227240990886156)

start at categories, look at top category, take articles that are least similar but they still are predicted to click 

In [13]:
val_pred['pred_impression'].describe()

count    2.740998e+06
mean     3.624721e-01
std      1.728962e-01
min      7.767049e-03
25%      2.272878e-01
50%      3.404727e-01
75%      4.859801e-01
max      9.577069e-01
Name: pred_impression, dtype: float64

In [None]:
.03