### Run once, to check environment

In [1]:
#!pip3 install -r requirements.txt

### Download Data, put under root directory DM-project/

https://msnews.github.io/behaviors

### Scraper part, no need to run, just in case, can be adapted to other datasource
   * Scraping publish time of news
   * To run it, put 'MINDlarge_train' in the top level directory of the project, which is 'DM-project'
   * don't submit MINDlarge_train, add it to gitignore
   * Output filepath is 'DM-project/generate/newstimes.csv'
   * Don't forget to go back to the top level directory
   

In [1]:
#%cd scraper

In [2]:
#!scrapy crawl news

In [3]:
#%cd ..

In [4]:
#newsTimes = 'generate/newstimes.csv'

### Load Libraries

In [1]:
import sys
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

if not sys.warnoptions:
    warnings.simplefilter("ignore")

### Preprocessing, run once

In [2]:
#from preprocessing.preprocessing import createNews,createUsers,splitFiles,createCategoriyEmbeddingNLP

In [29]:
# Paths of the raw MIND files and the one-off preprocessing calls.
# Only `news` is live: it is read again further down in this notebook to attach
# category/title/abstract metadata to the recommendations and the history.
# Everything else is left commented out; to re-run the preprocessing, also
# enable the commented-out `preprocessing.preprocessing` import and the
# `newsTimes` path defined in the scraper section.
#behaviors = 'MINDlarge_train/behaviors.tsv'
news = 'MINDlarge_train/news.tsv'
#entityEmbedding = 'MINDlarge_train/entity_embedding.vec'
##generate news_embedding.csv, news_cleaned.csv
#createNews(news,newsTimes,entityEmbedding)
##generate user_history.csv, user_impression.csv
#createUsers(behaviors)
##split big files
#splitFiles('generate/user_history.csv')
#splitFiles('generate/user_impressions.csv')

### No need to run, already generated, but in case you want to play around.
  * generate/news_subcat_embedding_nlp.csv

In [4]:
#!pip3 install spacy

In [5]:
#!python3 -m spacy download en_core_web_lg

In [6]:
#subcategories = "generate/news_subcategories.csv"
#createCategoriyEmbeddingNLP(subcategories)

In [7]:
from analysis.measurement import measurement,baselineTest,baselineTestAvg,tuningParameters
from analysis.clustering import clusteringBatch
from analysis.recommendation import searchKNearestNeighbors

In [19]:
# Input files: the "mini" user history / impression samples under generate/.
history = 'generate/user_history_mini.csv'
impression = 'generate/user_impressions_mini.csv'

# Parameters handed to clusteringBatch.
# NOTE(review): t0 looks like a Unix timestamp (epoch seconds) shifted by
# 1000 s -- confirm against clusteringBatch.
t0 = 1575586800 + 1000
lam = 0.1        # lambda used for the importance score
threshold = 3    # Dendrogram cut level

### User representation
* clusteringBatch 
     -  group users by UID
     -  perform Ward clustering on each user's reading history
     -  sample 3 clusters according to their importance scores
     -  calculate the medoid/centroid as one representation of the user's interest
     -  calculate the accepted boundary (radius) for each cluster

In [20]:
# Cluster every user's reading history and build the user representations.
# The call returns two tables: the medoid-based one first, the centroid-based
# one second.
(
    df_user_representation_medoid,
    df_user_representation_centroid,
) = clusteringBatch(
    t0,
    history=history,
    threshold=threshold,
    lam=lam,
    with_centroid=True,
)

### Evaluation Method in the paper

In [21]:
# Relevance evaluation as done in the paper, using the centroid representation.
# The paper draws 400 nearest samples in total among billions of pins; here
# k=50 neighbours are drawn, 3*50 samples in total.
df_recommendation = searchKNearestNeighbors(
    df_user_representation_centroid,
    k=50,
)

# Attach the news embeddings to the recommended NIDs.
df_news_embeddings = pd.read_csv('generate/news_embedding.csv')
df_recommendation = df_recommendation.merge(df_news_embeddings, on='NID')

# The paper uses 0.2 as similarity distance threshold; 0.3 is used here.
df_measure_recommendation = measurement(
    df_recommendation,
    impression=impression,
    similarity_threshold=0.3,
)

In [22]:
# Average the recall column into a single relevance figure and report it.
mean_recall = df_measure_recommendation["recall"].mean()
print("If use centroids as user representation, get relevance %.2f" % mean_recall)

If use centroids as user representation, get relevance 0.46


In [23]:
# Same relevance evaluation as for the centroids, now with the medoid
# representation: k=50 neighbours, embeddings joined on NID, threshold 0.3.
df_recommendation = searchKNearestNeighbors(
    df_user_representation_medoid,
    k=50,
)
df_news_embeddings = pd.read_csv('generate/news_embedding.csv')
df_recommendation = df_recommendation.merge(df_news_embeddings, on='NID')
df_measure_recommendation = measurement(
    df_recommendation,
    impression=impression,
    similarity_threshold=0.3,
)

In [24]:
# Average the recall column into a single relevance figure and report it.
mean_recall = df_measure_recommendation["recall"].mean()
print("If use medoids as user representation, get relevance %.2f" % mean_recall)

If use medoids as user representation, get relevance 0.40


In [25]:
# Baseline representations: baselineTest returns the randomly drawn one first
# and the latest-articles one second. Only the random one is evaluated here,
# with the same settings as above (k=50, threshold 0.3).
(
    df_user_representation_random,
    df_user_representation_latest,
) = baselineTest(history)

df_recommendation = searchKNearestNeighbors(
    df_user_representation_random,
    k=50,
)
df_news_embeddings = pd.read_csv('generate/news_embedding.csv')
df_recommendation = df_recommendation.merge(df_news_embeddings, on='NID')
df_measure_recommendation = measurement(
    df_recommendation,
    impression=impression,
    similarity_threshold=0.3,
)

In [26]:
# Average the recall column into a single relevance figure and report it.
mean_recall = df_measure_recommendation["recall"].mean()
print("If use randomly chosen articles from history list as user representation, get relevance %.2f" % mean_recall)

If use randomly chosen articles from history list as user representation, get relevance 0.39


* measurement
  - compare user representation(n by 100) with user impression(m by 100) 
  - get n by m distance matrix
  - mask distance matrix with accepted boundary
  - count recall for each positive impression, divide by length of impressions
  - count hits for each representation, consider empty hits percentage.

In [9]:
# Score the centroid representation directly against the user impressions.
df_measure_centroid = measurement(
    df_user_representation_centroid,
    impression=impression,
    similarity_threshold=0.3,
)

In [10]:
# Summarise the centroid run: mean recall and mean share of empty recommendations.
mean_recall = df_measure_centroid["recall"].mean()
empty_percent = df_measure_centroid["percent_empty"].mean()
print(
    "If use centroids as user representation, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If use centroids as user representation, get recall 0.24, the percentage of empty recommendation is 0.64


In [11]:
# Score the medoid representation directly against the user impressions.
df_measure_medoid = measurement(
    df_user_representation_medoid,
    impression=impression,
    similarity_threshold=0.3,
)

In [12]:
# Summarise the medoid run: mean recall and mean share of empty recommendations.
mean_recall = df_measure_medoid["recall"].mean()
empty_percent = df_measure_medoid["percent_empty"].mean()
print(
    "If use medoids as user representation, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If use medoids as user representation, get recall 0.20, the percentage of empty recommendation is 0.69


### Baseline Test
   * randomly draw 3 news from history
   * pick the latest 3 news from history

In [29]:
# Build the two baseline representations from the history file:
# randomly drawn articles first, the latest articles second.
(
    df_user_representation_random,
    df_user_representation_latest,
) = baselineTest(history)

In [30]:
# Score the random-draw baseline against the user impressions.
df_measure_random = measurement(
    df_user_representation_random,
    impression=impression,
    similarity_threshold=0.3,
)

In [31]:
# Summarise the random baseline: mean recall and mean share of empty recommendations.
mean_recall = df_measure_random["recall"].mean()
empty_percent = df_measure_random["percent_empty"].mean()
print(
    "If randomly draw 3 samples, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If randomly draw 3 samples, get recall 0.19, the percentage of empty recommendation is 0.68


In [33]:
# Score the latest-articles baseline against the user impressions.
# The variable name `df_measure_lastest` (sic) is kept as-is because the
# following cell reads it under that spelling.
df_measure_lastest = measurement(
    df_user_representation_latest,
    impression=impression,
    similarity_threshold=0.3,
)

In [34]:
# Summarise the latest-articles baseline: mean recall and mean share of empty recommendations.
mean_recall = df_measure_lastest["recall"].mean()
empty_percent = df_measure_lastest["percent_empty"].mean()
print(
    "If draw 3 latests samples, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If draw 3 latests samples, get recall 0.10, the percentage of empty recommendation is 0.87


In [34]:
# Baseline that represents each user by the mean of their history
# (as reported by the summary cell that follows).
df_user_represent_avg = baselineTestAvg(
    history,
)

In [35]:
# Score the history-mean baseline against the user impressions.
df_measure_avg = measurement(
    df_user_represent_avg,
    impression=impression,
    similarity_threshold=0.3,
)

In [36]:
# Summarise the history-mean baseline: mean recall and mean share of empty recommendations.
mean_recall = df_measure_avg["recall"].mean()
empty_percent = df_measure_avg["percent_empty"].mean()
print(
    "If use mean of user history as representation, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If use mean of user history as representation, get recall 0.29, the percentage of empty recommendation is 0.04


### Parameter tuning
* $\lambda$ for importance score
* threshold for ward clustering
* measured by recall and hits

In [None]:
# Grid for the tuning run: every lambda below is tried, while the clustering
# threshold and the similarity threshold are each pinned to one value.
# The wider candidate lists are kept in the trailing comments.
lams = [
    1e-5,
    1e-4,
    1e-3,
    1e-2,
    1e-1,
    1,
]
thresholds = [0.75]  # [0.5, 0.75, 0.85, 1.0, 1.25, 1.50, 2.0, 3.0, 4.0, 5.0]
similarity_thresholds = [0.3]  # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# NOTE(review): the meaning of the first argument "1" is defined inside
# tuningParameters and is not visible here -- confirm.
df_tuning_result = tuningParameters(
    "1",
    lams,
    thresholds,
    similarity_thresholds,
)

In [54]:
# Display the tuning results table returned by tuningParameters.
df_tuning_result

Unnamed: 0,Threshold,Lambda,Similarity,Medoid Recall,Empty medoids,Medoids per user,Centroid Recall,Empty centroids,Centroids per user
0,0.75,1e-05,0.3,0.212942,0.64794,3.0,0.230337,0.629213,3.0
1,0.75,0.0001,0.3,0.206902,0.696629,3.0,0.21349,0.685393,3.0
2,0.75,0.001,0.3,0.189407,0.674157,3.0,0.202158,0.666667,3.0
3,0.75,0.01,0.3,0.176378,0.700375,3.0,0.186998,0.692884,3.0
4,0.75,0.1,0.3,0.188257,0.722846,3.0,0.20378,0.700375,3.0
5,0.75,1.0,0.3,0.209362,0.670412,3.0,0.232327,0.640449,3.0


In [56]:
# Persist the tuning table under generate/ without writing the row index.
df_tuning_result.to_csv(
    "generate/lambda_tuning_1_th0.75_sim0.3.csv",
    index=False,
)

### Recommendation System--Hierarchical Navigable Small World
* 89222 elements in space, with 100 dimensions.
* Search the k nearest neighbors (k=50 in the cell below) for each item of user representation

In [27]:
# Time one nearest-neighbour search (k=50) over the centroid representations.
# `time` is imported with the other libraries in the notebook's import cell,
# so this cell no longer carries its own import.
t_before = time.perf_counter()
df_recommendation = searchKNearestNeighbors(
    df_user_representation_centroid,
    k=50,
)
print(f'Time cost {time.perf_counter()-t_before} s')

Time cost 0.40071307099992737 s


In [30]:
# Load the news metadata from the header-less, tab-separated news file.
# `news_columns` maps the positional column index to a readable name; any
# position that is not listed (here: 5) is dropped from the result.
# The table is built in one non-mutating chain instead of an in-place rename
# followed by a separate reassignment, so the column names live in one place.
news_columns = {
    0: 'NID',
    1: 'category',
    2: 'subcategory',
    3: 'title',
    4: 'abstract',
    6: 'title_entities',
    7: 'abstract_entities',
}
df_news_meta = (
    pd.read_csv(news, sep='\t', header=None)
    .rename(columns=news_columns)
    .loc[:, list(news_columns.values())]
)
  

In [32]:
# Pick one user at random from the recommendation table, then show all of
# that user's recommended articles joined with the news metadata.
UID = df_recommendation.sample(n=1)["UID"].values[0]
print('Recommendation!')
df_recommendation.loc[df_recommendation["UID"] == UID].merge(df_news_meta, on='NID')

Recommendation!


Unnamed: 0,UID,NID,category,subcategory,title,abstract,title_entities,abstract_entities
0,U672445,N58030,news,newspolitics,Bay Briefing: Why do presidential candidates s...,"Good morning, Bay Area. It's Thursday, Nov. 14...","[{""Label"": ""California"", ""Type"": ""G"", ""Wikidat...","[{""Label"": ""1978 California Proposition 13"", ""..."
1,U672445,N7898,news,newscrime,FBI hopes sketch will help ID New Orleans vict...,New details have been released about a woman w...,"[{""Label"": ""Federal Bureau of Investigation"", ...","[{""Label"": ""Federal Bureau of Investigation"", ..."
2,U672445,N118762,finance,finance-real-estate,America's cheapest cities where everyone wants...,"Cities like New York, San Francisco, and Washi...","[{""Label"": ""United States"", ""Type"": ""G"", ""Wiki...","[{""Label"": ""Washington, D.C."", ""Type"": ""G"", ""W..."
3,U672445,N95981,weather,weathertopstories,Snow powders U.S. as temperatures plunge and w...,Winter returned with a vengeance across the U....,"[{""Label"": ""United States"", ""Type"": ""G"", ""Wiki...","[{""Label"": ""Chicago"", ""Type"": ""G"", ""WikidataId..."
4,U672445,N117353,news,newsworld,Family thinks relative may be victim of serial...,The most prolific serial killer in U.S. histor...,[],"[{""Label"": ""New Orleans"", ""Type"": ""G"", ""Wikida..."
...,...,...,...,...,...,...,...,...
95,U672445,N29741,sports,baseball_mlb,"MLB rumors: Who's competing with Yankees, Phil...",The MLB GM Meetings begin Monday in Scottsdale...,"[{""Label"": ""Gerrit Cole"", ""Type"": ""P"", ""Wikida...","[{""Label"": ""Gerrit Cole"", ""Type"": ""P"", ""Wikida..."
96,U672445,N64749,sports,baseball_mlb,Tigers' local TV ratings decline again in 2019,"The Detroit Tigers' local television ratings, ...","[{""Label"": ""Detroit Tigers"", ""Type"": ""O"", ""Wik...","[{""Label"": ""Detroit Tigers"", ""Type"": ""O"", ""Wik..."
97,U672445,N17907,sports,baseball_mlb,"Odorizzi, Abreu accept $17.8M offers to stay w...","SCOTTSDALE, Ariz. (AP) Pitcher Jake Odorizzi...","[{""Label"": ""Jake Odorizzi"", ""Type"": ""P"", ""Wiki...","[{""Label"": ""Jake Odorizzi"", ""Type"": ""P"", ""Wiki..."
98,U672445,N83488,news,newspolitics,President Trump to speak at international poli...,"President Donald Trump, who will be in Chicago...","[{""Label"": ""Chicago"", ""Type"": ""G"", ""WikidataId...","[{""Label"": ""International Association of Chief..."


In [33]:
# Reading history of the same user (UID chosen in the previous cell),
# joined with the news metadata so it can be compared with the recommendations.
df_history = (
    pd.read_csv(history)
    .loc[lambda frame: frame["UID"] == UID]
    .merge(df_news_meta, on='NID')
)
print('History!')
df_history

History!


Unnamed: 0,UID,NID,category,subcategory,title,abstract,title_entities,abstract_entities
0,U672445,N61252,news,newspolitics,"Klobuchar, Booker rule out recusing themselves...",Senate Dems who are running for the party's 20...,"[{""Label"": ""Amy Klobuchar"", ""Type"": ""P"", ""Wiki...","[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid..."
1,U672445,N107254,sports,baseball_mlb,ALCS preview: Astros vs. Yankees is the marque...,This is the series America has been waiting fo...,"[{""Label"": ""Houston Astros"", ""Type"": ""O"", ""Wik...","[{""Label"": ""Houston Astros"", ""Type"": ""O"", ""Wik..."
2,U672445,N7769,news,newsworld,"ISIS Rears Its Head, Adding to Chaos as Turkey...",The Turkish invasion of Kurdish-held territory...,[],"[{""Label"": ""Rojava"", ""Type"": ""N"", ""WikidataId""..."
3,U672445,N84419,news,newspolitics,Trump says he's replacing McAleenan as acting ...,McAleenan had been frustrated with a cadre of ...,"[{""Label"": ""Kevin McAleenan"", ""Type"": ""P"", ""Wi...","[{""Label"": ""Kevin McAleenan"", ""Type"": ""P"", ""Wi..."
4,U672445,N27258,news,newspolitics,Kamala Harris to Trump Jr: 'You wouldn't know ...,The 2020 democratic hopeful was responding to ...,"[{""Label"": ""Kamala Harris"", ""Type"": ""P"", ""Wiki...","[{""Label"": ""Hillary Clinton"", ""Type"": ""P"", ""Wi..."
...,...,...,...,...,...,...,...,...
220,U672445,N20241,sports,baseball_mlb,Why the Phillies are in for another busy hot s...,"The Phillies have a new manager, now they need...","[{""Label"": ""Philadelphia Phillies"", ""Type"": ""O...","[{""Label"": ""Philadelphia Phillies"", ""Type"": ""O..."
221,U672445,N78696,news,newsus,Judge strikes down new Trump rule on religious...,NEW YORK (AP) A federal judge on Wednesday s...,"[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""United States Department of Health..."
222,U672445,N16492,foodanddrink,newstrends,New York City Has One of America's Best All-Yo...,If you feel the indulgence of an all-you-can-e...,"[{""Label"": ""New York City"", ""Type"": ""G"", ""Wiki...","[{""Label"": ""Churrascaria"", ""Type"": ""C"", ""Wikid..."
223,U672445,N73329,news,newsus,Mayor De Blasio Defends Giving Freed Prison In...,The incentive program would give accused crimi...,"[{""Label"": ""Bill de Blasio"", ""Type"": ""P"", ""Wik...","[{""Label"": ""New York Mets"", ""Type"": ""O"", ""Wiki..."
