### Run once, to check environment

In [1]:
#!pip3 install -r requirements.txt

### Download Data, put under root directory DM-project/

https://msnews.github.io/behaviors

### Scraper part (no need to run; kept just in case, and it can be adapted to other data sources)
   * Scrapes the publish time of the news
   * To run it, put 'MINDlarge_train' in the top-level directory of the project, which is 'DM-project'
   * Don't commit MINDlarge_train; add it to .gitignore
   * The output file path is 'DM-project/generate/newstimes.csv'
   * Don't forget to go back to the top-level directory afterwards
   

In [1]:
#%cd scraper

In [2]:
#!scrapy crawl news

In [3]:
#%cd ..

In [4]:
#newsTimes = 'generate/newstimes.csv'

### Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import warnings

# Silence every warning for the rest of the session, but only when no warning
# options are already registered in sys.warnoptions.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

### Preprocessing, run once

In [2]:
#from preprocessing.preprocessing import createNews,createUsers,splitFiles,createCategoriyEmbeddingNLP

In [3]:
# Paths into the MIND dataset. Only `news` is active, because it is read again
# further down when the news metadata is loaded; the other paths and calls
# belong to the one-off preprocessing and stay commented out.
#behaviors = 'MINDlarge_train/behaviors.tsv'
news = 'MINDlarge_train/news.tsv'
#entityEmbedding = 'MINDlarge_train/entity_embedding.vec'
## Generate news_embedding.csv and news_cleaned.csv.
#createNews(news,newsTimes,entityEmbedding)
## Generate user_history.csv and user_impressions.csv.
#createUsers(behaviors)
## Split the big files (presumably into numbered parts such as
## user_history_1.csv, which is used below -- TODO confirm).
#splitFiles('generate/user_history.csv')
#splitFiles('generate/user_impressions.csv')

### No need to run, already generated, but in case you want to play around.
  * generate/news_subcat_embedding_nlp.csv

In [4]:
#!pip3 install spacy

In [5]:
#!python3 -m spacy download en_core_web_lg

In [6]:
#subcategories = "generate/news_subcategories.csv"
#createCategoriyEmbeddingNLP(subcategories)

In [7]:
from analysis.measurement import measurement,baselineTest,baselineTestAvg,tuningParameters
from analysis.clustering import clusteringBatch
from analysis.recommendation import searchKNearestNeighbors

In [8]:
# Reference time handed to clusteringBatch as t0. The literal looks like a Unix
# timestamp (1575586800 = 2019-12-05 23:00:00 UTC) plus 1000 seconds -- TODO confirm.
t0 = 1575586800+1000
# Lambda of the importance score (see "Parameter tuning" below).
lam = 0.01
# Threshold of the ward clustering (see "Parameter tuning" below).
threshold = 1

# Reading history and impressions; presumably part 1 of the split files -- TODO confirm.
history = 'generate/user_history_1.csv'
impression = 'generate/user_impressions_1.csv'

### User representation
* clusteringBatch 
     -  group users by UID
     -  perform ward clustering on each user's reading history
     -  sample 3 clusters according to the importance scores
     -  calculate the medoid/centroid as one representation of the user's interest
     -  calculate the accepted boundary (radius) for each cluster

In [9]:
# One clusteringBatch call yields both representations: medoids first,
# centroids second (the latter requested through with_centroid=True).
df_user_representation_medoid, df_user_representation_centroid = clusteringBatch(
    t0,
    history=history,
    threshold=threshold,
    lam=lam,
    with_centroid=True,
)

### Evaluation Method in the paper

In [24]:
# Relevance evaluation as in the paper, using the centroid representation.
# The paper draws 400 nearest samples in total among billions of pins; here
# k=50 is drawn, 3*50 samples in total.
df_news_embeddings = pd.read_csv('generate/news_embedding.csv')
df_recommendation = pd.merge(
    searchKNearestNeighbors(df_user_representation_centroid, k=50),
    df_news_embeddings,
    on='NID',
)

# The paper uses 0.2 as the similarity distance threshold; 0.3 is used here.
df_measure_recommendation = measurement(
    df_recommendation,
    impression=impression,
    similarity_threshold=0.3,
)

In [25]:
# Mean of the recall column for the centroid-based recommendations.
mean_recall = df_measure_recommendation["recall"].mean()
print("If use centroids as user representation, get recall %.2f" % mean_recall)

If use centroids as user representation, get recall 0.49


In [21]:
# Same relevance evaluation as for the centroids, but starting from the
# medoid representation (k=50, similarity threshold 0.3).
df_news_embeddings = pd.read_csv('generate/news_embedding.csv')
df_recommendation = pd.merge(
    searchKNearestNeighbors(df_user_representation_medoid, k=50),
    df_news_embeddings,
    on='NID',
)
df_measure_recommendation = measurement(
    df_recommendation,
    impression=impression,
    similarity_threshold=0.3,
)

In [23]:
# Mean recall of the medoid-based recommendations: df_measure_recommendation
# was recomputed from df_user_representation_medoid in the preceding cell, so
# the message names medoids, not centroids.
mean_recall = df_measure_recommendation.recall.mean()
print("If use medoids as user representation, get recall %.2f"%mean_recall)

If use centroids as user representation, get recall 0.44


* measurement
  - compare user representation(n by 100) with user impression(m by 100) 
  - get n by m distance matrix
  - mask distance matrix with accepted boundary
  - count recall for each positive impression, divide by length of impressions
  - count hits for each representation, consider the percentage of empty hits.

In [None]:
# Measure the centroid representation directly against the impressions.
df_measure_centroid = measurement(
    df_user_representation_centroid,
    impression=impression,
    similarity_threshold=0.3,
)

In [12]:
# Column means of recall and percent_empty for the centroid representation.
mean_recall = df_measure_centroid["recall"].mean()
empty_percent = df_measure_centroid["percent_empty"].mean()
print(
    "If use centroids as user representation, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If use centroids as user representation, get recall 0.37, the percentage of empty recommendation is 0.29


In [26]:
# Measure the medoid representation directly against the impressions.
df_measure_medoid = measurement(
    df_user_representation_medoid,
    impression=impression,
    similarity_threshold=0.3,
)

In [27]:
# Column means of recall and percent_empty for the medoid representation.
mean_recall = df_measure_medoid["recall"].mean()
empty_percent = df_measure_medoid["percent_empty"].mean()
print(
    "If use medoids as user representation, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If use medoids as user representation, get recall 0.28, the percentage of empty recommendation is 0.42


### Baseline Test
   * randomly draw 3 news from history
   * pick the latest 3 news from history
   * use the mean of the user history as representation

In [28]:
# Baseline representations built from the same history file: the first result
# is the random draw, the second the latest items.
(
    df_user_representation_random,
    df_user_representation_latest,
) = baselineTest(history)

In [30]:
# Measure the random-draw baseline against the impressions.
df_measure_random = measurement(
    df_user_representation_random,
    impression=impression,
    similarity_threshold=0.3,
)

In [31]:
# Column means of recall and percent_empty for the random-draw baseline.
mean_recall = df_measure_random["recall"].mean()
empty_percent = df_measure_random["percent_empty"].mean()
print(
    "If randomly draw 3 samples, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If randomly draw 3 samples, get recall 0.19, the percentage of empty recommendation is 0.53


In [32]:
# Measure the latest-items baseline against the impressions.
# NOTE: "lastest" is a typo for "latest"; the name is kept because the next
# cell reads it.
df_measure_lastest = measurement(
    df_user_representation_latest,
    impression=impression,
    similarity_threshold=0.3,
)

In [33]:
# Column means of recall and percent_empty for the latest-items baseline.
mean_recall = df_measure_lastest["recall"].mean()
empty_percent = df_measure_lastest["percent_empty"].mean()
print(
    "If draw 3 latests samples, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If draw 3 latests samples, get recall 0.22, the percentage of empty recommendation is 0.51


In [34]:
# Baseline that represents each user by the mean of the reading history.
df_user_represent_avg = baselineTestAvg(
    history,
)

In [35]:
# Measure the mean-of-history baseline against the impressions.
df_measure_avg = measurement(
    df_user_represent_avg,
    impression=impression,
    similarity_threshold=0.3,
)

In [36]:
# Column means of recall and percent_empty for the mean-of-history baseline.
mean_recall = df_measure_avg["recall"].mean()
empty_percent = df_measure_avg["percent_empty"].mean()
print(
    "If use mean of user history as representation, get recall %.2f, the percentage of empty recommendation is %.2f"
    % (mean_recall, empty_percent)
)

If use mean of user history as representation, get recall 0.29, the percentage of empty recommendation is 0.04


### Parameter tuning
* $\lambda$ for importance score
* threshold for ward clustering
* measured by recall and hits

In [None]:
# Sweep the ward-clustering threshold while lambda stays fixed at 0.01.
# NOTE(review): the meaning of the positional "12" argument is not visible
# here -- confirm against tuningParameters.
lams = [0.01]
thresholds = [
    0.5, 0.75, 1.0,
    1.25, 1.50, 2.0,
    3.0, 4.0, 5.0,
]
df_tuning_result = tuningParameters(
    "12",
    lam=lams,
    threshold=thresholds,
)

In [None]:
# Preview the first five rows of the tuning result.
df_tuning_result.head(5)

In [None]:
# Persist the tuning result next to the other generated files.
df_tuning_result.to_csv(path_or_buf="generate/threshold_tuning.csv")

### Recommendation System--Hierarchical Navigable Small World
* 89222 elements in space, with 100 dimensions.
* Search the k nearest neighbors for each item of the user representation (the cell below uses k=50)

In [21]:
import time

# Time the nearest-neighbour search over the centroid representation.
t_before = time.perf_counter()
# Rebinds df_recommendation to the plain search result (no embedding merge);
# the cells below read it.
df_recommendation = searchKNearestNeighbors(df_user_representation_centroid,k=50)
# Difference between the two perf_counter() readings, in seconds.
print(f'Time cost {time.perf_counter()-t_before} s')

Time cost 23.508871365000005 s


In [22]:
# Load the raw news table (tab-separated, no header row), name the columns of
# interest and keep only those; the remaining columns (e.g. column 5) are dropped.
df_news_meta = (
    pd.read_csv(news, sep='\t', header=None)
    .rename(columns={0:'NID',1:'category',2:'subcategory',3:'title',4:'abstract',6:'title_entities',7:'abstract_entities'})
    .loc[:, ['NID','category','subcategory','title','abstract','title_entities','abstract_entities']]
)
  

In [23]:
# Pick one user at random (unseeded, so the user changes between runs) and
# show that user's recommendations joined with the news metadata.
UID = df_recommendation.sample()["UID"].iloc[0]

df_recommendation.loc[df_recommendation["UID"] == UID].merge(df_news_meta, on='NID')

Unnamed: 0,UID,NID,category,subcategory,title,abstract,title_entities,abstract_entities
0,U100873,N6982,news,newscrime,California cops respond to 2 homeless men shot...,Two homeless men living at an encampment near ...,"[{""Label"": ""California"", ""Type"": ""G"", ""Wikidat...","[{""Label"": ""San Francisco"", ""Type"": ""G"", ""Wiki..."
1,U100873,N44069,finance,finance-career-education,Will Increasing Budget Fix CA Schools? (Opinion),COMMENTARY: Report by former San Francisco leg...,"[{""Label"": ""California"", ""Type"": ""G"", ""Wikidat...","[{""Label"": ""California"", ""Type"": ""G"", ""Wikidat..."
2,U100873,N108645,news,newsus,No more fire in the kitchen: Cities are bannin...,Thirteen cities and one county in California h...,[],"[{""Label"": ""California"", ""Type"": ""G"", ""Wikidat..."
3,U100873,N4730,weather,weathertopstories,"The Latest: Evacuations, historic winds in Cal...",SAN FRANCISCO (AP) Evacuation orders have ex...,"[{""Label"": ""California"", ""Type"": ""G"", ""Wikidat...","[{""Label"": ""Santa Rosa, California"", ""Type"": ""..."
4,U100873,N102039,tv,tv-gallery,17 Hallmark Movies That Are Totally Worth Watc...,From festive flicks to cheesy romantic comedie...,[],"[{""Label"": ""Hallmark Channel"", ""Type"": ""M"", ""W..."
5,U100873,N128327,entertainment,entertainment-books,12 Books to Read If You Love Romantic Comedies,Because reading about romance is just as fun a...,"[{""Label"": ""Romantic comedy"", ""Type"": ""C"", ""Wi...",[]
6,U100873,N9526,tv,tv-gallery,20 Teen Movies on Netflix Your Kids Will Love ...,How many of these movies have you watched?,"[{""Label"": ""Netflix"", ""Type"": ""O"", ""WikidataId...",[]
7,U100873,N61510,health,wellness,7 Things That Happen to Your Body When You Wat...,"Discover what the ""boo!"" factor can do to your...","[{""Label"": ""Horror film"", ""Type"": ""C"", ""Wikida...","[{""Label"": ""Discover (magazine)"", ""Type"": ""M"",..."
8,U100873,N5138,news,newspolitics,"After impeachment vote, a defiant Trump",A day after the House voted for the impeachmen...,[],"[{""Label"": ""Mississippi"", ""Type"": ""G"", ""Wikida..."
9,U100873,N23264,news,newsus,Mississippi woman found after being missing fo...,S.O.S spelled out with rocks saved a woman mis...,"[{""Label"": ""Mississippi"", ""Type"": ""G"", ""Wikida...",[]


In [24]:
# Reading history of the same user, joined with the news metadata, to compare
# against the recommendations shown above.
df_history = (
    pd.read_csv(history)
    .loc[lambda frame: frame["UID"] == UID]
    .merge(df_news_meta, on='NID')
)
df_history

Unnamed: 0,UID,NID,category,subcategory,title,abstract,title_entities,abstract_entities
0,U100873,N59893,news,newsus,Two Columbus statues vandalized on Columbus Day,The statue in San Francisco was doused in red ...,[],"[{""Label"": ""San Francisco"", ""Type"": ""G"", ""Wiki..."
1,U100873,N65823,news,newscrime,'Time was of the essence': Politicians add to ...,"Kamille ""Cupcake"" McKinney was playing at a bi...",[],[]
2,U100873,N128503,health,health-news,"He dropped an amazing 475 pounds, then ran Det...","Once one of Michigan's most obese people, this...","[{""Label"": ""Detroit"", ""Type"": ""G"", ""WikidataId...","[{""Label"": ""Detroit Free Press Marathon"", ""Typ..."
3,U100873,N27352,movies,movies-celebrity,"Emily Ratajkowski Is Being Sued for $150,000 O...",What's more? The photographer is also asking f...,"[{""Label"": ""Emily Ratajkowski"", ""Type"": ""P"", ""...",[]
4,U100873,N44524,news,newscrime,Dad Lied About 4-Year-Old's Role In Double Sho...,Authorities say a Milwaukee man blamed a doubl...,[],"[{""Label"": ""Milwaukee"", ""Type"": ""G"", ""Wikidata..."
5,U100873,N71665,video,viral,"Cummings' widow responds to Trump's attacks, g...","Dr. Maya Rockeymoore Cummings, the widow of th...","[{""Label"": ""Maya Rockeymoore Cummings"", ""Type""...","[{""Label"": ""Maya Rockeymoore Cummings"", ""Type""..."
6,U100873,N23264,news,newsus,Mississippi woman found after being missing fo...,S.O.S spelled out with rocks saved a woman mis...,"[{""Label"": ""Mississippi"", ""Type"": ""G"", ""Wikida...",[]
7,U100873,N53933,news,newsus,Multiple Houses on Fire After Plane Crashes in NJ,"At least two houses are on fire in Colonia, Ne...",[],"[{""Label"": ""Federal Aviation Administration"", ..."
8,U100873,N8243,lifestyle,lifestyleroyals,Meghan Markle Personally Calls British Politic...,The Duchess of Sussex thanked the British fema...,"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""...","[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""..."
9,U100873,N102039,tv,tv-gallery,17 Hallmark Movies That Are Totally Worth Watc...,From festive flicks to cheesy romantic comedie...,[],"[{""Label"": ""Hallmark Channel"", ""Type"": ""M"", ""W..."
