In [9]:
import sys
sys.path.append("../../")

import os
from tempfile import TemporaryDirectory
import logging
import papermill as pm
import tensorflow as tf

from reco_utils.recommender.tfidf.tfidf_utils import TfidfRecommender
from reco_utils.dataset.download_utils import maybe_download
from reco_utils.dataset.mind import (download_mind, 
                                     extract_mind, 
                                     read_clickhistory, 
                                     get_train_input, 
                                     get_valid_input, 
                                     get_user_history,
                                     get_words_and_entities,
                                     generate_embeddings) 
from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams
from reco_utils.recommender.deeprec.models.dkn import DKN
from reco_utils.recommender.deeprec.io.dkn_iterator import DKNTextIterator

print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 1.15.2


In [10]:
# Directory the notebook is being run from.
BASE_DIR = os.getcwd()

# Message layout: 12-hour clock timestamp, level name, then the message text.
formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", datefmt='%I:%M:%S')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Root logger at INFO level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Replace (rather than append to) the handler list, leaving this stderr handler as the only one.
logger.handlers = [handler]

In [11]:
# MIND dataset variant; also used in the names of the generated train/valid files.
MIND_SIZE = "demo"

# DKN parameters (passed on to prepare_hparams further down).
epochs = 10
history_size = 50
batch_size = 100

# Paths: everything generated by this notebook lives under data_path.
data_path = '../dataset'
# The file names follow MIND_SIZE so that changing the variant does not silently
# reuse files generated for a different one.
train_file = os.path.join(data_path, f"train_mind_{MIND_SIZE}.txt")
valid_file = os.path.join(data_path, f"valid_mind_{MIND_SIZE}.txt")
user_history_file = os.path.join(data_path, "user_history.txt")
infer_embedding_file = os.path.join(data_path, "infer_embedding.txt")


## Data preparation

In this example, we walk through a real case of applying DKN to a raw news dataset from the very beginning. We will download a copy of the open-source MIND dataset in its original raw format. Then we will process the raw data files into DKN's input data format, which was described previously.

In [34]:
# NOTE(review): the download step is disabled; the train/valid folders are
# presumably already present under data_path — confirm before a fresh run.
# train_zip, valid_zip = download_mind(size=MIND_SIZE, dest_path=data_path)

# Build the folder paths with os.path.join instead of a hard-coded '\\' so the
# notebook is not tied to Windows path separators.
train_path = os.path.join(data_path, 'train')
valid_path = os.path.join(data_path, 'valid')


In [35]:
# Show the training folder path that the following cells read from.
print(train_path)

C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\train


In [6]:
# Parse behaviors.tsv of each split with the project helper; each call returns a
# (session, history) pair that the writers below consume.
train_session, train_history = read_clickhistory(train_path, "behaviors.tsv")
valid_session, valid_history = read_clickhistory(valid_path, "behaviors.tsv")
# Write the DKN input files. The log output confirms that the train file, the
# validation file and the user-history file are generated at these paths.
get_train_input(train_session, train_file)
get_valid_input(valid_session, valid_file)
get_user_history(train_history, valid_history, user_history_file)

05:32:09 INFO: Train file C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\train_mind_demo.txt successfully generated
05:32:10 INFO: Validation file C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\valid_mind_demo.txt successfully generated
05:32:10 INFO: User history file C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\user_history.txt successfully generated


In [36]:
# news.tsv of each split; train_news is also re-read as a DataFrame further down.
train_news = os.path.join(train_path, "news.tsv")
valid_news = os.path.join(valid_path, "news.tsv")
# Project helper returning two objects that are passed to generate_embeddings below.
news_words, news_entities = get_words_and_entities(train_news, valid_news)

In [8]:
# Entity embedding files of each split.
train_entities = os.path.join(train_path, "entity_embedding.vec")
valid_entities = os.path.join(valid_path, "entity_embedding.vec")
# Per the log output this step fetches/loads GloVe with dimension 100, reads the
# train and valid entities, and saves a document feature file plus word and entity
# embedding files under data_path. The three returned paths feed prepare_hparams.
news_feature_file, word_embeddings_file, entity_embeddings_file = generate_embeddings(
    data_path,
    news_words,
    news_entities,
    train_entities,
    valid_entities,
    max_sentence=10,
    word_embedding_dim=100,
)

05:32:12 INFO: Downloading glove...
05:32:12 INFO: File C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\glove.6B.zip already downloaded
05:32:12 INFO: Loading glove with embedding dimension 100...
05:32:24 INFO: Reading train entities...
05:32:25 INFO: Reading valid entities...
05:32:25 INFO: Generating word and entity indexes...
05:32:26 INFO: Generating word embeddings...
05:32:26 INFO: Generating entity embeddings...
05:32:26 INFO: Saving word and entity features in C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\doc_feature.txt
05:32:26 INFO: Saving word embeddings in C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\word_embeddings_5w_100.npy
05:32:26 INFO: Saving word embeddings in C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\entity_embeddings_5w_100.npy


In [9]:
# Show where the document feature file was written.
print(news_feature_file)

C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\doc_feature.txt


## Create hyper-parameters

In [10]:
# Fetch the DKN hyper-parameter YAML into data_path (the log shows the download is
# skipped when the file is already there).
yaml_file = maybe_download(url="https://recodatasets.blob.core.windows.net/deeprec/deeprec/dkn/dkn_MINDsmall.yaml", 
                           work_directory=data_path)
# Combine the YAML with the files generated above and the training settings
# (epochs, history_size, batch_size) defined in the parameters cell.
hparams = prepare_hparams(yaml_file,
                          news_feature_file=news_feature_file,
                          user_history_file=user_history_file,
                          wordEmb_file=word_embeddings_file,
                          entityEmb_file=entity_embeddings_file,
                          epochs=epochs,
                          history_size=history_size,
                          batch_size=batch_size)

05:32:29 INFO: File C:\Users\雷雨寒\recommenders-master\examples\02_model_content_based_filtering\data\dkn_MINDsmall.yaml already downloaded


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

05:32:29 INFO: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
05:32:29 INFO: NumExpr defaulting to 8 threads.


## Train the DKN model

In [11]:
# Build the DKN model from the hyper-parameters; DKNTextIterator is passed as the
# second argument (presumably the input iterator class — confirm against DKN).
model = DKN(hparams, DKNTextIterator)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).


Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).


Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
# Train on train_file; the log below reports the training loss and the evaluation
# metrics (auc, group_auc, mean_mrr, ndcg) after every epoch.
model.fit(train_file, valid_file)

at epoch 1
train info: logloss loss:0.48024541503129264
eval info: auc:0.6378, group_auc:0.6, mean_mrr:0.2556, ndcg@10:0.343, ndcg@5:0.2752
at epoch 1 , train time: 192.3 eval time: 67.0
at epoch 2
train info: logloss loss:0.46808412686521905
eval info: auc:0.6487, group_auc:0.6073, mean_mrr:0.2583, ndcg@10:0.3484, ndcg@5:0.281
at epoch 2 , train time: 189.8 eval time: 67.4
at epoch 3
train info: logloss loss:0.45928731221186964
eval info: auc:0.6517, group_auc:0.6234, mean_mrr:0.2699, ndcg@10:0.3612, ndcg@5:0.2967
at epoch 3 , train time: 191.1 eval time: 71.9
at epoch 4
train info: logloss loss:0.44989013579278603
eval info: auc:0.6534, group_auc:0.63, mean_mrr:0.2755, ndcg@10:0.3672, ndcg@5:0.3041
at epoch 4 , train time: 191.2 eval time: 71.6
at epoch 5
train info: logloss loss:0.43954558105511277
eval info: auc:0.6508, group_auc:0.6283, mean_mrr:0.2761, ndcg@10:0.369, ndcg@5:0.3038
at epoch 5 , train time: 185.7 eval time: 60.4
at epoch 6
train info: logloss loss:0.428255906759389

<reco_utils.recommender.deeprec.models.dkn.DKN at 0x2581afabe48>

## Evaluate the DKN model

In [16]:
# Files handed to model.predict in the next cell: the input to score and the
# destination of the result (both under data_path).
test_file = os.path.join(data_path, "test_file2.txt")
output = os.path.join(data_path, "output2.txt")

In [17]:
# Run prediction on test_file, writing to `output`; a later cell reads that file
# back as one score per line.
model.predict(test_file,output)

<reco_utils.recommender.deeprec.models.dkn.DKN at 0x2581afabe48>

In [86]:
# Destination file for the embeddings; run_get_embedding takes the document
# feature file as input (presumably writing one embedding per news item — confirm).
news_embedding = os.path.join(data_path,"news_embedding.txt")
model.run_get_embedding(news_feature_file,news_embedding)

<reco_utils.recommender.deeprec.models.dkn.DKN at 0x1582b563c88>

# 查看DKN模型的推荐结果

In [152]:
# Imports this cell needs: pandas was previously commented out (NameError on a
# fresh kernel) and numpy was imported halfway down the cell.
import numpy as np
import pandas as pd

# Training news metadata, keeping only rows in which every column is present.
news = pd.read_csv(train_news, sep='\t', names=['News ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities'])
news = news.dropna(axis=0, how='any')

# Candidate news: the third space-separated field of each line of the file that
# was passed to model.predict. The file is closed as soon as it has been read.
with open(test_file) as newsfile:
    newslist = [line.split(' ')[2].split('\n')[0] for line in newsfile]

# One score per line of the prediction output (presumably aligned line-by-line
# with test_file — the DataFrame below requires the two lists to be equally long).
# Converted to float so the sort below is numeric; sorting the raw strings would
# order them lexicographically (e.g. "9e-05" above "0.5").
with open(output) as f:
    scores = [float(line) for line in f]

ns1 = pd.DataFrame({'newslist': newslist, 'scorelist': scores})
ns1 = ns1.sort_values("scorelist", ascending=False).reset_index(drop=True)

# IDs of the ten highest-scoring candidates, best first.
NewsID = ns1[0:10]['newslist'].values

ID = news['News ID'].values
df = news.copy()

# Row positions in `news` of those ten IDs, kept in ranking order.
ilist = []
for news_id in NewsID:
    ilist.extend(np.where(ID == news_id)[0])

In [141]:
# Show the result: the rows of `news` selected in the previous cell.
df.iloc[ilist]

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
26414,N12985,music,music-celebrity,Broadway Star Laurel Griggs Suffered Asthma At...,"Teen star Laurel Griggs, who passed away on No...",https://www.msn.com/en-us/music/music-celebrit...,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...","[{""Label"": ""Once (musical)"", ""Type"": ""W"", ""Wik..."
17978,N5323,lifestyle,lifestyleroyals,Prince Harry Talked to Another Royal About His...,Prince Albert of Monaco shares the personal ad...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty...","[{""Label"": ""Albert II, Prince of Monaco"", ""Typ..."
20019,N13667,lifestyle,lifestyleroyals,Prince Harry and Meghan Markle just shared a n...,The Duke and Duchess of Sussex shared a new ph...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Charles, Prince of Wales"", ""Type"":...","[{""Label"": ""Charles, Prince of Wales"", ""Type"":..."
19953,N11302,tv,tv-celebrity,"Counting On's Josiah, Lauren Welcome 1st Child...","Counting On's Josiah, Lauren Welcome 1st Child...",https://www.msn.com/en-us/tv/tv-celebrity/coun...,"[{""Label"": ""19 Kids and Counting"", ""Type"": ""W""...","[{""Label"": ""19 Kids and Counting"", ""Type"": ""W""..."
306,N10331,lifestyle,lifestyleroyals,"Meghan and Harry to take 'family time' off, sa...",The Duke and Duchess of Sussex will take a bre...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty...","[{""Label"": ""Duke of Sussex"", ""Type"": ""U"", ""Wik..."
3103,N16759,lifestyle,lifestyleroyals,How Kate Middleton and Prince William's royal ...,Harry and Meghan had a much more relaxed appro...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince William, Duke of Cambridge""...","[{""Label"": ""Prince William, Duke of Cambridge""..."
26685,N579,lifestyle,lifestyleroyals,Why Kate & Meghan Were on Different Balconies ...,There's no scandal here. It's all about the or...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""...",[]
24837,N19695,lifestyle,lifestyleroyals,Why Prince Harry Wore His Remembrance Poppy Di...,Why Prince Harry's Poppy Was Worn on His Cap,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty...","[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty..."
22434,N22351,lifestyle,lifestyleroyals,Meghan Markle and Prince Harry Won't Spend Chr...,They'll hang out with baby Archie and Meghan's...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""...","[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""..."


# 查看训练用户历史数据--impression为0

In [79]:
# User whose training lines are inspected.
target_user = 'U82271'

# Read the generated training file and close it straight away.
with open(train_file) as train_fh:
    train_data = train_fh.readlines()

# Keep the news ID (third field) of every line whose first field is '0' and whose
# second field, after the "train_" prefix, is the target user.
NewsID = []
for line in train_data:
    fields = line.split(' ')
    if fields[0] != '0':
        continue
    User = fields[1].split('train_')[1]
    if User == target_user:
        NewsID.append(fields[2].split('\n')[0])
print(NewsID)

ID = news['News ID'].values
df = news.copy()
# Positions of the matching rows, in the order they appear in `news`. A set makes
# each membership test O(1) and the loop variable no longer shadows builtin `id`.
wanted_ids = set(NewsID)
ilist = [pos for pos, news_id in enumerate(ID) if news_id in wanted_ids]

df.iloc[ilist]

['N18305', 'N7547', 'N22727', 'N5403', 'N5403', 'N11658', 'N7547', 'N19411', 'N28012', 'N1472', 'N2526', 'N19039']


Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
25981,N1472,sports,football_nfl,Watch: Jason Garrett has great reaction to Ama...,Garrett was pretty excited about Cooper's impr...,https://www.msn.com/en-us/sports/nfl/watch-jas...,"[{""Label"": ""Amari Cooper"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Amari Cooper"", ""Type"": ""P"", ""Wikid..."
25982,N19039,weather,weathertopstories,Satellite Imagery Shows Scale of New South Wal...,Several emergency-level bushfires were burning...,https://www.msn.com/en-us/weather/weathertopst...,"[{""Label"": ""New South Wales"", ""Type"": ""G"", ""Wi...","[{""Label"": ""Richmond Valley Council"", ""Type"": ..."
25985,N2526,finance,finance-saving-investing,Opinion: Why a 'Santa Claus rally' for stocks ...,Stocks typically rise between Thanksgiving and...,https://www.msn.com/en-us/finance/finance-savi...,"[{""Label"": ""Santa Claus"", ""Type"": ""R"", ""Wikida...","[{""Label"": ""Mark Hulbert"", ""Type"": ""P"", ""Wikid..."
25994,N28012,weather,weathertopstories,WBZ Morning Forecast For November 12,Sarah Wroblewski has your latest weather forec...,https://www.msn.com/en-us/weather/weathertopst...,[],[]
26202,N11658,news,newsus,Vehicle Crashes Into Second Floor Of Toms Rive...,A vehicle somehow crashed into the second floo...,https://www.msn.com/en-us/news/newsus/vehicle-...,[],"[{""Label"": ""Toms River, New Jersey"", ""Type"": ""..."
26204,N7547,sports,football_nfl,Bold prediction of the week: Lions defense shu...,"After a rough outing against the Raiders, expe...",https://www.msn.com/en-us/sports/nfl/bold-pred...,"[{""Label"": ""Detroit Lions"", ""Type"": ""O"", ""Wiki...","[{""Label"": ""Mitchell Trubisky"", ""Type"": ""P"", ""..."
26206,N19411,news,newspolitics,Politician resigns after insensitive comments ...,"PROSPECT, CT (WFSB) The treasurer of the Dem...",https://www.msn.com/en-us/news/newspolitics/po...,"[{""Label"": ""Twitter"", ""Type"": ""O"", ""WikidataId...","[{""Label"": ""Prospect, Connecticut"", ""Type"": ""G..."
26207,N5403,sports,football_nfl,New Eagles safety has funny story from NFL Dra...,PHILADELPHIA -- Marcus Epps waited to hear his...,https://www.msn.com/en-us/sports/football_nfl/...,"[{""Label"": ""Tim Hauck"", ""Type"": ""P"", ""Wikidata...","[{""Label"": ""Philadelphia Eagles"", ""Type"": ""O"",..."
26209,N22727,sports,football_nfl,Frank Clark active against Titans; LeSean McCo...,The inactives list has a small surprise and ...,https://www.msn.com/en-us/sports/football_nfl/...,"[{""Label"": ""LeSean McCoy"", ""Type"": ""P"", ""Wikid...",[]


# 查看训练用户历史数据--impression为1

In [83]:
# User whose validation lines are inspected.
target_user = 'U82271'

# Read the generated validation file and close it straight away.
with open(valid_file) as valid_fh:
    valid_data = valid_fh.readlines()

# NOTE(review): the section heading mentions impression 1, but this filter keeps
# lines whose first field is '0' — confirm which value is intended.
# The news ID is the part of the third field before '%'; the user is the part of
# the second field after the "valid_" prefix.
NewsID = []
for line in valid_data:
    fields = line.split(' ')
    if fields[0] != '0':
        continue
    User = fields[1].split('valid_')[1]
    if User == target_user:
        NewsID.append(fields[2].split('%')[0])
print(NewsID)

ID = news['News ID'].values
df = news.copy()
# Positions of the matching rows, in the order they appear in `news`. A set makes
# each membership test O(1) and the loop variable no longer shadows builtin `id`.
wanted_ids = set(NewsID)
ilist = [pos for pos, news_id in enumerate(ID) if news_id in wanted_ids]

df.iloc[ilist]

['N12103', 'N20460', 'N7144', 'N26261', 'N25099', 'N10864', 'N7581']


Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
2,N12103,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://www.msn.com/en-us/health/voices/i-was-...,[],"[{""Label"": ""National Basketball Association"", ..."
3,N20460,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://www.msn.com/en-us/health/medical/how-t...,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."
6,N7144,news,newsscienceandtechnology,"How to report weather-related closings, delays","When there are active closings, view them here...",https://www.msn.com/en-us/news/newsscienceandt...,[],"[{""Label"": ""WXII-TV"", ""Type"": ""M"", ""WikidataId..."
14,N26261,sports,football_nfl,"John Dorsey admits talks with Washington, but ...","Team officials in Washington ""emphatically"" de...",https://www.msn.com/en-us/sports/football_nfl/...,"[{""Label"": ""John Dorsey (American football)"", ...","[{""Label"": ""John Dorsey (American football)"", ..."
18,N25099,health,weightloss,These Simple Diet Changes Helped This Guy Lose...,The part of me that knew I had to make changes...,https://www.msn.com/en-us/health/weightloss/th...,[],[]
19,N10864,weather,weathertopstories,"A little snow causes a big mess, more than 100...",A snowfall of a mere one-tenth of an inch caus...,https://www.msn.com/en-us/weather/weathertopst...,"[{""Label"": ""Minnesota"", ""Type"": ""G"", ""Wikidata...","[{""Label"": ""Minnesota State Patrol"", ""Type"": ""..."
7507,N7581,news,newscrime,Florida authorities bust trafficking ring smug...,Two men have been charged for poaching thousan...,https://www.msn.com/en-us/news/newscrime/flori...,"[{""Label"": ""Florida"", ""Type"": ""G"", ""WikidataId...","[{""Label"": ""Florida Fish and Wildlife Conserva..."


# 查看历史点击过的新闻

In [44]:
# Read the user-history file and close it straight away.
with open(user_history_file) as history_fh:
    history_data = history_fh.readlines()

# Only the first line is inspected: its second space-separated field is a
# comma-separated list of news IDs.
line = history_data[0]

News = line.split(' ')[1].split('\n')[0]
NewsID = News.split(',')

ID = news['News ID'].values
df = news.copy()
# Positions of the matching rows, in the order they appear in `news`. A set makes
# each membership test O(1) and the loop variable no longer shadows builtin `id`.
wanted_ids = set(NewsID)
ilist = [pos for pos, news_id in enumerate(ID) if news_id in wanted_ids]

df.iloc[ilist]

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N3112,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
17,N25192,lifestyle,lifestyleroyals,Every outfit Duchess Kate has worn in 2019,See Kate Middleton's style choices this year f...,https://www.msn.com/en-us/lifestyle/lifestyler...,[],"[{""Label"": ""Catherine, Duchess of Cambridge"", ..."
66,N1172,lifestyle,lifestyleroyals,The surprising age differences between your fa...,Here are the age differences between Meghan Ma...,https://www.msn.com/en-us/lifestyle/lifestyler...,[],"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""..."
197,N16959,lifestyle,lifestyleroyals,Queen Elizabeth's Favorite Beauty Products Hav...,"Here, all the brands the British monarch swear...",https://www.msn.com/en-us/lifestyle/lifestyler...,[],[]


# CF部分

In [61]:
# One entry per line of the file, with newline characters stripped from both ends.
# NOTE(review): this rebinds `news` (a DataFrame in the cells above) to a plain
# list; cells that expect the DataFrame must not be run after this one.
news = []
with open('./data/news_1000.txt','r') as f:
    news.extend(str(row.strip('\n')) for row in f)

In [62]:
import numpy as np

# One number per line of the file, each scaled by 0.1.
df_list = []
with open('./data/output_result.txt','r') as f:
    df_list.extend(float(row.strip('\n'))*0.1 for row in f)

In [63]:
# Cut the flat score list into 1000 consecutive rows of 1000 values each, keyed by
# row number (the following cells treat rows as users and columns as news items).
df = {
    i: [df_list[i * 1000 + j] for j in range(0, 1000)]
    for i in range(0, 1000)
}
# Number of values consumed from df_list.
k = 1000 * 1000

In [64]:
# Pearson correlation between row 0 and every row b of df (row 0 included).

# Norm of row 0 around its own mean.
ra = np.mean(df[0])
left = 0
for p in range(0, 1000):  # news
    dev_a = df[0][p] - ra
    left += dev_a * dev_a
left = left ** 0.5

sim = {}
for b in range(0, 1000):  # user
    rb = np.mean(df[b])
    up = 0      # covariance term between row 0 and row b
    right = 0   # squared norm of row b around its mean
    for p in range(0, 1000):  # news
        dev_a = df[0][p] - ra
        dev_b = df[b][p] - rb
        up += dev_a * dev_b
        right += dev_b * dev_b

    right = right ** 0.5
    down = left * right
    sim[b] = up / down
    

In [65]:
# Select the neighbours once instead of re-testing the thresholds and recomputing
# np.mean(df[b]) for every (news, user) pair.
#   0.5 < |sim| < 0.8  -> contribution counted once
#   |sim| >= 0.8       -> contribution counted twice
#   otherwise          -> row ignored
# NOTE(review): row 0 correlates perfectly with itself, so the target row is
# always one of its own neighbours — confirm this is intended.
neighbours = []
for b in range(0, 1000):  # user
    s = sim[b]
    if (s > 0.5 and s < 0.8) or (s < -0.5 and s > -0.8):
        boost = 1
    elif s >= 0.8 or s <= -0.8:
        boost = 2
    else:
        continue
    neighbours.append((b, s, np.mean(df[b]), boost))

# The normaliser does not depend on the news item, so it is summed once.
# NOTE(review): it is the signed sum of similarities; the usual weighted-average
# form divides by the sum of |similarity|. Left unchanged here — confirm.
down = 0
for _, s, _, _ in neighbours:
    down += s

pred = {}
for p in range(0, 1000):  # news
    up = 0
    for b, s, rb, boost in neighbours:
        up += s * (df[b][p] - rb) * boost
    # Fall back to the target row's mean when the normaliser is zero.
    pred[p] = ra + up / down if down != 0 else ra

In [66]:
# (news index, predicted score) pairs ordered by score, highest first.
result = sorted(pred.items(), key=lambda item:item[1],reverse=True)

In [67]:
# pandas is imported here because, reading top to bottom, no earlier cell has an
# active `import pandas`.
import pandas as pd

# Scores are expressed relative to the best prediction, so the first entry is 1.0.
# The value is no longer stored in a global called `max`, which shadowed the
# builtin for the rest of the session.
top_score = result[0][1]
newslist = [news[idx] for idx, _ in result]
scorelist = [score / top_score for _, score in result]

ns2 = pd.DataFrame({'newslist': newslist, 'scorelist': scorelist})

# CF结果

In [68]:
# Display the first 20 rows of the CF ranking.
ns2[0:20]

Unnamed: 0,newslist,scorelist
0,N17473,1.0
1,N5387,0.992123
2,N10887,0.966473
3,N3627,0.943871
4,N13667,0.892142
5,N10063,0.882228
6,N19904,0.88033
7,N22351,0.875072
8,N9951,0.862962
9,N2210,0.856681


# TF-IDF部分

In [91]:
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 16 17:17:18 2020

@author: 雷雨寒
"""
import sys
sys.path.append("../../")

import pandas as pd
import requests
import bs4
import numpy as np
import os

from tfidf.tfidf_utils import TfidfRecommender


class reader:
    """A user id together with lists of liked and disliked news IDs."""

    def __init__(self, uid, like):
        """Store ``uid`` and a fresh list copy of ``like``; ``dislike`` starts empty."""
        self.uid = uid
        self.like = list(like)
        self.dislike = []



def srisnull(news):
    """Drop, in place, every row of ``news`` whose 'Abstract' value is missing.

    The previous version passed a one-row DataFrame to ``drop`` as labels and
    discarded the result, so it either raised or had no effect.

    Args:
        news: DataFrame with an 'Abstract' column. It is modified in place.

    Returns:
        None.
    """
    missing_rows = news.index[news['Abstract'].isnull()]
    news.drop(index=missing_rows, inplace=True)

def remove_duplicates(df, cols):
    """Drop rows that repeat an earlier value in any of the given columns.

    The columns are processed one after another; for each one the index is first
    renumbered 0..n-1 and then every row whose value already occurred is removed.

    Args:
        df: DataFrame to filter.
        cols: Column names to de-duplicate on, in order.

    Returns:
        The filtered DataFrame (``df`` itself when ``cols`` is empty).
    """
    for col in cols:
        df = df.reset_index(drop=True)
        # True for the second and later occurrences of a value in this column.
        repeated = df.duplicated([col])
        df = df[~repeated]

    return df


def remove_nan(df, cols):
    """Keep only the rows that have a real value in every one of ``cols``.

    A cell counts as missing when it is NaN/None or the empty string. The input
    frame is left untouched: the earlier chained ``df[col].replace(..., inplace=True)``
    wrote into the caller's data and has no effect under pandas copy-on-write.

    Args:
        df: DataFrame to filter.
        cols: Column names that must be populated.

    Returns:
        The filtered DataFrame (``df`` itself when ``cols`` is empty).
    """
    for col in cols:
        values = df[col]
        # NaN != "" evaluates to True, so the notna() test is needed as well.
        populated = values.notna() & (values != "")
        df = df[populated]

    return df


def clean_dataframe(df):
    """Return ``df`` without repeated news IDs and without incomplete rows.

    Rows are first de-duplicated on 'News ID', then any row missing a value in
    'Category', 'SubCategory', 'Title' or 'Abstract' is removed.
    """
    deduplicated = remove_duplicates(df, ["News ID"])
    return remove_nan(deduplicated, ['Category','SubCategory','Title','Abstract'])

def user_recommendation(reader):
    """Rank TF-IDF recommendations for every news item the reader likes.

    Uses the module-level ``recommender`` and ``news``: tokenizes the 'Abstract'
    column, fits the recommender, then gathers the recommendations for each ID in
    ``reader.like`` into one frame ordered by similarity.

    Args:
        reader: Object with a ``like`` attribute listing liked news IDs.

    Returns:
        DataFrame sorted by 'similarity_score' (descending) with a 'rank' column
        and a fresh 0..n-1 index; an empty DataFrame when ``reader.like`` is empty.
    """
    tf, vectors_tokenized = recommender.tokenize_text(news, text_col='Abstract')
    recommender.fit(tf, vectors_tokenized)

    # The return value is not used; the call is kept because the per-item lookups
    # below presumably rely on it having been run — confirm against TfidfRecommender.
    recommender.recommend_top_k_items(news, k=20)

    cols_to_keep = ['News ID', 'Title', 'Abstract']

    # Collect the per-item frames and concatenate once: DataFrame.append inside a
    # loop copies the accumulated frame every time and no longer exists in pandas 2.
    per_item = [
        recommender.get_top_k_recommendations(news, liked_id, cols_to_keep)
        for liked_id in reader.like
    ]
    if not per_item:
        return pd.DataFrame()

    rec = pd.concat(per_item)
    rec = rec.sort_values(by=['similarity_score'], ascending=False)
    # The number of rows depends on how many items are liked, so the rank is
    # sized from the frame rather than fixed at 20.
    rec['rank'] = range(len(rec))
    rec = rec.reset_index(drop=True)

    return rec

# Build a two-level dictionary over all news categories, shaped as
# {Category: {SubCategory: [News ID, ...]}}.
# NOTE: the main block stores the result in news_dict, which no visible cell reads.
def create_dictionary(news):
    """Group news IDs by category and sub-category.

    Args:
        news: DataFrame with 'Category', 'SubCategory' and 'News ID' columns.

    Returns:
        dict mapping each category to a dict that maps each of its sub-categories
        to the list of news IDs in row order.
    """
    news_dict = {}
    rows = zip(news['Category'], news['SubCategory'], news['News ID'])
    for category, sub_category, news_id in rows:
        # Create the category / sub-category containers on first sight.
        by_sub_category = news_dict.setdefault(category, {})
        by_sub_category.setdefault(sub_category, []).append(news_id)
    return news_dict

# From the articles a user likes, derive the categories the user likes.
# The result is shaped as {Category: [SubCategory, ...]}.
def Get_like_Category(like):
    """Map each liked category to the sub-categories the user liked in it.

    Looks every ID up in the module-level ``news`` frame. IDs that are not present
    are skipped instead of raising IndexError, and each sub-category is listed at
    most once per category, matching the inline version of this logic later in
    the notebook.

    Args:
        like: Iterable of liked news IDs.

    Returns:
        dict mapping a category to the list of liked sub-categories in it.
    """
    Category = {}
    for like_item in like:
        matches = news[news['News ID'] == like_item]
        if len(matches) == 0:
            continue
        # File the sub-category under its category, creating the list on demand.
        sub_categories = Category.setdefault(matches['Category'].values[0], [])
        sub_category = matches['SubCategory'].values[0]
        if sub_category not in sub_categories:
            sub_categories.append(sub_category)

    return Category

# Re-generate the recommendation list from the categories the user likes.
# The module-level `news` holds the raw news data; `rec` is the TF-IDF result.
def re_rec(Category, rec):
    """Re-weight similarity scores by category match and re-sort.

    Each score is divided by 1.25 times the first row's score and multiplied by
    0.8  when the item's category is not a liked category,
    0.9  when the category is liked but the sub-category is not,
    1.25 when both the category and the sub-category are liked.

    The caller's ``rec`` is no longer modified: writing the new scores back into
    it meant that re-running the calling cell re-weighted already re-weighted
    scores. Rows are read by position, so ``rec`` does not need a 0..n-1 index.

    Args:
        Category: dict mapping a liked category to its liked sub-categories.
        rec: DataFrame with 'News ID' and 'similarity_score' columns whose first
            row carries the reference score.

    Returns:
        A new DataFrame with updated 'similarity_score', sorted descending; the
        index labels of ``rec`` are kept.
    """
    reranked = rec.copy()
    if len(reranked) == 0:
        return reranked

    scores = reranked['similarity_score'].tolist()
    # Named `norm` rather than `max` so the builtin stays usable in this scope.
    norm = 1.25 * scores[0]
    print(norm)

    new_scores = []
    for news_id, score in zip(reranked['News ID'], scores):
        news_item = news[news['News ID'] == news_id]
        category = news_item['Category'].values[0]
        sub_category = news_item['SubCategory'].values[0]
        if category not in Category:
            weight = 0.8
        elif sub_category not in Category[category]:
            weight = 0.9
        else:
            weight = 1.25
        new_scores.append(weight * score / norm)

    reranked['similarity_score'] = new_scores
    # Re-sort by the new scores.
    return reranked.sort_values(by=['similarity_score'], ascending=False)



if __name__ == "__main__":
    # Load the news metadata; the file has no header row, so names are supplied.
    news_columns = ['News ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']
    news = pd.read_csv('../dataset/train/news.tsv', sep='\t', names=news_columns)

    # Clean, keep the first 2000 rows and renumber them 0..n-1.
    news = clean_dataframe(news)[0:2000].reset_index(drop=True)
    news_dict = create_dictionary(news)

    # TF-IDF recommender keyed on the news ID.
    recommender = TfidfRecommender(id_col='News ID', tokenization_method='scibert')

    # Load the behaviors and drop rows with any missing field.
    behavior_columns = ['Impression ID', 'User ID', 'Time', 'History', 'Impressions']
    behaviors = pd.read_csv('../dataset/train/behaviors.tsv', sep='\t', names=behavior_columns)
    behaviors = behaviors.dropna(axis=0, how='any')


    

In [92]:
# Build a reader for the user behind impression 1.
behave = behaviors[behaviors['Impression ID'] == 1]
# The liked IDs are fixed by hand; deriving them from the History column would be:
# like1=str(behave['History'].values[0]).split(' ')
like1 = ['N3112', 'N25192', 'N1172', 'N16959']
uid = behave['User ID'].values
readername = reader(uid, like1)
history_news = pd.DataFrame()

# Positions of the liked articles, in the order they appear in `news`.
ID = news['News ID'].values
df = news.copy()
ilist = [pos for pos, news_id in enumerate(ID) if news_id in readername.like]

df.iloc[ilist]

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N3112,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
17,N25192,lifestyle,lifestyleroyals,Every outfit Duchess Kate has worn in 2019,See Kate Middleton's style choices this year f...,https://www.msn.com/en-us/lifestyle/lifestyler...,[],"[{""Label"": ""Catherine, Duchess of Cambridge"", ..."
65,N1172,lifestyle,lifestyleroyals,The surprising age differences between your fa...,Here are the age differences between Meghan Ma...,https://www.msn.com/en-us/lifestyle/lifestyler...,[],"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""..."
190,N16959,lifestyle,lifestyleroyals,Queen Elizabeth's Favorite Beauty Products Hav...,"Here, all the brands the British monarch swear...",https://www.msn.com/en-us/lifestyle/lifestyler...,[],[]


In [93]:
# Collect the news categories this reader likes: {Category: [SubCategory, ...]}.
Category = {}
for like_item in readername.like:
    # Rows of `news` for this liked article; IDs without a match are skipped.
    liked_rows = news[news['News ID'] == like_item]
    cate = liked_rows['Category'].values
    if not len(cate):
        continue
    # File the sub-category under its category, listing each one only once.
    liked_subs = Category.setdefault(cate[0], [])
    sub_cate = liked_rows['SubCategory'].values[0]
    if sub_cate not in liked_subs:
        liked_subs.append(sub_cate)

In [94]:
# Bound to `tfidf_model` rather than `tf`: the first cell imports tensorflow as
# `tf`, and rebinding that name here would break any later use of tensorflow.
tfidf_model, vectors_tokenized = recommender.tokenize_text(news, text_col='Abstract')

recommender.fit(tfidf_model, vectors_tokenized)

topk = recommender.recommend_top_k_items(news, k=20)

cols_to_keep = ['News ID', 'Category', 'SubCategory', 'Title']

# Gather the recommendations for every liked article, then concatenate once:
# DataFrame.append inside a loop copies the accumulated frame each time and no
# longer exists in pandas 2.
like1 = readername.like
per_item = [
    recommender.get_top_k_recommendations(news, liked_id, cols_to_keep)
    for liked_id in like1
]
rec = pd.concat(per_item)

# Order by similarity, number the rows in that order, then renumber the index.
rec = rec.sort_values(by=['similarity_score'], ascending=False)
rec['rank'] = range(len(rec))
rec = rec.reset_index(drop=True)

In [103]:
# Re-weight the TF-IDF scores by the reader's liked categories.
result1 = re_rec(Category,rec)
# re_rec already returns the frame sorted by score; this repeats that ordering.
result1 = result1.sort_values(by=['similarity_score'], ascending=False)

1.25


In [104]:
# result1 is sorted by score but keeps its pre-sort index labels, so the columns
# are read in row order. Looking rows up with .loc[i] for i = 0, 1, 2, ... walked
# the labels instead and silently restored the order from before the re-ranking.
newslist = result1['News ID'].tolist()
scorelist = result1['similarity_score'].tolist()

In [105]:
# TF-IDF ranking as a two-column frame, in the same layout as ns1 and ns2.
ns3 = pd.DataFrame({'newslist': newslist, 'scorelist': scorelist})

# 模型结果混合部分

In [163]:
# Blend the three rankings: every news ID collects weight * score from each list
# it appears in. ns1 is the DKN ranking, ns2 the CF ranking, ns3 the TF-IDF one.
TOP_N = 20
sources = [(ns1, 0.4), (ns2, 0.5), (ns3, 0.1)]

# Never read past the end of the shortest ranking (a fixed 20 raised KeyError
# whenever one of the lists had fewer rows).
top_n = min([TOP_N] + [len(frame) for frame, _ in sources])

ns = {}
for i in range(top_n):
    # Register the three IDs of this rank first so the dict keeps the same
    # insertion order as before, then add the weighted scores.
    for frame, _ in sources:
        ns.setdefault(frame.loc[i, 'newslist'], 0)
    for frame, weight in sources:
        ns[frame.loc[i, 'newslist']] += float(frame.loc[i, 'scorelist']) * weight

In [164]:
# News IDs ordered by blended score, highest first (the dict keeps that order).
result_dict=dict(sorted(ns.items(), key=lambda d:d[1], reverse = True))

In [165]:
# News IDs in blended-score order.
NewsID = result_dict.keys()

ID = news['News ID'].values
df = news.copy()
# Row positions in `news` for each ranked ID, kept in ranking order.
# NOTE(review): `news` is whatever was last bound to that name — confirm that it
# is the intended news DataFrame when this cell runs.
ilist = [pos for news_id in NewsID for pos in np.where(ID == news_id)[0]]
df.iloc[ilist[0:10]]

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
20019,N13667,lifestyle,lifestyleroyals,Prince Harry and Meghan Markle just shared a n...,The Duke and Duchess of Sussex shared a new ph...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Charles, Prince of Wales"", ""Type"":...","[{""Label"": ""Charles, Prince of Wales"", ""Type"":..."
20187,N19904,lifestyle,lifestyleroyals,"Prince Harry and Prince William's Rift Is ""One...",Santa is shook.,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty...","[{""Label"": ""Santa Claus"", ""Type"": ""R"", ""Wikida..."
22434,N22351,lifestyle,lifestyleroyals,Meghan Markle and Prince Harry Won't Spend Chr...,They'll hang out with baby Archie and Meghan's...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""...","[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""..."
3103,N16759,lifestyle,lifestyleroyals,How Kate Middleton and Prince William's royal ...,Harry and Meghan had a much more relaxed appro...,https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince William, Duke of Cambridge""...","[{""Label"": ""Prince William, Duke of Cambridge""..."
21225,N15539,lifestyle,lifestyleroyals,Harry and Meghan Revive Feud Rumors By Staying...,"So, what are you doing for Christmas?",https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Meghan, Duchess of Sussex"", ""Type""...","[{""Label"": ""Christmas"", ""Type"": ""H"", ""Wikidata..."
97,N17473,finance,finance-real-estate,The penthouse of NYC's Woolworth Building just...,"The condo comes as a ""white-box"" unit, meaning...",https://www.msn.com/en-us/finance/finance-real...,"[{""Label"": ""Woolworth Building"", ""Type"": ""F"", ...",[]
1271,N5387,lifestyle,lifestyleroyals,The Most Adorable Photos of Archie Harrison Mo...,The royal baby boy is growing up so fast!,https://www.msn.com/en-us/lifestyle/lifestyler...,[],[]
15207,N10887,lifestyle,lifestyleroyals,19 Rarely Seen Photos of Royal Siblings,We've pulled together some rarely seen photos ...,https://www.msn.com/en-us/lifestyle/lifestyler...,[],[]
551,N3627,news,newscrime,Motorcyclist dies after crash with car in Quee...,A man who was riding a motorcycle died of his ...,https://www.msn.com/en-us/news/newscrime/motor...,"[{""Label"": ""Queen Creek, Arizona"", ""Type"": ""G""...","[{""Label"": ""Queen Creek, Arizona"", ""Type"": ""G""..."
1267,N837,lifestyle,lifestyleroyals,"Already 6 Months! Prince Harry, Duchess Meghan...","Already 6 Months! Prince Harry, Duchess Meghan...",https://www.msn.com/en-us/lifestyle/lifestyler...,"[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty...","[{""Label"": ""Prince Harry, Duke of Sussex"", ""Ty..."
