In [3]:
# Data mining group project, 2021

import pandas as pd
import numpy as np
import re
import html as ihtml
from bs4 import BeautifulSoup
from urllib.request import urlopen
import random as random
from datetime import datetime
import ast
import time
import sys

# Original data from MIND
newsTSV = pd.read_csv("MINDlarge_train/news.tsv", sep="\t", header=None)
wikidataTSV = pd.read_csv("MINDlarge_train/entity_embedding.vec", sep="\t", header=None)
behaviorTSV = pd.read_csv("MINDlarge_train/behaviors.tsv", sep="\t", header=None)

# Processed news data
newsDF = None

# Processed behavior data
behaviorDF = None

# Processed embedding data
embeddingsDF = None

In [13]:
### ============= Handle news articles ==================

def getNewsArticleDataFromWeb(article):
    url = article[5]
    if len(url) < 5:  # Cannot be valid
        return None
    page = urlopen(url)
    html = page.read().decode("utf-8")
    dom = BeautifulSoup(html)
    
    # Get publish date
    pDate = None
    elementNames = ["time"]
    for eName in elementNames:
        eValue = dom.find(eName)
        if len(eValue) == 1:
            dStr = eValue.string.strip("\r\n").strip(" ")
            pDate = datetime.strptime(dStr, "%m/%d/%Y")
    
    return pDate

def addPublishDates(news, start, end):
    d = []
    for i in range(start, end):
        date = ""
        try:
            date = getNewsArticleDataFromWeb(news.iloc[i])
        except KeyboardInterrupt:
            raise
        except:
            print("Unexpected error for item:", news.iloc[i][0], ":", sys.exc_info()[0])
        news.loc[i, "Timestamp"] = date

def createNewsDataframe(rawData, start):
    # Add timestamp column to existing dataset
    CHUNK_SIZE = 500
    i = start//CHUNK_SIZE
    last = 0
    L = len(rawData)
    rawData["Timestamp"] = range(len(rawData))  # Add new column for date
    while last < L:
        last = min((i + 1) * CHUNK_SIZE, L)
        addPublishDates(rawData, i * CHUNK_SIZE, last)
        print("Total items read:", last)
        i = i + 1
    return rawData

### =========== Handle impressions ================

def createValidArticlesLookupTable(newsTSV, embeddingsDF):
    # For faster execution, create a dictionary with valid news article id as keys
    # An article is valid iff it has a non-zero embedding vector
    d = {}
    for i in range(len(newsTSV)):
        aid = newsTSV.iloc[i][0]
        rowDF = embeddingsDF[embeddingsDF["NID"] == aid]
        if rowDF.empty:
            continue        
        w = np.array(rowDF.iloc[0][1:].values)
        n = np.linalg.norm(w)
        if (n > 1e-6):
            d[aid] = True
        if i % 1000 == 0:
            print(i, end=", ")
    return d

def getBehaviorDataFromImpressions(impressions, lookupTable):
    # Create three lists of news articles from the given impressions, i.e.:
    # 1. history before the impression was shown
    # 2. chosen when the impression was shown
    # 3. not chosen when the impression was shown

    resultDF = pd.DataFrame()
    skipped = 0
    for i in range(len(impressions)):
        added = False
        row = impressions.iloc[i]
        iid = row[0]
        uid = row[1]
        timestamp = row[2]
        #try:
        if type(row[3]) == str and type(row[4] == str):
            
            history = [x for x in row[3].split(" ") if x in lookupTable]
            impressionList = row[4].split(" ")
            
            if len(history) > 0 and len(impressionList) > 0:

                # Create separate lists with news that the user selected and did not select
                positive = [r[:len(r)-2] for r in impressionList if r[-2:] == "-1" and r[:len(r)-2] in lookupTable]
                negative = [r[:len(r)-2] for r in impressionList if r[-2:] == "-0" and r[:len(r)-2] in lookupTable]

                if len(history) > 0 and (len(positive) > 0 or len(negative) > 0):
                    df = pd.DataFrame({"IID":iid, "UID":uid, "Timestamp":timestamp, "History":[history], "Positive":[positive], "Negative":[negative]})
                    resultDF = resultDF.append(df, ignore_index=True)
                    added = True

        #except KeyboardInterrupt:
        #    raise
        #except:
        #    print("Unexpected error for user:", uid, "with index", i, ":", sys.exc_info()[0])
        if not added:
            skipped += 1
        
    if skipped > 0:
        print("Skipped", skipped, end=", ")
        
    return resultDF

def createBehaviorDataframe(impressionDataset, lookupTable):
    # Create the frame with user history and selections (one row for each impression, only valid articles included)
    CHUNK_SIZE = 1000
    L = len(impressionDataset)
    last = 0
    i = 0
    df = pd.DataFrame()
    while last < L:
        last = min((i + 1) * CHUNK_SIZE, L)
        subset = impressionDataset[i * CHUNK_SIZE:last]
        df = df.append(getBehaviorDataFromImpressions(subset, lookupTable))
        print("Total items read:", len(df))
        i = i + 1
    return df

### ============== Handle embeddings ===============

def getWikidataReferences(newsArticle):
    t = [entity["WikidataId"] for entity in ast.literal_eval(newsArticle[6])]  # From title
    a = [entity["WikidataId"] for entity in ast.literal_eval(newsArticle[7])]  # From abstract
    return t + a

def createEmbeddingsDataframe(newsTSV, wikidataTSV):
    # For each news article, combine all its embeddings from Wikidata into one single vector
    N_VECTOR_COLS = 100    # Extract all numerical values except from the last column, which is almost always "nan"
    colList = ["NID"] + list(range(N_VECTOR_COLS))
    resultDF = pd.DataFrame(columns=colList)
    empty = 0
    for i in range(len(newsTSV)):
        try:
            article = newsTSV.iloc[i]
            sumVector = np.zeros(N_VECTOR_COLS)
            n_vectors = 0
            for e in getWikidataReferences(article):       # For each embedding reference
                row = wikidataTSV[wikidataTSV[0]==e]
                if not row.empty:                          # If we have data for this reference:
                    w = np.array(row.iloc[0][1:N_VECTOR_COLS + 1].values)   
                    sumVector = sumVector + w              # Combine embeddings by taking the sum
                    n_vectors = n_vectors + 1
                else:
                    empty += 1
        except KeyboardInterrupt:
            raise
        except:
            print("Unexpected error for item:", article[0], "with index", i, ":", sys.exc_info()[0])

        sumVector = sumVector / n_vectors  # Store the mean of the embeddings
        resultDF.loc[i] = [article[0]] + list(sumVector)
        if i % 1000 == 0:
            print("Read", i, "empty", empty)
    
    return resultDF   

In [None]:
# This takes around one hour for 50 000 articles...
#newsDF = createNewsDataframe(newsTSV, 0)

In [181]:
newsDF

Unnamed: 0,0,1,2,3,4,5,6,7,Timestamp
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],2019-09-02 00:00:00
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",2019-05-07 00:00:00
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",2019-10-25 00:00:00
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",2019-06-03 00:00:00
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",2019-05-01 00:00:00
...,...,...,...,...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...",2019-11-11 00:00:00
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[],2019-11-11 00:00:00
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[],2019-11-11 00:00:00
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[],2019-11-11 00:00:00


In [11]:
embeddingsDF = createEmbeddingsDataframe(newsTSV, wikidataTSV)

3
Read 0 empty 0
4
2
1
1
1
4
1
2
2


In [12]:
embeddingsDF

Unnamed: 0,NID,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,N88753,0.004057,-0.039917,-0.008374,0.079142,-0.02023,-0.022213,0.040848,0.001435,0.061829,...,-0.073342,0.014616,0.055785,0.024076,0.026349,-0.052182,0.046934,-0.02492,0.071699,-0.036128
1,N45436,0.011154,0.002282,0.033238,-0.00608,-0.011816,-0.007698,0.022015,-0.016496,-0.016267,...,0.023262,-0.001199,-0.026635,-0.059118,0.036749,-0.082116,0.052,-0.039464,0.016651,-0.015847
2,N23144,-0.013597,-0.009758,0.01712,-0.051993,0.037963,0.045238,0.077176,-0.033402,0.032126,...,-0.085929,-0.059981,0.004588,-0.028985,-0.059973,-0.035562,0.106053,-0.10042,0.051723,0.001144
3,N86255,0.063956,0.003075,-0.005839,-0.026747,0.027664,-0.01637,-0.012593,0.0614,-0.040613,...,0.048834,0.039181,-0.076808,-0.04662,0.084239,-0.071485,0.011918,-0.017153,-0.010607,-0.005017
4,N93187,-0.065324,-0.088163,-0.015203,-0.031949,0.091263,-0.228807,-0.005629,-0.199308,0.158042,...,-0.147075,-0.088895,-0.102027,-0.058871,-0.156666,-0.099688,0.020397,-0.086153,-0.046413,0.066968
5,N75236,0.003752,-0.061771,-0.037073,0.02677,-0.090658,0.012813,-0.092285,0.074664,-0.066141,...,-0.007802,0.051335,0.028594,-0.113466,-0.007955,-0.051726,0.01268,-0.058242,-0.035847,0.010902
6,N99744,-0.002881,0.004462,0.021646,-0.031915,0.01663,0.010055,-0.019161,-0.021661,-0.018657,...,-0.02565,0.00089,-0.033017,-0.041724,0.022862,-0.059798,0.030373,-0.070557,0.033943,0.026709
7,N5771,-0.095814,-0.135361,0.107758,-0.050339,-0.115539,-0.206377,0.055279,-0.019078,-0.051827,...,-0.079915,0.103355,0.06308,-0.049345,-0.117866,-0.096519,0.127443,-0.081883,-0.116518,0.063741
8,N124534,0.048373,-0.033677,-0.054055,0.031752,-0.025667,-0.016133,0.004332,0.080615,-0.081341,...,-0.035274,0.053175,-0.053105,-0.061278,0.180822,-0.109324,0.012499,-0.01182,0.018744,0.087919
9,N51947,-0.011671,-0.040334,-0.030651,-0.072773,0.028622,0.047524,-0.059179,-0.052514,-0.003913,...,0.046963,-0.076553,0.032888,0.014859,-0.047316,-0.076394,0.052694,-0.056931,0.036251,0.085739


In [187]:
lookupTable = createValidArticlesLookupTable(newsTSV, embeddingsDF)

0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 

In [None]:
behaviorDF = createBehaviorDataframe(behaviorTSV, lookupTable)
behaviorDF

In [193]:
embeddingsDF.to_csv("MINDlarge_train/embeddingsDF.csv", index=False)

In [194]:
behaviorDF.to_csv("MINDlarge_train/behaviorDF.csv", index=False)

In [None]:
#newsDF.to_csv("MINDlarge_train/newsDF.csv", index=False)