In [23]:
# from newsrec.config import model_name
import pandas as pd
import swifter
import json
import math
from tqdm import tqdm
from os import path
from pathlib import Path
import random
from nltk.tokenize import word_tokenize
import numpy as np
import csv
import importlib
from transformers import RobertaTokenizer, RobertaModel
import torch

In [3]:
# File locations under the training data directory. `source` is the table read
# below and `user2int_path` receives the user-index mapping; `target` is
# presumably the parsed output (it is not written in the cells visible here).
train_dir = '../../data/train'
source, target, user2int_path = (
    path.join(train_dir, file_name)
    for file_name in ('behaviors.tsv', 'behaviors_parsed.tsv', 'user2int.tsv')
)

In [6]:
# Load the behaviors table. The file is read without a header row, so the
# column names are supplied explicitly.
behaviors = pd.read_table(
    source,
    header=None,
    names=['impression_id', 'user', 'time', 'clicked_news', 'impressions'])
# Replace a missing click history with a single space. The result is assigned
# back instead of calling `fillna(..., inplace=True)` on the attribute: the
# chained in-place form acts on an intermediate Series, warns on pandas 2.x and
# leaves `behaviors` unchanged under Copy-on-Write.
behaviors['clicked_news'] = behaviors['clicked_news'].fillna(' ')
# Split the whitespace-separated impression string into a list of tokens.
behaviors['impressions'] = behaviors['impressions'].str.split()

In [12]:
# Show the first five rows of `behaviors` (five is the `head` default).
behaviors.head(n=5)

Unnamed: 0,impression_id,user,time,clicked_news,impressions
0,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N78206-0, N26368-0, N7578-0, N58592-0, N19858..."
1,2,2,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,"[N47996-0, N82719-0, N117066-0, N8491-0, N1237..."
2,3,3,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,"[N103852-0, N53474-0, N127836-0, N47925-1]"
3,4,4,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,"[N38902-0, N76434-0, N71593-0, N100073-0, N108..."
4,5,5,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,"[N76209-0, N48841-0, N67937-0, N62235-0, N6307..."


In [9]:
# Give every distinct user a 1-based integer, numbered in order of first
# appearance in the `user` column.
user2int = {}
for user in behaviors['user']:
    if user in user2int:
        continue
    user2int[user] = len(user2int) + 1

# Save the mapping as a tab-separated table with columns `user` and `int`.
user_index_table = pd.DataFrame(list(user2int.items()), columns=['user', 'int'])
user_index_table.to_csv(user2int_path, sep='\t', index=False)

In [10]:
# Report the number of distinct users so `num_users` can be set to match
# (the message asks for one more than the count).
num_distinct_users = len(user2int)
print(f'Please modify `num_users` in `src/config.py` into 1 + {num_distinct_users}')

Please modify `num_users` in `src/config.py` into 1 + 711222


In [11]:
# Replace each user id with its integer index from `user2int`.
# A single vectorized `map` replaces the per-row `.at` writes, which were slow
# on a large frame and mutated `behaviors` while iterating over it.
user_index = behaviors['user'].map(user2int)
unmapped = user_index.isna()
if unmapped.any():
    # `map` yields NaN for keys missing from the dict; raise instead, as the
    # plain `user2int[...]` lookup would (e.g. when this cell is run twice).
    raise KeyError(behaviors.loc[unmapped, 'user'].iloc[0])
behaviors['user'] = user_index.astype('int64')

In [24]:
# Number of non-clicked candidates grouped with every clicked candidate.
NEGATIVES_PER_POSITIVE = 2
# Fixed seed so the negative sampling is reproducible from a fresh kernel.
SAMPLING_SEED = 42


def _sample_pairs(impressions, rng, negatives_per_positive=NEGATIVES_PER_POSITIVE):
    """Group one row's impressions into `[positive, negative, ...]` samples.

    Args:
        impressions: list of impression tokens; tokens ending in '1' are
            treated as positives and tokens ending in '0' as negatives.
        rng: `random.Random` instance used to shuffle the negatives.
        negatives_per_positive: how many negatives follow each positive.

    Returns:
        A list of lists, each holding one positive followed by
        `negatives_per_positive` negatives. Positives keep their original
        order. Grouping stops once positives run out or too few negatives
        remain for a full group, so partial groups are never emitted.

    Raises:
        ValueError: if `negatives_per_positive` is smaller than 1.
    """
    if negatives_per_positive < 1:
        raise ValueError('negatives_per_positive must be at least 1')
    positives = [x for x in impressions if x.endswith('1')]
    negatives = [x for x in impressions if x.endswith('0')]
    rng.shuffle(negatives)
    group_count = min(len(positives), len(negatives) // negatives_per_positive)
    return [
        [positives[i]] + negatives[i * negatives_per_positive:
                                   (i + 1) * negatives_per_positive]
        for i in range(group_count)
    ]


# A dedicated generator keeps the sampling independent of the global `random`
# state touched elsewhere in the session.
sampling_rng = random.Random(SAMPLING_SEED)
# Build every row's groups first and assign the column once, rather than
# writing cell-by-cell into the frame that is being iterated.
behaviors['impressions'] = pd.Series(
    [
        _sample_pairs(impressions, sampling_rng)
        for impressions in tqdm(behaviors['impressions'],
                                total=len(behaviors),
                                desc="Balancing data")
    ],
    index=behaviors.index,
    dtype=object,
)

Balancing data: 2232748it [02:52, 12961.09it/s]


In [25]:
# Display the first five rows of `behaviors` (five is the `head` default).
behaviors.head(n=5)

Unnamed: 0,impression_id,user,time,clicked_news,impressions
0,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[[N94157-1, N40282-0, N78206-0], [N78699-1, N3..."
1,2,2,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,"[[N25587-1, N108379-0, N20780-0], [N36266-1, N..."
2,3,3,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,"[[N47925-1, N103852-0, N53474-0]]"
3,4,4,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,"[[N114935-1, N46262-0, N41710-0]]"
4,5,5,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,"[[N86258-1, N120147-0, N67955-0]]"


In [26]:
# Give each element of the `impressions` lists its own row, drop rows whose
# `impressions` value is missing afterwards, and renumber the index from zero.
behaviors = (
    behaviors
    .explode(column='impressions')
    .dropna(subset=['impressions'])
    .reset_index(drop=True)
)

In [27]:
# Preview the first five rows of `behaviors` (five is the `head` default).
behaviors.head(n=5)

Unnamed: 0,impression_id,user,time,clicked_news,impressions
0,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N94157-1, N40282-0, N78206-0]"
1,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N78699-1, N32954-0, N58258-0]"
2,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N71090-1, N58592-0, N27822-0]"
3,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N31174-1, N97778-0, N26368-0]"
4,2,2,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,"[N25587-1, N108379-0, N20780-0]"


In [28]:
def _ids_and_labels(group):
    """Split 'ID-label' tokens into two space-joined strings: IDs and labels."""
    fields = [token.split('-') for token in group]
    news_ids = ' '.join(parts[0] for parts in fields)
    labels = ' '.join(parts[1] for parts in fields)
    return news_ids, labels


# One (ids, labels) tuple per row, expanded into a two-column frame that is
# written into the new `candidate_news` and `clicked` columns.
split_columns = pd.DataFrame(
    [_ids_and_labels(group) for group in behaviors['impressions']])
behaviors[['candidate_news', 'clicked']] = split_columns

In [29]:
# Look at the first five rows of `behaviors` (five is the `head` default).
behaviors.head(n=5)

Unnamed: 0,impression_id,user,time,clicked_news,impressions,candidate_news,clicked
0,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N94157-1, N40282-0, N78206-0]",N94157 N40282 N78206,1 0 0
1,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N78699-1, N32954-0, N58258-0]",N78699 N32954 N58258,1 0 0
2,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N71090-1, N58592-0, N27822-0]",N71090 N58592 N27822,1 0 0
3,1,1,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,"[N31174-1, N97778-0, N26368-0]",N31174 N97778 N26368,1 0 0
4,2,2,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,"[N25587-1, N108379-0, N20780-0]",N25587 N108379 N20780,1 0 0
