In [None]:
!wget https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip
!unzip /content/MINDsmall_train.zip -d mind_small_train

In [2]:
import pandas as pd

df_news = pd.read_csv("/content/mind_small_train/news.tsv", sep='\t', names=['News ID','Category', 'SubCategory', 'Title', 'Abstract','Url','Title Entities','Abstract Entites'])
df_users = pd.read_csv("/content/mind_small_train/behaviors.tsv", sep='\t', names=['Impression ID','User ID', 'Time', 'History', 'Impressions'])

In [3]:
from tqdm import tqdm
import json


user_category_counter = {}
user_subcategory_counter = {}
user_entity_counter = {}

all_categories = set()
all_subcategories = set()
all_entities = set()

user_news_mapping = {}
for index,row in tqdm(df_users.iterrows(), total=len(df_users.index)):
  user_id = row['User ID']
  all_clicked_news = user_news_mapping.get(user_id, set())
  if row['History'] == row['History']:
    for news_id in row['History'].split(' '):
      all_clicked_news.add(news_id)
  for impression in row['Impressions'].split(' '):
    if impression.endswith("-1"):
      all_clicked_news.add(impression.replace("-1", ""))
  user_news_mapping[user_id] = all_clicked_news

for user_id,all_clicked_news in tqdm(user_news_mapping.items(), total=len(user_news_mapping.keys())):
  user_category_counter[user_id] = user_category_counter.get(user_id, {})
  user_subcategory_counter[user_id] = user_subcategory_counter.get(user_id, {})
  user_entity_counter[user_id] = user_entity_counter.get(user_id, {})

  clicked_news_df = df_news[df_news['News ID'].isin(all_clicked_news)]
  clicked_categories = clicked_news_df['Category'].value_counts()
  clicked_subcategories = clicked_news_df['SubCategory'].value_counts()

  for key, value in clicked_categories.items():
    user_category_counter[user_id][key] = user_category_counter[user_id].get(key, 0) + value
    all_categories.add(key)

  for key, value in clicked_subcategories.items():
    user_subcategory_counter[user_id][key] = user_subcategory_counter[user_id].get(key, 0) + value
    all_subcategories.add(key)

  for entity_column in clicked_news_df['Title Entities']:
    if entity_column != entity_column:
      continue
    entities = json.loads(entity_column)
    for entity in entities:
      entity = entity['Label']
      current_count = user_entity_counter[user_id].get(entity, 0) + 1
      user_entity_counter[user_id][entity] = current_count
      all_entities.add(entity)

#print(user_category_counter)
#print(user_entity_counter)

100%|██████████| 156965/156965 [00:13<00:00, 11623.61it/s]
100%|██████████| 50000/50000 [04:15<00:00, 195.69it/s]


In [4]:
def make_dataframe(user_data, all_columns, sparse=False):
  if sparse:
    user_category_counter_data = {'user': [], 'tags': []}
    for user, cats in user_data.items():
      user_category_counter_data['user'].append(user)
      user_category_counter_data['tags'].append(";".join(cats.keys()))
    categories_df = pd.DataFrame(data=user_category_counter_data)
    return categories_df
  user_category_counter_data = {'user': []}
  for cat in all_columns:
    user_category_counter_data[cat] = []

  for user, cats in user_data.items():
    for cat in user_category_counter_data.keys():
      user_category_counter_data[cat].append(0)
    user_category_counter_data['user'][-1] = user
    for cat, count in cats.items():
      user_category_counter_data[cat][-1] = count

  categories_df = pd.DataFrame(data=user_category_counter_data)
  return categories_df

df_categories = make_dataframe(user_category_counter, all_categories)
df_categories.to_csv("user_category_frequency.csv")

df_subcategories = make_dataframe(user_subcategory_counter, all_subcategories)
df_subcategories.to_csv("user_subcategory_frequency.csv")

df_entities = make_dataframe(user_entity_counter, all_entities, sparse=True)
df_entities.to_csv("user_entity_frequency.csv")

In [None]:
df_categories.head()

Unnamed: 0,user,movies,foodanddrink,kids,finance,music,sports,lifestyle,weather,news,health,entertainment,travel,video,tv,northamerica,middleeast,autos
0,U13740,1,0,0,0,0,3,1,0,4,0,0,1,0,2,0,0,0
1,U91836,1,4,0,8,1,3,6,2,60,1,0,7,2,0,0,0,0
2,U73700,0,1,0,1,0,6,6,1,2,0,0,0,0,1,0,0,1
3,U34670,0,0,0,2,1,4,1,0,1,0,0,0,1,4,0,0,0
4,U8125,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,2


In [None]:
df_subcategories.head()

Unnamed: 0,user,finance-healthcare,autoshybrids,newsvideo,causes,causes-poverty,retirement,middleeast-top-stories,autosownership,musicvideos,...,basketball_wnba,sports,baseball_mlb_videos,causes-military-appreciation,lifestylemindandsoul,tv-recaps,popculture,ads-latingrammys,travel-points-rewards,elections-2020-us
0,U13740,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,U91836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,U73700,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,U34670,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,U8125,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_entities.head()

Unnamed: 0,user,tags
0,U13740,Joe Biden;South Carolina;New York Yankees;Hous...
1,U91836,Cincinnati Public Schools;Democratic Republic ...
2,U73700,United States;United Kingdom;LeGarrette Blount...
3,U34670,Jim Farmer;National Basketball Association;Was...
4,U8125,Jerusalem
