In [1]:
import sys
sys.path.append('/Users/isabelcorpus/.pyenv/versions/3.9.0/lib/python3.9/site-packages')
import pandas as pd
import numpy as np 
import matplotlib as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from analysis_helper import *
import torch
import json
import seaborn as sns
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torcheval.metrics import MulticlassAccuracy
from sentence_transformers import SentenceTransformer

In this notebook we complete the following tasks: 
- Load data from .vec and .tsv files 
- Process data for use 
- Create mean-pooled Wikidata knowledge-graph entity embeddings (100d) at the news-item level
- Generate news title embeddings using a pretrained SBERT model (384d)
- Predict news category with three simple networks of similar architecture, so that we can compare performance when the input is the entity embeddings (100d), the SBERT embeddings (384d), or a concatenation of both (484d).

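I find that the third case (concatenated input) achieves the best test accuracy: about 0.77 in the run recorded below, versus about 0.75 for SBERT embeddings alone and about 0.65 for entity embeddings alone. This suggests that the entity embeddings carry information beyond what the SBERT sentence embeddings capture. In future work, it would be interesting to explore performance by category, or to identify the linguistic features and Wikidata entities that contribute most heavily to each prediction.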

In [2]:
# Prepare data:
# load files, normalize embeddings, preprocess for analysis and modeling

# Single place to repoint the notebook at a different copy of the dataset.
DATA_DIR = "data/MINDsmall_train"
# The file is read as headerless: column names are supplied explicitly.
NEWS_COLUMNS = ["news_id", "category", "subcategory", "title", "abstract", "url",
                "title_entities", "abstract_entities"]

news = pd.read_csv(f"{DATA_DIR}/news.tsv", sep='\t', names=NEWS_COLUMNS)
# process_tsv is a project helper; presumably it parses the two entity columns
# into Python objects -- TODO confirm against analysis_helper.
news = process_tsv(news, ['title_entities', 'abstract_entities'])

# Each .vec file yields a pair (ids, vectors) from the project loader.
entity_id, entity_vec = load_embeddings(f"{DATA_DIR}/entity_embedding.vec")
relation_id, relation_vec = load_embeddings(f"{DATA_DIR}/relation_embedding.vec")

# Only the entity vectors are normalized; relation vectors are left as loaded.
entity_vec = normalize(entity_vec)

# extract WikiData Knowledge Graph entity IDs for title and abstract
news['title_entity_ids'] = extract_entity_list(news, 'title_entities', 'WikidataId')
news['abstract_entity_ids'] = extract_entity_list(news, 'abstract_entities', 'WikidataId')

# mean pool entity embeddings to create news embeddings
# (only the title entities are pooled; the abstract entity ids are not used here)
news_embeddings = mean_pooled_news_embeddings(entity_vec, entity_id, news, "title_entity_ids")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col_i].fillna('{}', inplace=True)


In [4]:
# Generate sentence-BERT embeddings for every news title with a pretrained model.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# encode() takes a list of strings. astype(str) coerces any non-string cell
# (e.g. a missing title) to text instead of failing inside the tokenizer, and
# tolist() avoids building a fixed-width unicode array padded to the longest title.
title_embeddings = sbert.encode(news.title.astype(str).tolist())

# join embeddings with news metadata
# One row per title, in the same order as `news`; each cell holds one embedding vector.
title_embedding_df = pd.DataFrame(zip(news.news_id, title_embeddings), columns=["news_id", "title_embedding"])
# Inner join: only articles present in both frames are kept.
news_title_entity = pd.merge(title_embedding_df, news_embeddings, how="inner", on="news_id")

In [5]:
# Recode the category field as an integer label, then build the Dataset and
# DataLoaders for the entity-embedding model.
news_embeddings['Numerical_Category'] = cat_to_int(news_embeddings, "category")

# Per-class loss weights to account for imbalance: weight = 1 - (share of rows in the class).
# sort_index() orders the weights by integer label; this assumes cat_to_int yields
# contiguous labels 0..K-1 so that position c is the weight of class c -- TODO confirm.
class_share = news_embeddings.Numerical_Category.value_counts().sort_index() / len(news_embeddings)
weights = torch.tensor(list(1 - class_share))

# Dataset in which each X is a pooled entity embedding and y the news category.
news_dataset = NewsDataset(np.vstack(news_embeddings.entity_vec), news_embeddings['Numerical_Category'])

# Fix the torch RNG so shuffling (and any torch-based splitting) is repeatable
# under Restart & Run All. Use the same seed before every split in the notebook.
# NOTE(review): test_training_split is a project helper; if it draws from a
# different RNG (numpy / random), seed that one as well.
torch.manual_seed(42)

# split 0.8 training, 0.2 test
train_dataset, test_dataset = test_training_split(news_dataset, 0.8)
loader = DataLoader(train_dataset, shuffle=True, batch_size=10)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
test_loader = DataLoader(test_dataset, shuffle=False)

In [32]:
# Baseline experiment: simple sequential NN that predicts the news category
# from the mean-pooled entity embeddings only.
model = multiclass_classifier()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Class-weighted cross-entropy, using the weights computed for the imbalance.
loss_fn = nn.CrossEntropyLoss(weight=weights)

# The 4th argument (20) is presumably the number of epochs -- the log prints one line per epoch.
trained_model, train_acc, train_loss = train_model(
    model, loss_fn, optimizer, 20, loader
)
# Last expression of the cell, so the held-out result is displayed.
validate_model(trained_model, loss_fn, test_loader)

epoch: 0, training loss: 6918.407646417618, training accuracy: 0.5401629209518433
epoch: 1, training loss: 6715.028743863106, training accuracy: 0.6223741173744202
epoch: 2, training loss: 6670.079519748688, training accuracy: 0.6316994428634644
epoch: 3, training loss: 6638.904370546341, training accuracy: 0.6391395330429077
epoch: 4, training loss: 6616.911562800407, training accuracy: 0.6447616219520569
epoch: 5, training loss: 6596.905938267708, training accuracy: 0.6483638286590576
epoch: 6, training loss: 6578.543019890785, training accuracy: 0.6502491235733032
epoch: 7, training loss: 6566.764093637466, training accuracy: 0.6522690653800964
epoch: 8, training loss: 6552.1642454862595, training accuracy: 0.6546593308448792
epoch: 9, training loss: 6540.041749000549, training accuracy: 0.6555683016777039
epoch: 10, training loss: 6530.636615276337, training accuracy: 0.6564099192619324
epoch: 11, training loss: 6520.210317611694, training accuracy: 0.6570832133293152
epoch: 12, tr

(tensor(0.6478), 16258.88407254219)

In [33]:
# Use SBERT embeddings as input to NN, compare results
news_title_entity['Numerical_Category'] = cat_to_int(news_title_entity, "category")
# X is the SBERT title embedding, y the integer category label.
news_title_entity_ds = NewsDataset(np.vstack(news_title_entity.title_embedding), news_title_entity['Numerical_Category'])

# Fix the torch RNG before splitting so the split and shuffling are repeatable.
# Use the same seed before every split in the notebook so that models trained
# on the same rows can be scored on the same held-out rows.
# NOTE(review): this only pins the split if test_training_split draws from
# torch's global RNG -- confirm in analysis_helper.
torch.manual_seed(42)

sbert_train_dataset, sbert_test_dataset = test_training_split(news_title_entity_ds, 0.8)
sbert_loader = DataLoader(sbert_train_dataset, shuffle=True, batch_size=10)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
sbert_test_loader = DataLoader(sbert_test_dataset, shuffle=False)

In [34]:
# SBERT experiment: same training recipe as the entity-only model, fed the
# title embeddings instead. It outperforms the mean-pooled entity embeddings.
model = multiclass_classifier_title()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Class-weighted cross-entropy, reusing the weights computed for the imbalance.
loss_fn = nn.CrossEntropyLoss(weight=weights)

# The 4th argument (20) is presumably the number of epochs -- the log prints one line per epoch.
trained_model_sbert, train_acc, train_loss = train_model(
    model, loss_fn, optimizer, 20, sbert_loader
)
# Last expression of the cell, so the held-out result is displayed.
validate_model(trained_model_sbert, loss_fn, sbert_test_loader)

epoch: 0, training loss: 6477.412103056908, training accuracy: 0.6004241704940796
epoch: 1, training loss: 6238.686894536018, training accuracy: 0.7168731689453125
epoch: 2, training loss: 6199.517315745354, training accuracy: 0.7342782020568848
epoch: 3, training loss: 6173.627158045769, training accuracy: 0.7369041442871094
epoch: 4, training loss: 6151.771329283714, training accuracy: 0.7405399680137634
epoch: 5, training loss: 6133.178672194481, training accuracy: 0.7431322336196899
epoch: 6, training loss: 6117.650912761688, training accuracy: 0.7486533522605896
epoch: 7, training loss: 6102.00263261795, training accuracy: 0.748788058757782
epoch: 8, training loss: 6087.744012594223, training accuracy: 0.7533329129219055
epoch: 9, training loss: 6074.91376376152, training accuracy: 0.7537368535995483
epoch: 10, training loss: 6063.97824037075, training accuracy: 0.7557231187820435
epoch: 11, training loss: 6053.770397186279, training accuracy: 0.7590223550796509
epoch: 12, trainin

(tensor(0.7508), 15643.966765522957)

In [35]:
# Third input variant: for each article, the SBERT title embedding followed by
# the pooled entity vector, joined column-wise into one feature row.
concat_embeddings = np.hstack([
    np.vstack(news_title_entity.title_embedding),
    np.vstack(news_title_entity.entity_vec),
])


In [36]:
# Predict category from the concatenated embeddings (SBERT title + pooled entity).
# In the recorded run this gives the best training and test accuracy of the three inputs.
# NOTE: this rebinds news_title_entity_ds, which earlier held the SBERT-only dataset.
news_title_entity_ds = NewsDataset(concat_embeddings, news_title_entity['Numerical_Category'])

# Fix the torch RNG before splitting so the split and shuffling are repeatable.
# Use the same seed before every split in the notebook so this model and the
# SBERT-only model can be scored on the same held-out rows.
# NOTE(review): this only pins the split if test_training_split draws from
# torch's global RNG -- confirm in analysis_helper.
torch.manual_seed(42)

concat_train_dataset, concat_test_dataset = test_training_split(news_title_entity_ds, 0.8)
concat_loader = DataLoader(concat_train_dataset, shuffle=True, batch_size=10)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
concat_test_loader = DataLoader(concat_test_dataset, shuffle=False)

model = multiclass_classifier_concat()
# Class-weighted cross-entropy, reusing the weights computed for the imbalance.
loss_fn = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# The 4th argument (20) is presumably the number of epochs -- the log prints one line per epoch.
trained_model_concat, train_acc, train_loss = train_model(model, loss_fn, optimizer, 20, concat_loader)
# Last expression of the cell, so the held-out result is displayed.
validate_model(trained_model_concat, loss_fn, concat_test_loader)

epoch: 0, training loss: 6407.796445846558, training accuracy: 0.6763062477111816
epoch: 1, training loss: 6201.842008829117, training accuracy: 0.7361298203468323
epoch: 2, training loss: 6158.539595723152, training accuracy: 0.75026935338974
epoch: 3, training loss: 6129.76941382885, training accuracy: 0.7564637660980225
epoch: 4, training loss: 6106.763685464859, training accuracy: 0.7617155909538269
epoch: 5, training loss: 6088.338180184364, training accuracy: 0.7645098567008972
epoch: 6, training loss: 6071.910728812218, training accuracy: 0.767876386642456
epoch: 7, training loss: 6057.455435872078, training accuracy: 0.7709062695503235
epoch: 8, training loss: 6045.088632583618, training accuracy: 0.7751481533050537
epoch: 9, training loss: 6033.756937980652, training accuracy: 0.7762254476547241
epoch: 10, training loss: 6025.061386466026, training accuracy: 0.7796929478645325
epoch: 11, training loss: 6014.832895755768, training accuracy: 0.7798949480056763
epoch: 12, trainin

(tensor(0.7731), 15520.653519511223)