# Feature Engineering + XGBoost

In [137]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from xgboost import XGBClassifier as xgb
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

## Import Data

In [138]:
# change directory if data is in a different folder
import os
os.chdir("C:/Users/tanmb/Downloads/SJSU/CMPE_256/project/project_data")

In [139]:
train_clicked = pd.read_csv('full_train_clicked.csv')
train_not_clicked = pd.read_csv('full_train_not_clicked.csv')
val_clicked = pd.read_csv('full_val_clicked.csv')
val_not_clicked = pd.read_csv('full_val_not_clicked.csv')

news_data = pd.read_csv('full_news.csv')

In [140]:
train = pd.concat([train_clicked, train_not_clicked]).sort_values(by = ['user_id', 'timestamp'])
val = pd.concat([val_clicked, val_not_clicked]).sort_values(by = ['user_id', 'timestamp'])

## Data Exploration

In [None]:
train_users = train['user_id'].unique()
val_users = val['user_id'].unique()

intersection = list(set(train_users).intersection(val_users))
len(intersection)

In [None]:
print("Shape: ", train.shape)
print("Duplicates:", train.duplicated().sum())
print("Missing values:", train.isna().sum().sum())
train.head()

In [None]:
print("Shape: ", val.shape)
print("Duplicates:", val.duplicated().sum())
print("Missing values:", val.isna().sum().sum())
val.head()

In [None]:
print("Duplicate articles:", news_data.duplicated().sum())
print("Duplicate titles:", news_data['title'].duplicated().sum())
print("Duplicate abstracts:", news_data['abstract'].duplicated().sum())
print("Empty titles:", news_data['title'].isna().sum())
print("Empty abstracts:", news_data['abstract'].isna().sum())
news_data.head()

In [9]:
news_data['abstract'] = news_data['abstract'].fillna('')

In [None]:
type(news_data['title_entities'][0])

In [None]:
news_data[news_data['abstract_entities'].apply(lambda x: x == '[]')]

## Data Preprocessing

### One-Hot Encode Categories and Subcategories
We want to first encode the categories and subcategories. This would result in 288 sparse columns. So, we perform PCA to reduce dimensions for our model to work with.

In [142]:
embedding_dim = news_data['category'].nunique()

# one hot encode category and subcategory
encoder = OneHotEncoder()
cat_subcat = encoder.fit_transform(news_data[['category', 'subcategory']])

# reduce dimension
cat_pca = PCA(n_components = embedding_dim, random_state = 33)
cat_subcat_reduced = cat_pca.fit_transform(cat_subcat)

# store as dataframe
enc_categories = pd.DataFrame(cat_subcat_reduced, columns=[f'cat_subcat_{i}' for i in range(embedding_dim)])

In [143]:
metadata = pd.concat([news_data['news_id'], enc_categories], axis = 1)
metadata.head()

Unnamed: 0,news_id,cat_subcat_0,cat_subcat_1,cat_subcat_2,cat_subcat_3,cat_subcat_4,cat_subcat_5,cat_subcat_6,cat_subcat_7,cat_subcat_8,cat_subcat_9,cat_subcat_10,cat_subcat_11,cat_subcat_12,cat_subcat_13,cat_subcat_14,cat_subcat_15,cat_subcat_16,cat_subcat_17
0,88753,-0.018882,-0.488202,0.010104,-0.091991,0.031561,-0.058868,0.01454,0.152775,0.834534,-0.2097,-0.013646,-0.074151,4e-06,-0.00088,-0.003836,-0.001757,-0.001577,-0.016354
1,45436,0.659947,0.14515,-0.437201,0.128645,-0.015542,0.015045,-0.002529,-0.012732,-0.009378,0.052221,-0.308645,-0.003905,-3e-06,0.001065,0.730545,-0.00378,-0.543145,-0.021678
2,23144,-0.017946,-0.432692,0.006179,-0.053802,0.016998,-0.02848,0.006388,0.052576,0.092527,0.268533,0.025262,0.893187,0.01452,-0.01299,-0.037335,-0.012409,-0.009619,-0.133298
3,86255,-0.018313,-0.451082,0.006822,-0.059627,0.018955,-0.031971,0.007205,0.059871,0.107289,0.327346,0.031095,1.146309,0.018963,-0.017333,-0.050648,-0.017341,-0.014158,-0.215537
4,93187,0.655141,0.142849,-0.41762,0.122585,-0.014749,0.014213,-0.002381,-0.011899,-0.00863,0.045309,-0.263694,-0.00304,-2e-06,0.000704,0.425983,-0.001346,0.836732,-0.042007


### Titles and Abstract Embeddings

In [None]:
avg_title_len = news_data['title'].str.split().str.len().mean() 
avg_abstract_len = news_data['abstract'].str.split().str.len().mean()

print("Average title length:", avg_title_len)
print("Average abstract length:", avg_abstract_len)

In [144]:
# find embeddings for title and abstract together
news_data['title'] = news_data['title'].fillna('')
news_data['abstract'] = news_data['abstract'].fillna('')
descriptions = news_data['title'].str.cat(news_data['abstract'], sep = ' ').tolist()

title_transformer = SentenceTransformer('all-MiniLM-L6-v2')
title_abs_emb = title_transformer.encode(descriptions, show_progress_bar=True)

Batches:   0%|          | 0/3255 [00:00<?, ?it/s]

In [145]:
title_emb_dim = 50
#abstract_emb_dim = 5

# reduce dimensions
title_abs_pca = PCA(n_components = title_emb_dim, random_state = 33)
title_abs_emb_reduced = title_abs_pca.fit_transform(title_abs_emb)

# store as dataframe
enc_title_abs = pd.DataFrame(title_abs_emb_reduced, columns=[f'title_emb_{i}' for i in range(title_emb_dim)])

In [146]:
# add to news_data df
news_metadata = pd.concat([metadata, enc_title_abs], axis = 1)
news_metadata.head()

Unnamed: 0,news_id,cat_subcat_0,cat_subcat_1,cat_subcat_2,cat_subcat_3,cat_subcat_4,cat_subcat_5,cat_subcat_6,cat_subcat_7,cat_subcat_8,...,title_emb_40,title_emb_41,title_emb_42,title_emb_43,title_emb_44,title_emb_45,title_emb_46,title_emb_47,title_emb_48,title_emb_49
0,88753,-0.018882,-0.488202,0.010104,-0.091991,0.031561,-0.058868,0.01454,0.152775,0.834534,...,-0.00562,-0.115405,-0.002657,0.108949,-0.07159,-0.056999,0.053448,0.066973,-0.016452,-0.06142
1,45436,0.659947,0.14515,-0.437201,0.128645,-0.015542,0.015045,-0.002529,-0.012732,-0.009378,...,0.020287,-0.047038,-0.05109,0.032425,-0.017206,-0.072582,-0.009733,-0.007023,0.167145,-0.13234
2,23144,-0.017946,-0.432692,0.006179,-0.053802,0.016998,-0.02848,0.006388,0.052576,0.092527,...,0.097464,-0.040598,0.040232,-0.068787,0.115315,0.034783,-0.013779,-0.089367,-0.092419,-0.088302
3,86255,-0.018313,-0.451082,0.006822,-0.059627,0.018955,-0.031971,0.007205,0.059871,0.107289,...,0.105761,0.054931,0.046098,0.037243,-0.027349,0.029842,-0.021076,0.072479,-0.122242,0.06613
4,93187,0.655141,0.142849,-0.41762,0.122585,-0.014749,0.014213,-0.002381,-0.011899,-0.00863,...,-0.021612,-0.060522,-0.011518,0.061675,0.05614,0.030998,0.106994,-0.075153,-0.102072,0.039118


### Sentiment Analysis - decreased score

In [100]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis")

def get_sentiment(text) :
    if len(text) == 0:
        return 0
    n = (len(text) // 512) + 1
    words = text.split()
    words_per_piece = (len(words) // n) + 1
    pieces = [words[i:i+words_per_piece] for i in range(0, len(words), words_per_piece)]
    text = [" ".join(piece) for piece in pieces]
        
    result = classifier(text)

    scores = [r['score'] if r['label'] == 'POSITIVE' else -r['score'] for r in result]

    return np.mean(scores)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [101]:
news_data['title'] = news_data['title'].fillna('')
news_data['abstract'] = news_data['abstract'].fillna('')

In [102]:
news_data['title_sentiment'] = news_data['title'].apply(get_sentiment)

In [103]:
news_data['abstract_sentiment'] = news_data['abstract'].apply(get_sentiment)

In [115]:
news_metadata = pd.concat([metadata, enc_title_abs, news_data['title_sentiment']], axis = 1)

### User Click History - also useless

In [105]:
# all_timestamps = pd.concat([train[['user_id', 'timestamp']], val[['user_id', 'timestamp']]], ignore_index = True).sort_values(by = ['timestamp', 'user_id']).drop_duplicates().reset_index(drop=True)
# all_timestamps.head()

In [106]:
# all_clicked = pd.concat([train_clicked, val_clicked], ignore_index = True).sort_values(by = ['timestamp', 'user_id']).drop(columns = 'impression').reset_index(drop = True)
# clicked_metadata = all_clicked.merge(news_metadata, on = 'news_id', how = 'left').drop(columns = [col for col in news_metadata.columns if col.startswith('cat_subcat_')])
# clicked_metadata.head()

In [107]:
# history_emb = []
# for _, row in all_timestamps.iterrows() :
#     user_id = row['user_id']
#     timestamp = row['timestamp']
#     past_clicks = clicked_metadata[(clicked_metadata['user_id'] == user_id) & (clicked_metadata['timestamp'] < timestamp)].drop(columns = ['user_id', 'timestamp', 'news_id'])

#     history = past_clicks.mean().fillna(0)
#     history_emb.append(history)

# history_emb[-10:]

In [108]:
# history_emb = pd.DataFrame(history_emb)

In [109]:
# timestamps_with_history = pd.concat([all_timestamps, history_emb], axis = 1)
# timestamps_with_history.head()

### Embeddings from Dataset - Did not help

In [84]:
# train_entity_emb = pd.read_csv('train_entity_embedding.vec', sep = '\t', header = None).drop(columns = [101])
# train_entity_emb = train_entity_emb.rename(columns = {0: 'entity_id'}).set_index('entity_id')
# train_entity_emb.head()

In [85]:
# val_entity_emb = pd.read_csv('val_entity_embedding.vec', sep = '\t', header = None).drop(columns = [101])
# val_entity_emb = val_entity_emb.rename(columns = {0: 'entity_id'}).set_index('entity_id')
# val_entity_emb.head()

In [86]:
# all_entities_emb = pd.concat([train_entity_emb, val_entity_emb]).drop_duplicates()
# all_entities_emb.head()

In [87]:
# #replacing any NaN values in title_entities and abstract_entities columns with empty list string '[]'
# news_data['title_entities'] = news_data['title_entities'].apply(
#    lambda x: '[]' if pd.isna(x) else x)
# news_data['abstract_entities'] = news_data['abstract_entities'].apply(
#    lambda x: '[]' if pd.isna(x) else x)

In [88]:
# # getting entity ids for titles
# title_wikidata_ids = []

# for i, row in news_data.iterrows():
#     title_entities = json.loads(row['title_entities'])
#     n_ids = [e['WikidataId'] for e in title_entities]     
#     title_wikidata_ids.append(n_ids)

# news_data['title_entities_ids'] = title_wikidata_ids
# news_data.head()

In [89]:
# # getting entity ids for abstracts
# abstract_wikidata_ids = []

# for i, row in news_data.iterrows():
#     abstract_entities = json.loads(row['abstract_entities'])
#     n_ids = [e['WikidataId'] for e in abstract_entities]     
#     abstract_wikidata_ids.append(n_ids)

# news_data['abstract_entities_ids'] = abstract_wikidata_ids
# news_data.head()

In [90]:
# # wikidata id to embedding mapping
# embedding_dict = {
#     entity_id: row.values for entity_id, row in all_entities_emb.iterrows()
# }

In [91]:
# # # calculating entity weights
# from collections import Counter

# # flattening all entity lists and counting how many times each one shows up
# all_entity_ids = news_data['title_entities_ids'].explode().dropna().tolist()
# entity_freq = Counter(all_entity_ids)

# # normalizing to get probabilities
# total = sum(entity_freq.values())
# entity_weights = {e_id: freq / total for e_id, freq in entity_freq.items()}

In [92]:
# # # combines title and abstract embeddings and returns 100dim vector for each article (if it has entities)
# def get_weighted_article_embedding(entity_ids, embedding_dict, weight_dict, dim = 100):
#     vectors = []
#     weights = []
#     for e_id in entity_ids:
#         if e_id in embedding_dict:
#             vectors.append(embedding_dict[e_id])
#             weights.append(weight_dict.get(e_id, 1e-3)) #add small default weight if no weight found
    
#     if not vectors:
#         return np.zeros(dim)
#     return np.average(vectors, axis = 0, weights = weights)

In [93]:
# news_data['title_abstract_embedding'] = news_data.apply(
#     lambda row: get_weighted_article_embedding(
#         (row['title_entities_ids'] or []) + (row['abstract_entities_ids'] or []),
#         embedding_dict, entity_weights),
#     axis=1
# )

In [94]:
# embedding_expanded = news_data['title_abstract_embedding'].apply(pd.Series)
# embedding_expanded.columns = [f'title_abstract_{i}' for i in range(100)]

# embedding_expanded.head()

In [95]:
#news_metadata = pd.concat([metadata, embedding_expanded], axis = 1)

In [96]:
#news_metadata.head()

### Training + Validation Sets

In [147]:
def print_scores(y_act, y_pred, y_score) :
    print("- Confusion Matrix: ",confusion_matrix(y_act, y_pred))
    print("- ROC-AUC: ", roc_auc_score(y_act, y_score))
    print("- Accuracy: ", accuracy_score(y_act, y_pred))
    print("- Precision: ", precision_score(y_act, y_pred))
    print("- Recall: ", recall_score(y_act, y_pred))
    print("- F1: ", f1_score(y_act, y_pred))
    print("- AUC-PR: ", average_precision_score(y_act, y_score))

In [None]:
train_metadata = train.merge(news_metadata, on = 'news_id', how = 'left')
val_metadata = val.merge(news_metadata, on = 'news_id', how = 'left')

In [117]:
# train_metadata_history = train_metadata.merge(timestamps_with_history, on = ['user_id', 'timestamp'], how = 'left')
# val_metadata_history = val_metadata.merge(timestamps_with_history, on = ['user_id', 'timestamp'], how = 'left')

In [None]:
X_train = train_metadata.drop(columns = ['impression'])
y_train = train_metadata['impression']
X_val = val_metadata.drop(columns = ['impression'])

### Handle Imbalanced Dataset

In [None]:
ratio = len(train_not_clicked) / len(train_clicked)

### Initial Model

In [None]:
# initialize model
model = xgb(n_estimators = 50, scale_pos_weight=ratio, eval_metric = 'logloss', random_state = 33)
# fit the model to training set
model.fit(X_train, y_train)

# predict validation set
val_pred = val.copy()
y_score = model.predict_proba(X_val)[:,1]
y_pred = model.predict(X_val)
y_act = val_pred['impression']
print_scores(y_act, y_pred, y_score)

#### With nothing :
- Confusion Matrix: [[2025900  603715]
 [  84120   27263]]
- ROC-AUC:  0.5201805694312953
- Accuracy:  0.7490567304317625
- Precision:  0.04320752863015826
- Recall:  0.24476805257534812
- F1:  0.07344944036661409

#### With categories + subcategories:
- Confusion Matrix:  
[[2058308  571307]
 [  69239   42144]]
- ROC-AUC:  0.6097792532230576
- Accuracy:  0.7663092056251044
- Precision:  0.06869986355878464
- Recall:  0.37837012829605954
- F1:  0.1162859358142692

### With categories + subcategories, title+abstract :

#### With categories + subcategories, title+abstract (25) :
- Confusion Matrix:  [[2201348  428267]
 [  78534   32849]]
- ROC-AUC:  0.6160573335316327
- Accuracy:  0.8151034769087756
- Precision:  0.07123803988584218
- Recall:  0.29491933239363277
- F1:  0.11475653232582066

### With categories + subcategories, title+abstract (50) : best
n_estimators = 200
- Confusion Matrix:  [[2068370  561245]
 [  65302   46081]]
- ROC-AUC:  0.6437394670245854
- Accuracy:  0.7714164694757165
- Precision:  0.07587523010705947
- Recall:  0.41371663539319287
- F1:  0.1282327061439331

n_estimators = 100
- Confusion Matrix:  [[2027765  601850]
 [  62517   48866]]
- ROC-AUC:  0.646595477228097
- Accuracy:  0.7576185754239879
- Precision:  0.07509574069179181
- Recall:  0.43872045105626534
- F1:  0.12824055667308315
#### With categories + subcategories, title+abstract (100) :
- Confusion Matrix:  [[2056468  573147]
 [  65599   45784]]
- ROC-AUC:  0.6438007004436539
- Accuracy:  0.7669659007412628
- Precision:  0.07397270455026489
- Recall:  0.411050160257849
- F1:  0.12538168513817344

#### With categories + subcategories, title+abstract (150) :
- Confusion Matrix:  [[2099019  530596]
 [  71326   40057]]
- ROC-AUC:  0.637131801729021
- Accuracy:  0.7804004234953837
- Precision:  0.07019502219387264
- Recall:  0.359632978102583
- F1:  0.11746300781776915

#### With categories + subcategories, title+abstract (200) :
- Confusion Matrix:  [[2047624  581991]
 [  67830   43553]]
- ROC-AUC:  0.6264987861907454
- Accuracy:  0.7629254016237881
- Precision:  0.06962419909710588
- Recall:  0.3910201736351149
- F1:  0.11820166719362976

#### With categories + subcategories, dataset embeddings (100) :
- Confusion Matrix:  [[2108234  521381]
 [  72734   38649]]
- ROC-AUC:  0.6103668805968359
- Accuracy:  0.7832486561464109
- Precision:  0.06901237433708908
- Recall:  0.3469919107942864
- F1:  0.11512735082579574

#### With categories + subcategories, title+abstract (50), history :
- Confusion Matrix:  [[2108234  521381]
 [  72734   38649]]
- ROC-AUC:  0.6103668805968359
- Accuracy:  0.7832486561464109
- Precision:  0.06901237433708908
- Recall:  0.3469919107942864
- F1:  0.11512735082579574

#### With categories + subcategories, title+abstract, sentiment analysis :
- Confusion Matrix:  [[2083570  546045]
 [  69644   41739]]
- ROC-AUC:  0.6265757744394764
- Accuracy:  0.7753778003486321
- Precision:  0.07101077947000939
- Recall:  0.3747340258387725
- F1:  0.11939636739148157

 Results with sentiment analysis
 Confusion Matrix:  [[1914592  715023]
  [  58135   53248]]
 ROC-AUC:  0.6411829551735185
 Accuracy:  0.7179282874339931
 Precision:  0.06930887668544042
 Recall:  0.478062181841035
 F1:  0.12106578268273663

In [133]:
model = xgb(eval_metric = 'logloss', random_state = 33)
ratio = len(train_not_clicked) / len(train_clicked)
param_dist = {
    'scale_pos_weight': [ratio, ratio*0.5, ratio*2],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.5, 1, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 300],
    'max_delta_step': [1, 5, 10]
}

# Use F1-score as the scoring metric
precision = make_scorer(precision_score)

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=100,  # increase for more thorough search
    scoring=precision,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X_train, y_train)

# Best model and parameters
print("Best Precision score: ", random_search.best_score_)
print("Best parameters:\n", random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


OSError: [WinError 1450] Insufficient system resources exist to complete the requested service

In [None]:
# initialize model
model = xgb(random_search.best_params_)
# fit the model to training set
model.fit(X_train, y_train)

# predict validation set
val_pred = val.copy()
y_score = model.predict_proba(X_val)[:,1]
y_pred = model.predict(X_val)
y_act = val_pred['impression']
print_scores(y_act, y_pred, y_score)

### Evaluation
Using the metrics that the competition used to guage. This allows us to have a general idea of how well our model should perform. The code is gathered from here: https://github.com/msnews/MIND/blob/master/evaluate.py

Ranked 1st scores: AUC: 0.7304 | MRR: 0.3770 | nDCG@5: 0.4180 | nDCG@10: 0.4718

Ranked 10th scores: AUC: 0.7243	| MRR: 0.3706 | nDCG@5: 0.4101 | nDCG@10: 0.4644

Ranked 50th Scores: AUC: 0.7059	| MRR: 0.3514 | DCG@5: 0.3847 | nDCG@10: 0.4418

In [None]:
# change to easily run evaluation on any model
impressions = val_pred.groupby(by = ['user_id', 'timestamp'])
all_y_true = (impressions['impression'].apply(list).reset_index())['impression']
all_y_score = (impressions['pred_impression'].apply(list).reset_index())['pred_impression']

In [None]:
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)
    

def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)

In [None]:
auc_list = []
mrr_list = []
ndcg5_list = []
ndcg10_list = []

for y_true, y_score in zip(all_y_true, all_y_score):
    if sum(y_true) == 0:
        # No clicked article — skip this impression
        continue
    
    auc_list.append(roc_auc_score(y_true, y_score))
    mrr_list.append(mrr_score(y_true, y_score))
    ndcg5_list.append(ndcg_score(y_true, y_score, k=5))
    ndcg10_list.append(ndcg_score(y_true, y_score, k=10))

# Average metrics
print("AUC: ", sum(auc_list) / len(auc_list))
print("MRR: ", sum(mrr_list) / len(mrr_list))
print("nDCG@5: ", sum(ndcg5_list) / len(ndcg5_list))
print("nDCG@10: ", sum(ndcg10_list) / len(ndcg10_list))

### best scores : 
cat+subcat embeddings (18 dim), title embeddings (384)
- AUC:  0.6025434736909249
- MRR:  0.2611942541800999

cat+subcat, title_embeddings (200)
- AUC:  0.5934906011364117
- MRR:  0.2712393252597269
- nDCG@5:  0.2884957793194139
- nDCG@10:  0.3503003894574587

### Export predictions

In [None]:
val_pred.to_csv('val_pred.csv', index = False)

In [None]:
news_embeddings = news_metadata.drop(columns = [cols for cols in news_metadata if cols.startswith('cat_subcat_')])
news_embeddings.head()

In [None]:
news_embeddings.to_csv('news_embeddings.csv', index = False)