# Practical Work

<b> Student: </b> Jonas Fallmann <br>
<b> Matriculation number: </b> 12018700 <br>
<b> Topic:</b> Evaluating gender bias of explained recommender systems<br>
<b> Institute: </b> Institute of Computational Perception, Johannes Kepler University Linz

## Outline
Recent advances in the field of recommender systems focus to a large extent on explaining the recommended items. Explainability has been shown to increase user trust and to result in higher conversion rates in the online shopping domain. As an example, explanations in the music domain could take the following form:

<b>Recommended Item:</b> Clocks and Whoopty by Pintel Ragetti<br>
<b> Explanation: </b>  



# Train Model

### Load config and create Dataset

In [None]:
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.knowledge_aware_recommender import KGAT
from recbole.trainer import KGATTrainer
from recbole.utils import init_seed, init_logger

# When True, the training cell fits a new KGAT model and evaluates it on the test split.
train_model = True
# When True, the explanation paths are recomputed and pickled; when False they are
# loaded from the existing pickle file instead.
create_paths = True

In [None]:

# Build the RecBole configuration for the KGAT model on the 'rb_lfm' dataset,
# with overrides read from 'lfm.yaml'.
config = Config(model='KGAT', dataset='rb_lfm', config_file_list=['lfm.yaml'])

# Seed the random number generators from the configured seed / reproducibility flag.
init_seed(config['seed'], config['reproducibility'])

# Set up logging and fetch the root logger.
init_logger(config)
logger = getLogger()

# Record the full configuration in the log.
logger.info(config)

# Create the dataset from the configuration and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

# Split the dataset into training, validation and test data.
train_data, valid_data, test_data = data_preparation(config, dataset)


### Train model and evaluate

In [None]:
# Training is optional: skipped entirely when `train_model` is False.
if train_model:
    # Instantiate KGAT on the training dataset and move it to the configured device.
    model = KGAT(config, train_data._dataset).to(config['device'])
    logger.info(model)

    # Trainer that drives fitting and evaluation of the model.
    trainer = KGATTrainer(config, model)

    print('Starting to fit model')
    # Fit on the training data, validating on valid_data; saved=True asks the trainer
    # to persist the model (presumably the best checkpoint — see RecBole docs).
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, saved=True, show_progress=True)

    print('Evaluating model')
    # Evaluate on the test split using the best model found during fitting.
    test_result = trainer.evaluate(test_data, load_best_model=True)
    print(test_result)

# Load best model

In [None]:
import os
import torch

In [None]:
# os.listdir() returns entries in arbitrary order, so indexing with [-1] does not
# reliably yield the newest checkpoint. Pick the most recently modified file instead.
saved_dir = 'saved'
latest_model = max(
    (os.path.join(saved_dir, file_name) for file_name in os.listdir(saved_dir)),
    key=os.path.getmtime,
)

In [None]:
# Show which checkpoint file was selected.
latest_model

In [None]:
# Load the checkpoint onto the same device the model is moved to. A hard-coded
# 'cuda' target fails on machines without a GPU and can disagree with config['device'].
state_dict = torch.load(latest_model, map_location=config['device'])
model = KGAT(config, train_data._dataset).to(config['device'])

# The checkpoint is a dict; the model weights are stored under its 'state_dict' key.
model.load_state_dict(state_dict['state_dict'])

In [None]:
# Re-evaluate the model restored from disk; load_best_model=False because the
# weights were already loaded manually above.
trainer = KGATTrainer(config, model)
test_result = trainer.evaluate(test_data, load_best_model=False)
print(test_result)

# Prepare for model evaluation

## Create graph from attention matrix

In [None]:
import os
import torch
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import shortest_path
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from recbole.data.interaction import Interaction


In [None]:
def get_path(Pr, j):
    """Reconstruct the path that ends at node ``j`` from a predecessor array.

    ``Pr[k]`` holds the predecessor of node ``k``; the value ``-9999`` marks a
    node without a predecessor and terminates the walk.

    Returns the list of nodes ordered from the start of the path to ``j``.
    If ``j`` has no predecessor, the result is ``[j]``.
    """
    backwards = [j]
    node = Pr[j]
    # Walk the predecessor chain backwards until the sentinel is reached.
    while node != -9999:
        backwards.append(node)
        node = Pr[node]
    backwards.reverse()
    return backwards

In [None]:
from enum import Enum
class DistanceMode(Enum):
    """Ways of turning an attention weight ``a`` into an edge distance.

    As applied in ``create_scipy_matrix``: ``Inverse`` uses ``1 / a``,
    ``RBF`` uses ``exp(-a**2)`` and ``Linear`` uses ``1 - a``.
    """

    Inverse = 1
    RBF = 2
    Linear = 3

In [None]:
# Move the model's A_in tensor to the CPU and coalesce it so that its values and
# indices can be read below.
# NOTE(review): A_in is presumably KGAT's sparse attention matrix — confirm in RecBole.
attention_matrix = model.A_in.cpu().coalesce()
# Side length of the matrix; only the first dimension is read because it is square.
attention_matrix_size = attention_matrix.size()[0] # since it is square

In [None]:
def create_scipy_matrix(distance_mode: "DistanceMode", size: int):
    """Build a SciPy COO distance graph from the global sparse ``attention_matrix``.

    Each stored attention value ``a`` is converted to an edge weight according to
    ``distance_mode``:

    * ``DistanceMode.Inverse``: ``1 / a``
    * ``DistanceMode.RBF``: ``exp(-a**2)``
    * ``DistanceMode.Linear``: ``1 - a``

    Any other value leaves the attention values unchanged.

    Args:
        distance_mode: Member of ``DistanceMode`` selecting the transformation.
        size: Number of rows/columns of the resulting square matrix.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(size, size)`` holding the
        transformed values at the attention matrix's index positions.
    """
    attention_values = attention_matrix.values().numpy()
    # The modes are mutually exclusive, so use a single if/elif chain.
    if distance_mode == DistanceMode.Inverse:
        distances = 1 / attention_values
    elif distance_mode == DistanceMode.RBF:
        distances = np.exp(-attention_values ** 2)
    elif distance_mode == DistanceMode.Linear:
        distances = 1 - attention_values
    else:
        distances = attention_values
    # indices() has one row of row-indices and one row of column-indices.
    rows, cols = attention_matrix.indices().numpy()
    return coo_matrix((distances, (rows, cols)), shape=(size, size))

In [None]:
# Distance graph used for the shortest-path search, built with the RBF transformation.
graph_matrix = create_scipy_matrix(DistanceMode.RBF, attention_matrix_size)

## Create shortest path for k top predictions

<b>Important note:</b> IDs in the attention matrix are shifted by the number of users, `dataset.user_num`. Fetching an ID from a token (i.e. via `token2id`) returns an ID that is not yet shifted, so to get the corresponding attention-matrix ID we need to shift the item and entity IDs by `dataset.user_num`.

In [None]:
# Maps an internal user id to the list of shortest paths (arrays of node ids)
# from that user to the selected items.
user_paths = {}
if create_paths:
    for batch_idx, batched_data in enumerate(tqdm(test_data)):
        interaction, history_index, positive_u, positive_i = batched_data
        # .item() requires exactly one user id per batch.
        user_id = interaction['user_id'].cpu().numpy().item()

        predictions = model.full_sort_predict(interaction).detach().cpu().numpy()
        # np.argsort sorts ascending, so the lowest-scored indices come first.
        recommended_items = np.argsort(predictions)
        # Keep only the indices that equal one of `positive_i + dataset.user_num`.
        # NOTE(review): `recommended_items` are positions in `predictions` (presumably
        # unshifted item ids) while the right-hand side is shifted by user_num — confirm
        # that both sides live in the same id space.
        recommended_items = recommended_items[np.isin(recommended_items, positive_i + dataset.user_num)]

        # now calculate shortest distances for current user
        # Single-source search: `predecessors` has one row, for source `user_id`.
        shortest_distances, predecessors = shortest_path(graph_matrix, directed=True,  return_predecessors = True, indices=[user_id])

        # calculate the paths for the top k recommended items
        # NOTE(review): no truncation to k happens here; every remaining index is used.
        paths = []
        for item_idx in recommended_items:
            path = get_path(predecessors[0], item_idx)
            paths.append(np.asarray(path, dtype=int))
        user_paths[user_id] = paths

In [None]:
import pickle
# File used to persist the computed paths between notebook runs.
file_name = 'paths.pkl'
if create_paths:
    # Paths were just computed: store them for later runs.
    with open(file_name, 'wb') as file:
        pickle.dump(user_paths, file)
        print(f'Object successfully saved to "{file_name}"')
else:
    # Paths were not computed: restore them from the previous run.
    # NOTE: pickle.load can execute arbitrary code; only load a file written by this notebook.
    with open(file_name, 'rb') as file:
        user_paths = pickle.load(file)
        print(f'Object successfully loaded from "{file_name}"')

In [None]:
# Internal entity ids of the gender tokens 'm' and 'f' (in that order).
gender_ids = dataset.token2id('entity_id', ['m', 'f'])

In [None]:
# Dense copy of the graph row belonging to the 'm' entity; the entity id is shifted
# by the number of users to address the graph matrix.
male_attention_row = np.asarray(graph_matrix.getrow(dataset.user_num + gender_ids[0]).todense())
male_attention_row

In [None]:
# Positions of the non-zero entries in the 'm' entity's row.
np.nonzero(male_attention_row)

In [None]:
# Dense copy of the graph column belonging to the 'm' entity (same shifted id as the row).
male_attention_col = np.asarray(graph_matrix.getcol(dataset.user_num + gender_ids[0]).todense())
male_attention_col

In [None]:
# Positions of the non-zero entries in the 'm' entity's column.
np.nonzero(male_attention_col)

In [None]:
# Gather every path of every user into one list, then plot a histogram of the
# number of nodes per path.
flat_list = []
for paths in user_paths.values():
    flat_list.extend(paths)
flat_lengths = list(map(len, flat_list))
plt.hist(flat_lengths)

# Evaluation

In [None]:
import seaborn as sns

## Load all users and divide into male and female

In [None]:
import pandas as pd
# Load the tab-separated user metadata; the `user_id` and `gender` columns are used below.
user_df = pd.read_csv('data/lfm/75percent_subset_users.tsv', sep='\t')

In [None]:
# Internal ids of all users for which paths are available.
user_ids = list(user_paths.keys())

In [None]:
# Translate internal user ids back to their original tokens and cast them to int
# so they can be matched against `user_df.user_id`.
external_user_ids = [int(item) for item in dataset.id2token(dataset.uid_field, user_ids)]

In [None]:
# remove users that recbole removed in the dataset creation process
# Keep only users for which paths exist, i.e. drop users that RecBole removed
# during dataset creation.
user_df = user_df[user_df.user_id.isin(external_user_ids)]

In [None]:
# Split the remaining users by their `gender` value; rows with any other value
# end up in neither frame.
male_users = user_df[user_df.gender == 'm']
female_users = user_df[user_df.gender =='f']

In [None]:
# Report the absolute count and the percentage share of each gender.
# The second line used to be mislabelled as "Male users", and the value printed
# before "(%)" was a fraction rather than a percentage.
total_users = len(male_users) + len(female_users)
print('Male users: %s, %s(%%)' % (len(male_users), 100 * len(male_users) / total_users))
print('Female users: %s, %s(%%)' % (len(female_users), 100 * len(female_users) / total_users))

In [None]:
# Bar chart comparing the number of male and female users.
plt.bar(['male user count', 'female user count'], [len(male_users), len(female_users)])

In [None]:
# Convert the external user ids of each gender to RecBole's internal ids; the ids
# are passed as strings because token2id is looked up by token.
male_internal_user_ids = dataset.token2id(dataset.uid_field, [str(item) for item in male_users.user_id.to_numpy()])
female_internal_user_ids = dataset.token2id(dataset.uid_field, [str(item) for item in female_users.user_id.to_numpy()])

In [None]:
def internal_id_to_gender(idx: int):
    """Return ``'m'`` if ``idx`` is a known male internal user id, otherwise ``'f'``.

    Every id that is not contained in ``male_internal_user_ids`` is reported as
    ``'f'``, regardless of whether it appears in ``female_internal_user_ids``.
    """
    if idx in male_internal_user_ids:
        return 'm'
    return 'f'

In [None]:
def is_male(idx:int):
    """Tell whether ``idx`` is contained in ``male_internal_user_ids``."""
    found = idx in male_internal_user_ids
    return found

### Calculate average path length on per user basis

In [None]:
# Mean number of nodes per path for every user. Users without any path get NaN
# (np.mean of an empty list), which is filtered out further below.
# A comprehension is used so that no module-level variable named
# `average_path_length` is created: that name is also used for a function later in
# the notebook, and re-running this cell would otherwise overwrite the function.
user_average_path_lengths = {
    user_id: np.mean([len(path) for path in paths_for_user])
    for user_id, paths_for_user in user_paths.items()
}

In [None]:
# One row per user: gender, average path length and a constant placeholder column
# ('male/female') that serves as the single x-axis category in the plots below.
length_distribution_dataframe = pd.DataFrame({'gender': [internal_id_to_gender(idx) for idx in user_paths.keys()], 'average path length': list(user_average_path_lengths.values()), 'male/female': ['_' for _ in user_paths.keys()]})
length_distribution_dataframe

In [None]:
import matplotlib.pyplot as plt
# Two side-by-side histograms of the per-user average path length:
# left for male users, right for female users.
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15,7))

# Left panel: rows whose gender is 'm'.
sns.histplot(data=length_distribution_dataframe[length_distribution_dataframe.gender == 'm'], ax=ax1, stat="proportion",
             x="average path length", kde=False,
             color="#3899C7",
             element="bars", legend=True)
ax1.set_title("Explanation path length distribution for males")
ax1.set_xlabel("Path length")

# Right panel: rows whose gender is 'f'.
sns.histplot(data=length_distribution_dataframe[length_distribution_dataframe.gender == 'f'], ax=ax2, stat="proportion", multiple="stack",
             x="average path length", kde=False,
             color='#C76638',
             element="bars", legend=True)
ax2.set_title("Explanation path length distribution for females")
ax2.set_xlabel("Path length")

In [None]:
import matplotlib.pyplot as plt
# Split violin plot of the per-user average path length, one half per gender.
# `stat` is not a violinplot parameter (it belongs to histplot); extra keyword
# arguments are forwarded to matplotlib, which rejects it, so it is omitted here.
f = plt.figure(figsize=(15,7))
sns.violinplot(data=length_distribution_dataframe, y='average path length', x='male/female', hue='gender', split=True, palette="pastel", legend=True)

In [None]:
import matplotlib.pyplot as plt
# Box plot of the per-user average path length, grouped by gender via `hue`.
f = plt.figure(figsize=(10,10))
sns.boxplot(y="average path length", x="male/female",
            hue="gender", palette="pastel",
            data=length_distribution_dataframe)

## Evaluate path length per gender per user

In [None]:
import math

In [None]:
# Per-user average path lengths of the male users, skipping NaN averages.
male_path_lengths = {
    uid: user_average_path_lengths[uid]
    for uid in user_paths
    if uid in male_internal_user_ids and not math.isnan(user_average_path_lengths[uid])
}

# Same selection for the female users.
female_path_lengths = {
    uid: user_average_path_lengths[uid]
    for uid in user_paths
    if uid in female_internal_user_ids and not math.isnan(user_average_path_lengths[uid])
}

In [None]:
# Mean of the per-user average path lengths over male users.
np.mean(list(male_path_lengths.values()))

In [None]:
# Median of the per-user average path lengths over male users.
np.median(list(male_path_lengths.values()))

In [None]:
# Mean of the per-user average path lengths over female users.
np.mean(list(female_path_lengths.values()))

In [None]:
# Median of the per-user average path lengths over female users.
np.median(list(female_path_lengths.values()))

## Evaluate path length per gender

In [None]:
# Paths of the users whose internal id is listed as male.
male_paths = {
    uid: paths
    for uid, paths in user_paths.items()
    if uid in male_internal_user_ids
}

# Paths of the users whose internal id is listed as female.
female_paths = {
    uid: paths
    for uid, paths in user_paths.items()
    if uid in female_internal_user_ids
}

In [None]:
def average_path_length(paths: dict):
    """Compute the mean path length over all paths of all users.

    Args:
        paths: Mapping from a user id to that user's list of paths.

    Returns:
        A tuple ``(mean_length, lengths)`` where ``lengths`` lists the length of
        every individual path in iteration order of ``paths``.
    """
    lengths = [
        len(single_path)
        for paths_of_user in paths.values()
        for single_path in paths_of_user
    ]
    mean_length = np.mean(lengths)
    return mean_length, lengths

In [None]:
# Mean path length and the individual path lengths over all paths of male users.
male_average, male_lengths = average_path_length(male_paths)

In [None]:
# Mean path length and the individual path lengths over all paths of female users.
female_average, female_lengths = average_path_length(female_paths)

In [None]:
# Show both averages side by side (female first, then male).
female_average, male_average

## Evaluate shared entities

In [None]:
# Collect, per user, all nodes that occur on that user's paths (excluding the
# starting user node) and group the resulting arrays by the user's gender.
male_shared_entities = []
female_shared_entities = []
for user_id, paths_for_user in user_paths.items():
    # path[1:] drops only the first node of each path (the user itself).
    # NOTE(review): the variable name suggests the end node should be dropped too
    # (path[1:-1]); the original slicing is kept — confirm the intended behaviour.
    paths_without_start_and_end = [path[1:] for path in paths_for_user]
    paths_without_start_and_end = [path for path in paths_without_start_and_end if len(path) > 0]

    # np.concatenate raises on an empty list. Previously that error was swallowed
    # and the array of the *previous* user was appended again (or a NameError was
    # raised for the very first user). Users without any entity are skipped instead.
    if not paths_without_start_and_end:
        continue
    concatenated_paths = np.concatenate(paths_without_start_and_end)

    if is_male(user_id):
        male_shared_entities.append(concatenated_paths)
    else:
        female_shared_entities.append(concatenated_paths)

# Flat arrays with every collected node per gender, used for the grouped statistics.
male_shared_entities_concatenated = np.concatenate(male_shared_entities)
female_shared_entities_concatenated = np.concatenate(female_shared_entities)

In [None]:
def evaluate_gender_of_shared_entities(data):
    """Compute the relative frequency of each token in ``data``.

    Args:
        data: Sequence of tokens (e.g. ``'m'`` / ``'f'``).

    Returns:
        A tuple ``(distribution, count)`` where ``distribution`` maps every distinct
        token to its share of ``data`` and ``count`` is ``len(data)``. For empty
        input the distribution is an empty dict instead of raising ``IndexError``.
    """
    gender_token, counts = np.unique(data, return_counts=True)
    total = counts.sum()
    # Nothing to count: avoid indexing into empty arrays / dividing by zero.
    if total == 0:
        return ({}, len(data))
    distribution = {token: count / total for token, count in zip(gender_token, counts)}
    return (distribution, len(data))

### Evaluate grouped by user by gender

In [None]:
# For every male user: keep the path nodes that are users (id below user_num),
# map them to gender tokens and compute the gender distribution.
male_shared_gender_statistics = [
    evaluate_gender_of_shared_entities(
        [internal_id_to_gender(idx) for idx in entities[entities < dataset.user_num]]
    )
    for entities in male_shared_entities
]

# Same computation for every female user.
female_shared_gender_statistics = [
    evaluate_gender_of_shared_entities(
        [internal_id_to_gender(idx) for idx in entities[entities < dataset.user_num]]
    )
    for entities in female_shared_entities
]

In [None]:
# One row per male user: share of male / female users among the shared entities.
# 'male - female' is a constant placeholder column.
male_shared_gender_df = pd.DataFrame({
    'gender': ['m'] * len(male_shared_gender_statistics),
    'male_proportion': [
        float(dist['m']) if 'm' in dist else 0
        for dist, _ in male_shared_gender_statistics
    ],
    'female_proportion': [
        float(dist['f']) if 'f' in dist else 0
        for dist, _ in male_shared_gender_statistics
    ],
    'male - female': ['_'] * len(male_shared_gender_statistics),
})

# Same layout, one row per female user.
female_shared_gender_df = pd.DataFrame({
    'gender': ['f'] * len(female_shared_gender_statistics),
    'male_proportion': [
        float(dist['m']) if 'm' in dist else 0
        for dist, _ in female_shared_gender_statistics
    ],
    'female_proportion': [
        float(dist['f']) if 'f' in dist else 0
        for dist, _ in female_shared_gender_statistics
    ],
    'male - female': ['_'] * len(female_shared_gender_statistics),
})

In [None]:
# Column means for male users. The frame also contains string columns
# ('gender', 'male - female'); numeric_only=True restricts the mean to the
# proportion columns, as pandas >= 2.0 raises on non-numeric columns otherwise.
male_shared_gender_df.mean(numeric_only=True)

In [None]:
# Column means for female users. The frame also contains string columns
# ('gender', 'male - female'); numeric_only=True restricts the mean to the
# proportion columns, as pandas >= 2.0 raises on non-numeric columns otherwise.
female_shared_gender_df.mean(numeric_only=True)

In [None]:
# Distribution of the per-user share of *male* users among the shared entities:
# left panel for male users, right panel for female users.
# The axis labels previously misspelled "entities" as "entites".
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15,7))

sns.histplot(data=male_shared_gender_df, ax=ax1, stat="proportion",
             x="male_proportion", kde=True,
             color="#3899C7",
             element="bars", legend=True, binwidth=.02)
ax1.set_title("Explanation paths for males")
ax1.set_xlabel("Proportion of shared entities being males")
# Same y-range on both panels so they can be compared directly.
ax1.set_ylim(0,0.30)

sns.histplot(data=female_shared_gender_df, ax=ax2, stat="proportion", multiple="stack",
             x="male_proportion", kde=True,
             color='#C76638',
             element="bars", legend=True, binwidth=.02)
ax2.set_title("Explanation paths for females")
ax2.set_xlabel("Proportion of shared entities being males")
ax2.set_ylim(0,0.30)


In [None]:
# Distribution of the per-user share of *female* users among the shared entities:
# left panel for male users, right panel for female users.
# The axis labels previously misspelled "entities" as "entites".
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15,7))

sns.histplot(data=male_shared_gender_df, ax=ax1, stat="proportion",
             x="female_proportion", kde=True,
             color="#3899C7",
             element="bars", legend=True, binwidth=.02)
ax1.set_title("Explanation paths for males")
ax1.set_xlabel("Proportion of shared entities being females")
# Same y-range on both panels so they can be compared directly.
ax1.set_ylim(0,0.30)

sns.histplot(data=female_shared_gender_df, ax=ax2, stat="proportion", multiple="stack",
             x="female_proportion", kde=True,
             color='#C76638',
             element="bars", legend=True, binwidth=.02)
ax2.set_title("Explanation paths for females")
ax2.set_xlabel("Proportion of shared entities being females")
ax2.set_ylim(0,0.30)


### Evaluate grouped by gender

In [None]:
# Distinct node ids on the paths of each gender together with how often each occurs.
male_shared_item_ids, male_shared_item_counts = np.unique(male_shared_entities_concatenated, return_counts=True)
female_shared_item_ids, female_shared_item_counts = np.unique(female_shared_entities_concatenated, return_counts=True)

In [None]:
# Order ids and counts by descending count; one ordering is computed per gender
# and applied to both arrays so they stay aligned.
male_order = np.argsort(male_shared_item_counts)[::-1]
male_shared_item_ids = male_shared_item_ids[male_order]
male_shared_item_counts = male_shared_item_counts[male_order]

female_order = np.argsort(female_shared_item_counts)[::-1]
female_shared_item_ids = female_shared_item_ids[female_order]
female_shared_item_counts = female_shared_item_counts[female_order]

In [None]:
# Total number of node occurrences per gender; used as the percentage denominator below.
male_shared_entity_count = np.sum(male_shared_item_counts)
female_shared_entity_count = np.sum(female_shared_item_counts)

In [None]:
# Print the ten most frequent nodes on male users' paths: id, count and share in percent.
for shared_entity, entity_count in zip(male_shared_item_ids[:10], male_shared_item_counts):
    entity_share = entity_count*100/male_shared_entity_count
    print('Entity %s. \t %s \t %s%%' % (shared_entity, entity_count, entity_share))

In [None]:
# Print the ten most frequent nodes on female users' paths: id, count and share in percent.
for shared_entity, entity_count in zip(female_shared_item_ids[:10], female_shared_item_counts):
    entity_share = entity_count*100/female_shared_entity_count
    print('Entity %s. \t %s \t %s%%' % (shared_entity, entity_count, entity_share))

In [None]:
# Keep only the node ids below user_num, i.e. the nodes that are users.
# The arrays hold distinct ids, so every user appears at most once here.
male_shared_users = male_shared_item_ids[male_shared_item_ids < dataset.user_num]
female_shared_users = female_shared_item_ids[female_shared_item_ids < dataset.user_num]

In [None]:
# Gender token ('m' or 'f') for each distinct user found on the paths.
male_shared_user_gender = [internal_id_to_gender(idx) for idx in male_shared_users]
female_shared_user_gender = [internal_id_to_gender(idx) for idx in female_shared_users]

In [None]:
# Gender distribution over the distinct users on male users' paths (not weighted by count).
evaluate_gender_of_shared_entities(male_shared_user_gender)

In [None]:
# Gender distribution over the distinct users on female users' paths (not weighted by count).
evaluate_gender_of_shared_entities(female_shared_user_gender)

In [None]:
# Same distribution restricted to the first 100 users of the male list.
evaluate_gender_of_shared_entities(male_shared_user_gender[:100])

In [None]:
# Same distribution restricted to the first 100 users of the female list.
evaluate_gender_of_shared_entities(female_shared_user_gender[:100])