In [1]:
import pandas as pd
import numpy as np
from kafkanator import gini,lorentz_curve
import matplotlib.pyplot as pyplt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

<h1> Description </h1>
<br/>
This notebook assigns a discriminative score to the defined sensitive attributes of a population. For this example
the population is a set of attributes corresponding to the artists of our coolsongs.com web app example.
<br/>
<br/>
This discriminative score gives you an idea of how fairly your recommendation algorithm behaves with respect to those sensitive values.
<br/>
You can use this notebook with any recommendation system, after adapting it to the needs of your business.
It expects two dataframes.
<h2>Dataframe 1 :</h2> The list of recommendations returned for each query, stored as a comma-separated ranking of artist identifiers (see <i>rankings</i>).
<h2>Dataframe 2 :</h2> The sensitive attributes of your population (see <i>sens_attr</i>).

In [2]:
# `rankings` and `sens_attr` are the two dataframes used throughout this notebook

In [3]:
# Load the two inputs: the rankings (one row per query) and the sensitive
# attributes of every artist.
rankings = pd.read_csv('./data/ranking.csv', header=0, sep=',')
sens_attr = pd.read_csv('./data/artist_join_sens_attr.csv', header=0, sep=',')

In [4]:
rankings

Unnamed: 0,query,Rankings
0,1,2571012
1,2,16768
2,3,5791012
3,4,1451021
4,5,289101
5,6,467910
6,7,13591516
7,8,1756181
8,9,12345
9,10,2019181714


In [5]:
sens_attr

Unnamed: 0,artist_id,age,gender,nationality
0,1,20,M,national
1,2,52,F,foreign
2,3,36,M,national
3,4,25,F,foreign
4,5,67,M,national
5,6,45,F,foreign
6,7,59,M,national
7,8,23,F,foreign
8,9,18,M,national
9,10,56,F,foreign


In [6]:
# Column of sens_attr that identifies an artist.
id_column = "artist_id"
# Categorical columns of sens_attr that are scored as sensitive attributes.
sensitive_attributes = ["nationality", "gender"]

In [7]:
# Keep only the sensitive attributes plus the artist identifier (in that column order).
# Plain column selection replaces the previous self-merge on the index: it yields
# the same frame when the index is unique and cannot duplicate rows when it is not.
sensitive_columns = sens_attr[sensitive_attributes + [id_column]].copy()

In [8]:
sensitive_columns

Unnamed: 0,nationality,gender,artist_id
0,national,M,1
1,foreign,F,2
2,national,M,3
3,foreign,F,4
4,national,M,5
5,foreign,F,6
6,national,M,7
7,foreign,F,8
8,national,M,9
9,foreign,F,10


In [18]:
# Identifiers of the artists whose nationality is 'national'; used further down
# to cross-check the 'national' counter against the per-artist scores.
# The identifier column comes from id_column rather than a hard-coded name,
# and a single .loc call replaces the chained indexing.
is_national = sensitive_columns['nationality'] == 'national'
Anationals = sensitive_columns.loc[is_national, id_column].values

In [19]:
Anationals

array([ 1,  3,  5,  7,  9, 11, 13, 15, 16, 17, 18, 19, 20])

<h1> Score hash initialization </h1>
<br/>
This loop initializes a dictionary whose keys are the sensitive column names and whose values are dictionaries mapping each possible value of that column to a counter set to 0. For the moment it only works with categorical variables.

In [9]:
# One zeroed counter per distinct value of every sensitive attribute,
# shaped like {attribute: {value: 0, ...}, ...}.
counters_sens_attributes = {
    attribute: dict.fromkeys(set(sensitive_columns[attribute]), 0)
    for attribute in sensitive_attributes
}

In [10]:
counters_sens_attributes

{'nationality': {'national': 0, 'foreign': 0}, 'gender': {'F': 0, 'M': 0}}

<h1>Score hash algorithm</h1>

The <i>sensitive_columns</i> dataframe contains the sensitive information (information that could be a source of discrimination) about the population of artists who generate the content of the example web app coolsongs.com.
<br/>
<br/>
As a general description, this algorithm parses each row of the rankings dataframe. Each row is a comma-separated string containing a list of artist identifiers. Because it is a ranking, the order matters: the first position is more important than the second one, according to a business-defined score.

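The idea is, for each artist k in a dataset containing n artists, to compute an artist recommended score (RA) that measures how strongly this artist is recommended:
<br/>
<br/>
<center>$RA_k = \sum_{i=1}^{m} \left( N_i - pos_i(Artist_k) \right)$</center>

where m is the number of rankings, $N_i$ is the length of the ranking in row i, and $pos_i(Artist_k)$ is the zero-based position of the artist in that ranking (so the first artist of a ranking earns $N_i$ points and the last one earns 1). Rankings that do not contain the artist contribute 0.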

The artist recommended score $RA_k$ can be split over the values of a categorical sensitive column.
For a sensitive categorical column S that can take any of the p values $\{S_1,S_2,S_3, ... , S_p\}$
we have:
<center>$SC_j = \sum_{k \,:\, Artist_k(S) = S_j} RA_k$</center>
that is, $SC_j$ adds up the scores of every artist whose value for S is $S_j$.
And you must have:
<br/>
<br/>
<center>$\sum_{j=1}^{p} SC_j = \sum_{k=1}^{n} RA_k$</center>
<br/>
<br/>
The per-artist sums are stored in the <i>rock_artists</i> variable, and the per-value sums in <i>counters_sens_attributes</i>.

In [15]:
# Scoring pass: turn every ranking into points and accumulate them per artist
# (rock_artists) and per sensitive-attribute value (counters_sens_attributes).

# One entry per artist of the population (instead of a hard-coded range(1, 21)),
# so artists that are never recommended still appear with a score of 0.
rock_artists = {int(artist_id): 0 for artist_id in sensitive_columns[id_column]}

# Reset the counters so that re-running this cell does not add the same points twice.
for value_counts in counters_sens_attributes.values():
    for attribute_value in value_counts:
        value_counts[attribute_value] = 0

# artist id -> {attribute: value}, built once instead of filtering the dataframe
# for every ranked artist. to_dict(orient='index') raises if an id is duplicated.
attributes_by_artist = (
    sensitive_columns
    .set_index(id_column)[sensitive_attributes]
    .to_dict(orient='index')
)

# Take the ranking column
for ranking in rankings["Rankings"].values:
    ranked_ids = ranking.split(',')
    # The first artist earns N points, the second N - 1, ... and the last earns 1.
    N = len(ranked_ids)
    for points, raw_id in zip(range(N, 0, -1), ranked_ids):
        artist_id = int(raw_id)
        rock_artists[artist_id] += points
        # Credit the same points to the artist's value of every sensitive attribute.
        for attribute in sensitive_attributes:
            attribute_value = attributes_by_artist[artist_id][attribute]
            counters_sens_attributes[attribute][attribute_value] += points

In [22]:
rock_artists

{1: 13,
 2: 16,
 3: 3,
 4: 7,
 5: 22,
 6: 13,
 7: 13,
 8: 5,
 9: 11,
 10: 10,
 11: 0,
 12: 2,
 13: 5,
 14: 6,
 15: 2,
 16: 1,
 17: 7,
 18: 5,
 19: 4,
 20: 5}

In [29]:
# Cross-check: print the score of every 'national' artist and add them up;
# the total is expected to equal the 'national' counter.
sumNationals = 0
for national_id in Anationals:
    national_score = rock_artists[national_id]
    print(national_id, ' ', national_score)
    sumNationals += national_score

1   13
3   3
5   22
7   13
9   11
11   0
13   5
15   2
16   1
17   7
18   5
19   4
20   5


In [30]:
sumNationals

91

In [13]:
counters_sens_attributes

{'nationality': {'national': 91, 'foreign': 59}, 'gender': {'F': 70, 'M': 80}}

In [14]:
# Per-artist scores in ascending order.
scoring_gains_sorted = sorted(rock_artists.values())
# For every sensitive attribute (in the order of counters_sens_attributes),
# the ascending list of points gathered by each of its values.
sensitives = [
    sorted(value_counts.values())
    for value_counts in counters_sens_attributes.values()
]

In [264]:
scoring_gains_sorted

[0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 6, 7, 7, 10, 11, 13, 13, 13, 16, 22]

In [265]:
# Apply kafkanator's gini to the sorted per-artist scores.
# NOTE(review): presumably the Gini coefficient, where 0 would mean perfectly
# equal scores — confirm against the kafkanator documentation.
gini(np.array(scoring_gains_sorted))

0.402

In [266]:
# One gini value per sensitive attribute, in the same order as `sensitives`.
# Each list is converted on its own: np.array(sensitives) only yields a regular
# 2-D array when every attribute has the same number of values, and NumPy
# refuses to build a ragged array otherwise.
gns = [gini(np.array(attribute_gains)) for attribute_gains in sensitives]

In [267]:
sensitives

[[59, 91], [70, 80]]

In [268]:
gns

[0.10666666666666667, 0.03333333333333333]

In [271]:
# Lorenz curves for the per-artist scores and for each sensitive attribute.
# The first argument is an array of ones with one entry per value (presumably
# equal population weights — confirm with kafkanator); its length now follows
# the data instead of being hard-coded to 20 artists / 2 values.
lorentz_score_x,lorentz_score_y = lorentz_curve(np.repeat(1,len(scoring_gains_sorted)),scoring_gains_sorted)

# `sensitives` follows the order of sensitive_attributes (nationality first),
# so look the lists up by name rather than by position: the previous
# sensitives[0] / sensitives[1] indexing had gender and nationality swapped.
gender_gains = sensitives[sensitive_attributes.index('gender')]
nation_gains = sensitives[sensitive_attributes.index('nationality')]
lorentz_gender_x,lorentz_gender_y = lorentz_curve(np.repeat(1,len(gender_gains)),gender_gains)
lorentz_nation_x,lorentz_nation_y = lorentz_curve(np.repeat(1,len(nation_gains)),nation_gains)

In [279]:
# Draw the three Lorenz curves side by side. Every panel gets a title and every
# trace a name, so the reader can tell which attribute each curve refers to
# (the traces were previously anonymous: "trace 0", "trace 1", "trace 2").
lorentz_panels = [
    ("Artist scores", lorentz_score_x, lorentz_score_y),
    ("Gender", lorentz_gender_x, lorentz_gender_y),
    ("Nationality", lorentz_nation_x, lorentz_nation_y),
]

fig = make_subplots(
    rows=1,
    cols=len(lorentz_panels),
    subplot_titles=[panel[0] for panel in lorentz_panels],
)

# One trace per panel, placed in columns 1..3 from left to right.
for panel_col, (panel_title, curve_x, curve_y) in enumerate(lorentz_panels, start=1):
    fig.add_trace(
        go.Scatter(x=curve_x, y=curve_y, name=panel_title),
        row=1, col=panel_col
    )

fig.update_layout(height=600, width=800, title_text="Ginis per attribute")
fig.show()