## 1: [Fairness](#fairness)
## 2: [Diversity](#diversity)
## 3: [Consistency](#consistency)

In [1]:
import pandas as pd
import numpy as np
import locale
locale.setlocale(locale.LC_ALL, 'en_US')
from collections import Counter
import matplotlib.pyplot as plt
import string
import random

In [2]:
df = pd.read_csv('Data.csv')

#Get total views
videoKeys = list(df.columns)
videoKeys.remove("Unnamed: 0")
df['sumVideoViews'] = df[videoKeys].sum(axis=1)

In [3]:
#Extract Country,Age and Gender
df['country'] = df['Unnamed: 0'].str.split("_",expand=True)[0]
df['age'] = df['Unnamed: 0'].str.split("_",expand=True)[1]
df['gender'] = df['Unnamed: 0'].str.split("_",expand=True)[2]

df = df.drop('Unnamed: 0',axis=1)

In [4]:
print("Total Views",df[videoKeys].sum(axis=0).sum())

Total Views 1581079336


In [5]:
df.loc[df.age=="65-",'age'] = "65+" #Minor Fix

In [6]:
df.groupby("age")["sumVideoViews"].sum().sort_values(ascending=False)

age
25-34    379308837
35-44    266945329
18-24    251841056
45-54    225989220
55-64    177114047
65+      156130826
13-17    123750021
Name: sumVideoViews, dtype: int64

In [7]:
#Top 10 countries by view
df.groupby("country")["sumVideoViews"].sum().sort_values(ascending=False)[:10]

country
US    114584035
IN     56850635
GB     38824159
CA     32873633
MY     21705597
PK     20176644
AU     19427786
PH     18552997
DE     16990377
ZA     16731074
Name: sumVideoViews, dtype: int64

In [8]:
dralgos = ["Clustering","PCA","NMF","LDA","SE","UMAP"]

In [9]:
def fixAge(temp): #Change Age Format
    for i in range(temp.iloc[1,1:].shape[0]):
        if int(temp.iloc[1,1+i]) <= 17:
            temp.iloc[1,1+i] = "13-17"
        elif int(temp.iloc[1,1+i]) > 17 and int(temp.iloc[1,1+i]) <= 24:
            temp.iloc[1,1+i] = "18-24"
        elif int(temp.iloc[1,1+i]) > 24 and int(temp.iloc[1,1+i]) <= 34:
            temp.iloc[1,1+i] = "25-34"
        elif int(temp.iloc[1,1+i]) > 34 and int(temp.iloc[1,1+i]) <= 44:
            temp.iloc[1,1+i] = "35-44"
        elif int(temp.iloc[1,1+i]) > 44 and int(temp.iloc[1,1+i]) <= 54:
            temp.iloc[1,1+i] = "45-54"
        elif int(temp.iloc[1,1+i]) > 54 and int(temp.iloc[1,1+i]) <= 64:
            temp.iloc[1,1+i] = "55-64"
        elif int(temp.iloc[1,1+i]) > 64:
            temp.iloc[1,1+i] = "65+"

    return temp

In [10]:
df["country"].nunique()

185

In [11]:
ccdf = pd.read_csv("CountryCode.csv") #Read country code DF
for i in range(df.shape[0]): #Change Country codes to full country names
    if df["country"].iloc[i] == "NA": #Pandas converted "NA" to "null", when we read the above csv file. So just fixing that.
        df["country"].iloc[i] = "Namibia"
    else:
        df["country"].iloc[i] = ccdf.loc[ccdf["Code"]==df["country"].iloc[i],'Name'].iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
df["country"].nunique()

185

# <a id="fairness">Fairness</a>

Calculating Fairness for age, gender and country.

In [13]:
totalViews = df["sumVideoViews"].sum()

#Create a CSV file to store the results
outputDF = pd.DataFrame(columns=["Dimensionality Reduction Algorithm","Number of Personas","GroupName","Subject","Percentage in Original Data","Unique Values in Original Data","Count in Persona Set","Percentage in Persona set","Total Unique Values in Persona Set","Statistical Parity"])

In [14]:
#Append all personas in a list
dfs = []
for dralgo in dralgos:
    for i in range(3):
        temp = pd.read_excel("personas/"+dralgo+".xlsx",sheet_name=2-i)
        temp = fixAge(temp)
        dfs.append([temp,temp.shape[1]-1,dralgo])

In [15]:
#Calculate fairness for gender
totalMale = df.loc[df["gender"]=="male"].groupby("gender")["sumVideoViews"].sum().values[0]
for cdf in dfs:
    cTotal = (cdf[0].iloc[2,1:].values=="male").sum()
    outputDF = outputDF.append(pd.DataFrame([[cdf[2],cdf[1],"Gender","Male",np.round(totalMale/totalViews*100,3),df["gender"].nunique(),cTotal,np.round(cTotal/cdf[1]*100,3),cdf[0].iloc[2,1:].nunique(),np.round(cTotal/cdf[1]-totalMale/totalViews,3)]], columns=outputDF.columns))


In [16]:
#Calculate fairness for Age
    
for age in ["25-34","35-44","18-24","45-54","55-64","65+","13-17"]:
    totalAge = df.loc[df["age"]==age].groupby("age")["sumVideoViews"].sum().values[0]
    for cdf in dfs:
        cTotal = (cdf[0].iloc[1,1:].values==age).sum()
        outputDF = outputDF.append(pd.DataFrame([[cdf[2],cdf[1],"Age",age,np.round(totalAge/totalViews*100,3),df["age"].nunique(),cTotal,np.round(cTotal/cdf[1]*100,3),cdf[0].iloc[1,1:].nunique(),np.round(cTotal/cdf[1]-totalAge/totalViews,3)]], columns=outputDF.columns))


In [17]:
#Calculate fairness for Country

df = df.loc[~df["country"].isnull()]

for country in df.country.unique().tolist():

    totalCountry = df.loc[df["country"]==country].groupby("country")["sumVideoViews"].sum().values[0]
    for cdf in dfs:
        cTotal = (cdf[0].iloc[3,1:].values==country).sum()
        outputDF = outputDF.append(pd.DataFrame([[cdf[2],cdf[1],"Country",country,np.round(totalCountry/totalViews*100,3),df["country"].nunique(),cTotal,np.round(cTotal/cdf[1]*100,3),cdf[0].iloc[3,1:].nunique(),np.round(cTotal/cdf[1]-totalCountry/totalViews,3)]], columns=outputDF.columns))


In [18]:
outputDF['Statistical Parity_Positive'] = np.abs(outputDF['Statistical Parity'])

In [19]:
outputDF.to_csv("stat_parity.csv",index=False)

In [20]:
#Save country fairness data
outputDF.loc[outputDF["GroupName"]=="Country"].groupby(['Dimensionality Reduction Algorithm','GroupName','Number of Personas'])['Statistical Parity_Positive'].mean().to_csv('Country_Stat_Parity_Mean.csv',header=True)

In [21]:
#Save Age fairness data
outputDF.loc[outputDF["GroupName"]=="Age"].groupby(['Dimensionality Reduction Algorithm','GroupName','Number of Personas'])['Statistical Parity_Positive'].mean().to_csv('Age_Stat_Parity_Mean.csv',header=True)

In [22]:
#Save Gender fairness data
outputDF.loc[outputDF["GroupName"]=="Gender"].groupby(['Dimensionality Reduction Algorithm','GroupName','Number of Personas'])['Statistical Parity_Positive'].mean().to_csv('Gender_Stat_Parity_Mean.csv',header=True)

# <a id="diversity">Diversity</a>

In [23]:
diversitydf = outputDF.groupby(["Dimensionality Reduction Algorithm","Number of Personas","GroupName"], as_index=False)["Total Unique Values in Persona Set"].first()

In [24]:
diversitydf["persona_group"] = "coverage"+"_"+diversitydf["GroupName"] + "_" + diversitydf["Number of Personas"].astype(str)
diversitydf = diversitydf.drop(["Number of Personas","GroupName"],axis=1)

In [25]:
diversitydf = diversitydf.pivot(index='persona_group', columns='Dimensionality Reduction Algorithm', values='Total Unique Values in Persona Set')

In [26]:
diversitydf = diversitydf.reset_index().rename_axis(None, axis=1)
diversitydf

Unnamed: 0,persona_group,Clustering,LDA,NMF,PCA,SE,UMAP
0,coverage_Age_10,5,4,4,3,6,3
1,coverage_Age_15,5,6,7,3,6,6
2,coverage_Age_5,3,3,3,3,5,4
3,coverage_Country_10,3,6,3,8,7,6
4,coverage_Country_15,4,9,15,5,10,10
5,coverage_Country_5,1,3,2,2,2,3
6,coverage_Gender_10,2,2,2,2,2,2
7,coverage_Gender_15,2,2,2,1,2,2
8,coverage_Gender_5,2,2,2,2,2,2


In [27]:
uniqueValues = outputDF.groupby(["GroupName"])["Unique Values in Original Data"].first()

In [28]:
for group in ["Age","Country","Gender"]:
    for numPersonas in [5,10,15]:
        persona_group = "coverage_"+group+"_"+str(numPersonas)
        diversitydf = diversitydf.append(pd.DataFrame([[persona_group,"","","","","",""]], columns=diversitydf.columns))
        for algorithm in diversitydf.columns[1:]:
            uniqueNumInPersona = int(diversitydf.loc[diversitydf["persona_group"]==persona_group,algorithm].iloc[0])
            uniqueNumInData = uniqueValues[group]
            diversitydf[algorithm].iloc[diversitydf.shape[0]-1] = uniqueNumInPersona/uniqueNumInData*100

In [29]:
diversitydf

Unnamed: 0,persona_group,Clustering,LDA,NMF,PCA,SE,UMAP
0,coverage_Age_10,5.0,4.0,4.0,3.0,6.0,3.0
1,coverage_Age_15,5.0,6.0,7.0,3.0,6.0,6.0
2,coverage_Age_5,3.0,3.0,3.0,3.0,5.0,4.0
3,coverage_Country_10,3.0,6.0,3.0,8.0,7.0,6.0
4,coverage_Country_15,4.0,9.0,15.0,5.0,10.0,10.0
5,coverage_Country_5,1.0,3.0,2.0,2.0,2.0,3.0
6,coverage_Gender_10,2.0,2.0,2.0,2.0,2.0,2.0
7,coverage_Gender_15,2.0,2.0,2.0,1.0,2.0,2.0
8,coverage_Gender_5,2.0,2.0,2.0,2.0,2.0,2.0
0,coverage_Age_5,42.8571,42.8571,42.8571,42.8571,71.4286,57.1429


In [30]:
diversitydf.to_csv("diversitydf.csv",index=False)

# <a id="consistency">Consistency</a>

In [31]:
consistencyDF = pd.DataFrame(columns=["Algorithm","Consistency Score"])

In [32]:
def randomString(stringLength): #Generate a random string
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(stringLength))

def common(lst1, lst2): 
    #This function finds the count of common values between two lists.
    #But we also do not want to use a single value from first list to match two values in the second list.
    #Example:
    #List1 = [1,1,2,3,4]
    #List2 = [1,1,1,2,2,3,4]
    #Score = 5. Because Two ones in list1 match two times. One two in list1 match one time. 3 and 4 match 1 time. So total 5 matches
    counter = 0
    lst11,lst22= lst1.copy(),lst2.copy() #Copy them, so we don't overwrite the original 
    for i in range(len(lst11)):
        for j in range(len(lst22)):
            if lst11[i]==lst22[j]:
                counter +=1
                lst11[i] = randomString(10000) #We don't want to count it again. So we just assign a random value to it.
                lst22[j] = randomString(10000)
                break
    return counter

In [33]:
for dralgo in dralgos: #Need to process files of all algorithms
    uniqueValues = []
    for i in range(3): #Need to read the file 3 times with all 3 sheets. (5,10,15)
        temp = pd.read_excel("personas/"+dralgo+".xlsx",sheet_name=2-i)
        uniqueValues_tmp = []
        for i in range(temp.shape[1]-1):
            #Join Age, Gender and country. 
            value = str(temp.iloc[1,1+i:2+i].values[0])+temp.iloc[2,1+i:2+i].values[0]+temp.iloc[3,1+i:2+i].values[0]
            uniqueValues_tmp.append(value)
        uniqueValues.append(uniqueValues_tmp)
      

    # Find common between 5 persona set and 10
    score = common(uniqueValues[0], uniqueValues[1])/5
    # Find common between 5 persona set and 15
    score += common(uniqueValues[0], uniqueValues[2])/5
    # Find common between 10 persona set and 15
    score += common(uniqueValues[1], uniqueValues[2])/10
    score = score/3
    
    consistencyDF = consistencyDF.append(pd.DataFrame([[dralgo,score]], columns=consistencyDF.columns))

In [34]:
consistencyDF

Unnamed: 0,Algorithm,Consistency Score
0,Clustering,0.0
0,PCA,0.233333
0,NMF,0.066667
0,LDA,0.0
0,SE,0.0
0,UMAP,0.0


In [35]:
consistencyDF.to_csv("consistencyDF.csv",index=False)