# TOPSIS Ranking for Batsmen

In [5]:
import numpy as np               # for linear algebra
import pandas as pd              # for tabular output
from scipy.stats import rankdata # for ranking the candidates

## Step 0 - Obtaining and processing the data

The data from the Excel sheet is saved as CSV files in the `data` folder at the root of the project. The criteria, their rankings, and the players' scores on those criteria are loaded into NumPy arrays and processed for the next step.

Note that an attribute can be a beneficial attribute (in which case we want to maximize its contribution) or a cost attribute (which we want to minimize). We call the set of beneficial attributes $J_1$ and the set of cost attributes $J_2 = J_1^C$.

In [6]:
# Load the criteria table. The following cells read its 'Name', 'Ranking' and
# 'Ideally' columns, one row per attribute.
criteria_csv = '../data/batting_criteria.csv'
attributes_data = pd.read_csv(criteria_csv)
attributes_data

Unnamed: 0,Name,Ranking,Ideally
0,SR,1,Higher
1,Avg,2,Higher
2,Runs,3,Higher
3,Inn,4,Higher
4,NO,5,Higher
5,6s,6,Higher
6,4s,7,Higher
7,100s,8,Higher
8,50s,9,Higher
9,Mat,10,Higher


In [7]:
# Extract, in file order, the attribute names, their priority ranks, and the
# positions of the benefit attributes (rows whose 'Ideally' is 'Higher').
#
# Whole columns are read at once instead of looping over `iterrows()`, and the
# benefit set stores 0-based row *positions* rather than index labels: the
# PIS/NIS step tests `j in benefit_attributes` with `j` being a column position
# of the decision matrix, so labels would only be correct for a default index.
attributes = attributes_data['Name'].tolist()
ranks = attributes_data['Ranking'].to_numpy(dtype=float)
n = len(attributes)  # number of attributes (criteria)

is_benefit = (attributes_data['Ideally'] == 'Higher').to_numpy()
benefit_attributes = set(np.flatnonzero(is_benefit).tolist())

In [14]:
# Rank-based weights: an attribute with rank r (out of n) receives
# 2 * (n + 1 - r) / (n * (n + 1)), so the weight shrinks linearly as the rank
# number grows.
denominator = n * (n + 1)
weights = 2 * (n + 1 - ranks) / denominator
pd.DataFrame({'Weight': weights}, index=attributes)

Unnamed: 0,Weight
SR,0.153846
Avg,0.141026
Runs,0.128205
Inn,0.115385
NO,0.102564
6s,0.089744
4s,0.076923
100s,0.064103
50s,0.051282
Mat,0.038462


In [5]:
# Load the per-player statistics and build the decision matrix: one row per
# candidate, one column per attribute, in the same order as `attributes`.
original_dataframe = pd.read_csv('../data/batsmen.csv')
candidates = original_dataframe['Name'].to_numpy()

# Select the attribute columns by label so that a criterion missing from the
# CSV raises a KeyError instead of silently becoming an all-NaN column.
# Force an independent float copy: the following steps scale `raw_data` in
# place, which fails on an integer array and must never write through to
# `original_dataframe`.
raw_data = original_dataframe[attributes].to_numpy(dtype=float, copy=True)

dimensions = raw_data.shape
m, n = dimensions  # m candidates (rows), n attributes (columns)

pd.DataFrame(data=raw_data, index=candidates, columns=attributes)

Unnamed: 0,SR,Avg,Runs,Inn,NO,6s,4s,100s,50s,Mat,HS,BF
AB de Villiers,154.0,44.2,442.0,13.0,3.0,26.0,31.0,0.0,5.0,13.0,82.0,287.0
Andre Russel,204.81,56.67,510.0,13.0,4.0,52.0,31.0,0.0,4.0,14.0,80.0,249.0
Ben Stokes,124.24,20.5,123.0,9.0,3.0,4.0,8.0,0.0,0.0,9.0,46.0,99.0
Chris Gayle,153.6,40.83,490.0,13.0,1.0,34.0,45.0,0.0,4.0,13.0,99.0,319.0
Chris Lynn,139.65,31.15,405.0,13.0,0.0,22.0,41.0,0.0,4.0,13.0,82.0,290.0
David Warner,143.86,69.2,692.0,12.0,2.0,21.0,57.0,1.0,8.0,12.0,100.0,481.0
Faf Du Plessis,123.36,36.0,396.0,12.0,1.0,15.0,36.0,0.0,3.0,12.0,96.0,321.0
Jonny Bairstow,157.24,55.63,445.0,10.0,2.0,18.0,48.0,1.0,2.0,10.0,114.0,283.0
Jos Buttler,151.7,38.88,311.0,8.0,0.0,14.0,38.0,0.0,3.0,8.0,89.0,205.0
Kane Williamson,120.0,22.29,156.0,9.0,2.0,5.0,12.0,0.0,1.0,9.0,70.0,130.0


## Step 1 - Normalizing the ratings

$$r_{ij}=\frac{x_{ij}}{\sqrt{\sum_{i = 1}^{m} x_{ij}^2}}$$

where $i = 1, 2, \ldots, m$ and $j = 1, 2, \ldots, n$.

In [6]:
# Vector-normalise the decision matrix: every attribute column is divided by
# its Euclidean norm, computed for all columns at once.
divisors = np.linalg.norm(raw_data, axis=0)

# A column that is entirely zero has norm 0 and would become NaN (0 / 0), which
# then propagates into every distance and closeness score. Divide such columns
# by 1 instead so they simply stay at zero.
safe_divisors = np.where(divisors == 0, 1.0, divisors)

# Rebind instead of using `/=` so the division also works when `raw_data`
# happens to hold integers.
raw_data = raw_data / safe_divisors
pd.DataFrame(data=raw_data, index=candidates, columns=attributes)

Unnamed: 0,SR,Avg,Runs,Inn,NO,6s,4s,100s,50s,Mat,HS,BF
AB de Villiers,0.264163,0.266301,0.277322,0.26926,0.264135,0.287807,0.222189,0.0,0.354441,0.260889,0.24583,0.259994
Andre Russel,0.35132,0.341432,0.319987,0.26926,0.35218,0.575614,0.222189,0.0,0.283552,0.280957,0.239834,0.22557
Ben Stokes,0.213114,0.123511,0.077173,0.186411,0.264135,0.044278,0.057339,0.0,0.0,0.180615,0.137905,0.089684
Chris Gayle,0.263477,0.245997,0.307438,0.26926,0.088045,0.376363,0.322533,0.0,0.283552,0.260889,0.296795,0.288983
Chris Lynn,0.239548,0.187676,0.254107,0.26926,0.0,0.243529,0.293863,0.0,0.283552,0.260889,0.24583,0.262712
David Warner,0.24677,0.416924,0.434178,0.248548,0.17609,0.23246,0.408542,0.707107,0.567105,0.24082,0.299792,0.43574
Faf Du Plessis,0.211605,0.216897,0.24846,0.248548,0.088045,0.166043,0.258026,0.0,0.212664,0.24082,0.287801,0.290795
Jonny Bairstow,0.269721,0.335166,0.279204,0.207123,0.17609,0.199251,0.344035,0.707107,0.141776,0.200683,0.341763,0.256371
Jos Buttler,0.260218,0.234249,0.195129,0.165699,0.0,0.154973,0.272361,0.0,0.212664,0.160547,0.266815,0.18571
Kane Williamson,0.205841,0.134295,0.097878,0.186411,0.17609,0.055348,0.086009,0.0,0.070888,0.180615,0.209855,0.117767


## Step 2 - Calculating the Weighted Normalized Ratings

$$v_{ij} = w_j r_{ij}$$

where $i = 1, 2, \ldots, m$ and $j = 1, 2, \ldots, n$.

In [7]:
# Scale each attribute column of the normalised matrix by its weight; `weights`
# is broadcast across the rows. The product is written back into `raw_data`,
# so re-running this cell applies the weights a second time.
np.multiply(raw_data, weights, out=raw_data)
pd.DataFrame(data=raw_data, index=candidates, columns=attributes)

Unnamed: 0,SR,Avg,Runs,Inn,NO,6s,4s,100s,50s,Mat,HS,BF
AB de Villiers,0.04064,0.037555,0.035554,0.031068,0.027091,0.025829,0.017091,0.0,0.018176,0.010034,0.006303,0.003333
Andre Russel,0.054049,0.048151,0.041024,0.031068,0.036121,0.051658,0.017091,0.0,0.014541,0.010806,0.00615,0.002892
Ben Stokes,0.032787,0.017418,0.009894,0.021509,0.027091,0.003974,0.004411,0.0,0.0,0.006947,0.003536,0.00115
Chris Gayle,0.040535,0.034692,0.039415,0.031068,0.00903,0.033776,0.02481,0.0,0.014541,0.010034,0.00761,0.003705
Chris Lynn,0.036854,0.026467,0.032578,0.031068,0.0,0.021855,0.022605,0.0,0.014541,0.010034,0.006303,0.003368
David Warner,0.037965,0.058797,0.055664,0.028679,0.018061,0.020862,0.031426,0.045327,0.029082,0.009262,0.007687,0.005586
Faf Du Plessis,0.032555,0.030588,0.031854,0.028679,0.00903,0.014901,0.019848,0.0,0.010906,0.009262,0.00738,0.003728
Jonny Bairstow,0.041496,0.047267,0.035795,0.023899,0.018061,0.017882,0.026464,0.045327,0.007271,0.007719,0.008763,0.003287
Jos Buttler,0.040034,0.033035,0.025017,0.019119,0.0,0.013908,0.020951,0.0,0.010906,0.006175,0.006841,0.002381
Kane Williamson,0.031668,0.018939,0.012549,0.021509,0.018061,0.004967,0.006616,0.0,0.003635,0.006947,0.005381,0.00151


## Step 3 - Identifying PIS ($A^*$) and NIS ($A^-$)

$$
\begin{align}
A^* &= \left\{v_1^*, v_2^*, \ldots, v_n^*\right\} \\
A^- &= \left\{v_1^-, v_2^-, \ldots, v_n^-\right\} \\
\end{align}
$$

And we define

$$
\begin{align}
v_j^* &=
\begin{cases}
\max_i v_{ij}, & \text{if } j \in J_1 \\
\min_i v_{ij}, & \text{if } j \in J_2
\end{cases}
\\
v_j^- &=
\begin{cases}
\min_i v_{ij}, & \text{if } j \in J_1 \\
\max_i v_{ij}, & \text{if } j \in J_2
\end{cases}
\end{align}
$$

where $i = 1, 2, \ldots, m$ and $j = 1, 2, \ldots, n$.

In [8]:
# Column-wise extremes of the weighted normalised matrix.
column_max = raw_data[:, :n].max(axis=0)
column_min = raw_data[:, :n].min(axis=0)

# True where column j is a benefit attribute, False where it is a cost one.
benefit_mask = np.array([j in benefit_attributes for j in range(n)], dtype=bool)

# PIS takes the best value per attribute (max for benefit, min for cost);
# NIS takes the worst value (min for benefit, max for cost).
a_pos = np.where(benefit_mask, column_max, column_min)
a_neg = np.where(benefit_mask, column_min, column_max)

pd.DataFrame(data=np.vstack([a_pos, a_neg]), index=["$A^*$", "$A^-$"], columns=attributes)

Unnamed: 0,SR,Avg,Runs,Inn,NO,6s,4s,100s,50s,Mat,HS,BF
$A^*$,0.054049,0.058797,0.055664,0.040628,0.054182,0.051658,0.031426,0.045327,0.029082,0.013122,0.008763,0.005586
$A^-$,0.030612,0.017418,0.009894,0.019119,0.0,0.003974,0.004411,0.0,0.0,0.006175,0.003536,0.00115


## Step 4 and 5 - Calculating Separation Measures and Similarities to PIS

The separation (distance) of each alternative from the ideal solutions is measured by the $n$-dimensional Euclidean distance. The separations from the PIS $A^*$ and the NIS $A^-$ are denoted $S^*$ and $S^-$ respectively.

$$
\begin{align}
S_i^* &= \sqrt{\sum_{j = 1}^n \left(v_{ij} - v^*_j\right)^2} \\
S_i^- &= \sqrt{\sum_{j = 1}^n \left(v_{ij} - v^-_j\right)^2} \\
\end{align}
$$

where $i = 1, 2, \ldots, m$ and $j = 1, 2, \ldots, n$.

We also calculate

$$
C^*_i = \frac{S_i^-}{S_i^* + S_i^-},\text{ where }i = 1, 2, \ldots, m
$$

In [9]:
# Offsets of every candidate (row) from the positive and negative ideal
# solutions; `a_pos` / `a_neg` are broadcast across the rows.
offsets_pos = raw_data[:m] - a_pos
offsets_neg = raw_data[:m] - a_neg

# Euclidean length of each offset row: S* (distance to PIS), S- (distance to NIS).
sp = np.array([np.sqrt(row @ row) for row in offsets_pos], dtype=float)
sn = np.array([np.sqrt(row @ row) for row in offsets_neg], dtype=float)

# Relative closeness to the ideal solution, C* = S- / (S* + S-).
cs = sn / (sp + sn)

pd.DataFrame({"$S^*$": sp, "$S^-$": sn, "$C^*$": cs}, index=candidates)

Unnamed: 0,$S^*$,$S^-$,$C^*$
AB de Villiers,0.070196,0.055112,0.439813
Andre Russel,0.056888,0.081165,0.587927
Ben Stokes,0.106526,0.027294,0.203959
Chris Gayle,0.076169,0.055195,0.420169
Chris Lynn,0.090296,0.04084,0.31143
David Warner,0.051679,0.090778,0.637231
Faf Du Plessis,0.088862,0.036606,0.291756
Jonny Bairstow,0.062814,0.069648,0.525796
Jos Buttler,0.09581,0.032634,0.254072
Kane Williamson,0.105748,0.01912,0.15312


## Step 6 - Ranking the candidates/alternatives

We choose the candidate with the maximum $C^*$, or rank all the alternatives in descending order of their $C^*$ values. The same ranking can be produced from the $S^-$ values (descending) and from the $S^*$ values (ascending, since a smaller distance from the PIS is better).

In [10]:
def rank_according_to(data):
    """Return the candidate names ordered from the highest to the lowest score.

    Parameters
    ----------
    data : array_like
        One numeric score per candidate, in the same order as the
        module-level ``candidates`` array.

    Returns
    -------
    numpy.ndarray
        The entries of ``candidates`` arranged by descending ``data``. Every
        candidate appears exactly once; among tied scores the candidate
        listed later in ``candidates`` comes first.
    """
    # method='ordinal' assigns each entry a distinct integer rank 1..len(data).
    # The default 'average' method gives tied scores the same fractional rank,
    # which collides after integer truncation and silently drops a candidate.
    positions = (rankdata(data, method='ordinal') - 1).astype(int)

    # `positions` is a permutation, so every slot is overwritten below.
    storage = np.empty_like(candidates)
    storage[positions] = candidates
    return storage[::-1]

In [11]:
# Candidate names sorted by each measure, highest value first.
cs_order = rank_according_to(cs)
sn_order = rank_according_to(sn)
sp_order = rank_according_to(sp)

# C* and S- are shown in descending order as returned; the S* column is
# reversed so that the candidate with the smallest S* is listed first.
ranking_columns = {
    "$C^*$": cs_order,
    "$S^*$": sp_order[::-1],
    "$S^-$": sn_order,
}
pd.DataFrame(ranking_columns, index=range(1, m + 1))

Unnamed: 0,$C^*$,$S^*$,$S^-$
1,David Warner,David Warner,David Warner
2,Andre Russel,Andre Russel,Andre Russel
3,Jonny Bairstow,Jonny Bairstow,Jonny Bairstow
4,Kieron Pollard,AB de Villiers,Kieron Pollard
5,AB de Villiers,Chris Gayle,Marcus Stoinis
6,Chris Gayle,Kieron Pollard,Chris Gayle
7,Marcus Stoinis,Quinton de Kock,AB de Villiers
8,Quinton de Kock,Marcus Stoinis,Quinton de Kock
9,Chris Lynn,Faf Du Plessis,Shane Watson
10,Shane Watson,Chris Lynn,Chris Lynn


In [12]:
# Summarise the C*-based ranking: the top candidate, then the full order.
best_candidate = cs_order[0]
preference_list = ", ".join(cs_order)

print("The best candidate/alternative according to C* is " + best_candidate)
print("The preferences in descending order are " + preference_list + ".")

The best candidate/alternative according to C* is David Warner
The preferences in descending order are David Warner, Andre Russel, Jonny Bairstow, Kieron Pollard, AB de Villiers, Chris Gayle, Marcus Stoinis, Quinton de Kock, Chris Lynn, Shane Watson, Faf Du Plessis, Steve Smith, Jos Buttler, Moeen Ali, Ben Stokes, Kane Williamson.
