In [12]:
from scipy.stats import mannwhitneyu, linregress
from scipy.spatial.distance import cdist
from itertools import combinations
import numpy
import pandas

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# Path of the comma-separated file that is read below.
DATA_FILE = 'DataSetName.csv'
# Positional index of the column holding each row's team id.
TEAM_ID_COLUMN_INDEX = 1 # Recall that Python indexes from 0
# Read the whole file once. index_col=False tells pandas not to use the first
# column as the row index, so rows keep the default 0..n-1 index.
IMPORTED_DATA = pandas.read_csv(DATA_FILE, index_col=False, sep=',')

# Dealing with different sized data

To deal with data from teams of different sizes, we divide each team's vector $\mathbf{x}\in\mathbb{R}^d$ by the team size $d$ (which is also the number of entries in the vector).  This has the effect of normalizing vector sizes: $\mathbf{x}_\text{norm} = \frac{1}{d}\mathbf{x}$.

Note that this still does not completely account for attributes that are on very different scales: if one attribute is roughly on the order of 5 and another is on the order of 50, then some notions of distance are less informative than others.  The impact of that is primarily statistical, though: the mathematics still works as it should.

Normalization is activated by default, but can be shut off by passing `normalize=False` to the comparison function.

# This is the definition of the relevant functions

I can move these to a separate `.py` file to clean this notebook up some, but leaving things here makes it clear exactly what computation is happening.  It also allows us to make modifications on the fly (which shouldn't be needed but could be useful).

In [13]:
def extract_columns(columns, all_data=IMPORTED_DATA, team_id_column_index=TEAM_ID_COLUMN_INDEX):
    """Group the values of the requested columns by team.

    Parameters
    ----------
    columns : iterable of str
        Labels of the columns of ``all_data`` to collect.
    all_data : pandas.DataFrame
        Table to read from, one observation per row.  Defaults to
        ``IMPORTED_DATA``.
    team_id_column_index : int
        Positional (0-based) index of the column holding the team id.
        Defaults to ``TEAM_ID_COLUMN_INDEX``.

    Returns
    -------
    dict
        ``{team: {column: [values, in row order]}}`` where ``team`` is an
        ``int``.

    Raises
    ------
    ValueError
        If a team id cannot be converted to an integer, or converts to an
        integer that differs from the stored value (e.g. ``102.5``).
    KeyError
        If one of ``columns`` is not a column of ``all_data``.
    """
    # Materialise so that a one-shot iterable can be traversed for every row.
    columns = list(columns)
    # Honour the caller-supplied index instead of the module-level constant.
    team_ids = all_data.iloc[:, team_id_column_index].to_numpy()
    # Look each column up once rather than once per row.
    column_series = {column: all_data[column] for column in columns}

    results = {}
    for row_num, raw_team in enumerate(team_ids):
        try:
            team = int(raw_team)
        except (TypeError, ValueError):
            # TypeError covers missing values such as None / pandas.NA.
            raise ValueError('Unable to convert team id to integer in row {0}: {1} ... is it an integer?'.format(row_num, raw_team))
        if team != raw_team:
            raise ValueError('team in row {0} does not seem to be an integer: {1} ... should it be?'.format(row_num, raw_team))
        if team not in results:
            results[team] = {column: [] for column in columns}
        for column in columns:
            # row_num is a position, so index positionally; label-based lookup
            # breaks as soon as the frame has been filtered or re-indexed.
            results[team][column].append(column_series[column].iloc[row_num])

    return results

In [14]:
def print_comparisons_csv(base_column, comparison_columns, distance_type='cosine', all_data=IMPORTED_DATA, normalize=True):
    """Print, for every team, the distance from ``base_column`` to each comparison column.

    Each team contributes one vector per column (its values in row order).
    The output is a fixed-width, space-separated table with one row per team,
    sorted by team id, and one field per entry of ``comparison_columns``.

    Parameters
    ----------
    base_column : str
        Label of the column every other column is compared against.
    comparison_columns : list of str
        Labels of the columns to compare with ``base_column``.
    distance_type : 'cosine' or a valid ``ord`` for ``numpy.linalg.norm``
        ``'cosine'`` reports the angle (in radians) between the two vectors;
        any other value is passed as ``ord`` to ``numpy.linalg.norm`` applied
        to the difference of the two vectors.
    all_data : pandas.DataFrame
        Table to read from.  Defaults to ``IMPORTED_DATA``.
    normalize : bool
        When true, each vector is divided by its length (the number of rows
        belonging to the team) before the distance is computed.

    Returns
    -------
    None
        The table is written to standard output.
    """
    assert isinstance(base_column, str)
    assert isinstance(comparison_columns, list) and all(isinstance(c, str) for c in comparison_columns)
    teams = extract_columns(set([base_column] + comparison_columns), all_data)

    print('Team  ' + base_column + ' vs.')

    n_columns = len(comparison_columns)
    # Guard the empty list: max() of an empty sequence raises ValueError.
    max_header_length = max(map(len, comparison_columns)) if comparison_columns else 0
    field_width = str(max_header_length)
    header_format = ' '.join('{' + str(i) + ':' + field_width + 's}' for i in range(n_columns))
    # Built once here; it does not depend on the team.
    row_format = ' '.join('{' + str(i) + ':' + field_width + '.3f}' for i in range(n_columns))

    print('      ' + header_format.format(*comparison_columns))
    # 5 is the width of the '{0:4d} ' team prefix, so the rule spans a data row.
    print('-' * (5 + (max_header_length + 1) * n_columns - 1))

    for team, info in sorted(teams.items(), key=lambda item: item[0]):
        c1 = numpy.array(info[base_column], dtype=float)
        if normalize:
            c1 /= len(c1)
        distances = []
        for column in comparison_columns:
            c2 = numpy.array(info[column], dtype=float)
            if normalize:
                c2 /= len(c2)
            if distance_type == 'cosine':
                # The 1e-10 keeps the division defined for all-zero vectors.
                cosine = numpy.dot(c1, c2) / (numpy.linalg.norm(c1) * numpy.linalg.norm(c2) + 1e-10)
                # Rounding can push the ratio marginally outside [-1, 1],
                # where arccos returns NaN; clip it back into the domain.
                distances.append(numpy.arccos(numpy.clip(cosine, -1.0, 1.0)))
            else:
                distances.append(numpy.linalg.norm(c1 - c2, ord=distance_type))
        print('{0:4d} '.format(int(team)) + row_format.format(*distances))

In [15]:
# Column that every other column is compared against.
base_column = 'Attribute1'
# Columns whose per-team distance to base_column is printed.
comparison_columns = ['Attribute2','Attribute3','Attribute4','Attribute5', 'Attribute6']
# Anything other than 'cosine' is handed to numpy.linalg.norm as `ord`,
# so 2 selects the 2-norm of the difference between the two vectors.
distance_type = 2


# NOTE(review): the recorded output of this cell lists other column names
# (cEXTRA, cHEXEX, ...) than the ones set above; it looks like it comes from a
# run with different settings. Confirm and re-run.
print_comparisons_csv(base_column, comparison_columns, distance_type=distance_type)

Team  cEXTRA vs.
      cHEXEX   cSTAR    cCONTR   cPERF    ESTARscl
-------------------------------------------------
 102    0.277    0.414    0.582    0.583    0.340
 103    0.154    0.311    0.271    0.232    0.223
 104    0.426    0.658    0.400    0.465    0.411
 105    0.292    0.543    0.322    0.335    0.263
 106    0.416    0.490    0.368    0.331    0.376
 107    0.120    0.620    0.719    0.748    0.322
 108    0.426    0.466    0.635    0.454    0.384
 109    0.416    0.308    0.732    0.734    0.382
 110    0.220    0.560    0.465    0.478    0.392
 111    0.254    0.352    0.244    0.214    0.241
 112    0.473    0.128    0.208    0.124    0.119
 113    0.426    0.255    0.236    0.212    0.230
 114    0.384    0.427    0.407    0.374    0.307
 115    0.347    2.145    1.777    1.761    1.259
 116    0.293    0.380    0.272    0.248    0.261
 117    0.229    0.577    0.568    0.641    0.383
 118    0.328    0.562    0.454    0.410    0.383
 119    0.217    0.349    0.521 